diff --git "a/med-Llava3-slake/trainer_state.json" "b/med-Llava3-slake/trainer_state.json" new file mode 100644--- /dev/null +++ "b/med-Llava3-slake/trainer_state.json" @@ -0,0 +1,223811 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 13.0, + "eval_steps": 500, + "global_step": 31967, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00040666937779585197, + "grad_norm": 336.3790431862325, + "learning_rate": 2.0833333333333335e-08, + "loss": 7.9445, + "step": 1 + }, + { + "epoch": 0.0008133387555917039, + "grad_norm": 267.19097824955827, + "learning_rate": 4.166666666666667e-08, + "loss": 7.4385, + "step": 2 + }, + { + "epoch": 0.0012200081333875558, + "grad_norm": 304.4738323574854, + "learning_rate": 6.250000000000001e-08, + "loss": 8.8053, + "step": 3 + }, + { + "epoch": 0.0016266775111834079, + "grad_norm": 500.23939015252944, + "learning_rate": 8.333333333333334e-08, + "loss": 8.9834, + "step": 4 + }, + { + "epoch": 0.0020333468889792597, + "grad_norm": 415.41444908068, + "learning_rate": 1.0416666666666667e-07, + "loss": 10.3582, + "step": 5 + }, + { + "epoch": 0.0024400162667751117, + "grad_norm": 465.5587776865917, + "learning_rate": 1.2500000000000002e-07, + "loss": 8.487, + "step": 6 + }, + { + "epoch": 0.0028466856445709637, + "grad_norm": 302.81291168783287, + "learning_rate": 1.4583333333333335e-07, + "loss": 9.7042, + "step": 7 + }, + { + "epoch": 0.0032533550223668157, + "grad_norm": 252.28578294518337, + "learning_rate": 1.6666666666666668e-07, + "loss": 8.6736, + "step": 8 + }, + { + "epoch": 0.0036600244001626678, + "grad_norm": 362.86883926023876, + "learning_rate": 1.875e-07, + "loss": 8.5463, + "step": 9 + }, + { + "epoch": 0.004066693777958519, + "grad_norm": 426.3440596628242, + "learning_rate": 2.0833333333333333e-07, + "loss": 8.7594, + "step": 10 + }, + { + "epoch": 0.004473363155754371, + "grad_norm": 613.9716832592837, + "learning_rate": 2.2916666666666666e-07, + "loss": 12.4395, + "step": 11 + }, + { + "epoch": 0.004880032533550223, + "grad_norm": 417.05333694277346, + "learning_rate": 2.5000000000000004e-07, + "loss": 9.176, + "step": 12 + }, + { + "epoch": 0.005286701911346075, + "grad_norm": 295.16758340994403, + "learning_rate": 2.7083333333333337e-07, + "loss": 7.4001, + "step": 13 + }, + { + "epoch": 0.0056933712891419274, + "grad_norm": 311.22539910373536, + "learning_rate": 2.916666666666667e-07, + "loss": 10.9608, + "step": 14 + }, + { + "epoch": 0.0061000406669377795, + "grad_norm": 360.42032061244265, + "learning_rate": 3.125e-07, + "loss": 8.0277, + "step": 15 + }, + { + "epoch": 0.0065067100447336315, + "grad_norm": 495.23019819422484, + "learning_rate": 3.3333333333333335e-07, + "loss": 8.3296, + "step": 16 + }, + { + "epoch": 0.0069133794225294835, + "grad_norm": 473.3454656340052, + "learning_rate": 3.541666666666667e-07, + "loss": 8.6705, + "step": 17 + }, + { + "epoch": 0.0073200488003253355, + "grad_norm": 436.28792876708417, + "learning_rate": 3.75e-07, + "loss": 6.9811, + "step": 18 + }, + { + "epoch": 0.0077267181781211875, + "grad_norm": 308.35301137678175, + "learning_rate": 3.9583333333333334e-07, + "loss": 7.0424, + "step": 19 + }, + { + "epoch": 0.008133387555917039, + "grad_norm": 376.27633199716433, + "learning_rate": 4.1666666666666667e-07, + "loss": 9.9495, + "step": 20 + }, + { + "epoch": 0.00854005693371289, + "grad_norm": 398.00524527960755, + "learning_rate": 4.375e-07, + "loss": 8.1846, + "step": 21 + }, + { + 
"epoch": 0.008946726311508743, + "grad_norm": 370.4279363976788, + "learning_rate": 4.583333333333333e-07, + "loss": 9.1595, + "step": 22 + }, + { + "epoch": 0.009353395689304595, + "grad_norm": 436.32766594544165, + "learning_rate": 4.791666666666667e-07, + "loss": 12.5175, + "step": 23 + }, + { + "epoch": 0.009760065067100447, + "grad_norm": 472.64259955955225, + "learning_rate": 5.000000000000001e-07, + "loss": 8.9049, + "step": 24 + }, + { + "epoch": 0.010166734444896299, + "grad_norm": 420.48644412548947, + "learning_rate": 5.208333333333334e-07, + "loss": 9.6596, + "step": 25 + }, + { + "epoch": 0.01057340382269215, + "grad_norm": 381.1238611016002, + "learning_rate": 5.416666666666667e-07, + "loss": 8.7802, + "step": 26 + }, + { + "epoch": 0.010980073200488003, + "grad_norm": 331.09661774700794, + "learning_rate": 5.625e-07, + "loss": 6.6174, + "step": 27 + }, + { + "epoch": 0.011386742578283855, + "grad_norm": 270.3724783957498, + "learning_rate": 5.833333333333334e-07, + "loss": 8.1777, + "step": 28 + }, + { + "epoch": 0.011793411956079707, + "grad_norm": 516.6443725597692, + "learning_rate": 6.041666666666667e-07, + "loss": 10.3248, + "step": 29 + }, + { + "epoch": 0.012200081333875559, + "grad_norm": 468.1350352292663, + "learning_rate": 6.25e-07, + "loss": 9.3022, + "step": 30 + }, + { + "epoch": 0.012606750711671411, + "grad_norm": 366.5159616208184, + "learning_rate": 6.458333333333334e-07, + "loss": 9.59, + "step": 31 + }, + { + "epoch": 0.013013420089467263, + "grad_norm": 501.60173047237225, + "learning_rate": 6.666666666666667e-07, + "loss": 8.6585, + "step": 32 + }, + { + "epoch": 0.013420089467263115, + "grad_norm": 309.77101028283056, + "learning_rate": 6.875000000000001e-07, + "loss": 7.1797, + "step": 33 + }, + { + "epoch": 0.013826758845058967, + "grad_norm": 340.1357810033751, + "learning_rate": 7.083333333333334e-07, + "loss": 6.1228, + "step": 34 + }, + { + "epoch": 0.014233428222854819, + "grad_norm": 282.1338757009092, + "learning_rate": 7.291666666666667e-07, + "loss": 5.2787, + "step": 35 + }, + { + "epoch": 0.014640097600650671, + "grad_norm": 426.6008485071156, + "learning_rate": 7.5e-07, + "loss": 6.9065, + "step": 36 + }, + { + "epoch": 0.015046766978446523, + "grad_norm": 459.51358638099657, + "learning_rate": 7.708333333333334e-07, + "loss": 4.4858, + "step": 37 + }, + { + "epoch": 0.015453436356242375, + "grad_norm": 448.9668875550965, + "learning_rate": 7.916666666666667e-07, + "loss": 5.3878, + "step": 38 + }, + { + "epoch": 0.015860105734038225, + "grad_norm": 210.5459410309468, + "learning_rate": 8.125000000000001e-07, + "loss": 4.2814, + "step": 39 + }, + { + "epoch": 0.016266775111834077, + "grad_norm": 307.11379267140927, + "learning_rate": 8.333333333333333e-07, + "loss": 8.9886, + "step": 40 + }, + { + "epoch": 0.01667344448962993, + "grad_norm": 271.7571472449342, + "learning_rate": 8.541666666666667e-07, + "loss": 3.7136, + "step": 41 + }, + { + "epoch": 0.01708011386742578, + "grad_norm": 341.9377152015707, + "learning_rate": 8.75e-07, + "loss": 6.888, + "step": 42 + }, + { + "epoch": 0.017486783245221633, + "grad_norm": 440.7561058588699, + "learning_rate": 8.958333333333334e-07, + "loss": 5.6216, + "step": 43 + }, + { + "epoch": 0.017893452623017485, + "grad_norm": 269.75687490899827, + "learning_rate": 9.166666666666666e-07, + "loss": 6.2915, + "step": 44 + }, + { + "epoch": 0.018300122000813338, + "grad_norm": 371.2076636016579, + "learning_rate": 9.375000000000001e-07, + "loss": 7.2051, + "step": 45 + }, + { + "epoch": 
0.01870679137860919, + "grad_norm": 317.93977863566823, + "learning_rate": 9.583333333333334e-07, + "loss": 4.3385, + "step": 46 + }, + { + "epoch": 0.01911346075640504, + "grad_norm": 193.73680175866107, + "learning_rate": 9.791666666666667e-07, + "loss": 2.6512, + "step": 47 + }, + { + "epoch": 0.019520130134200894, + "grad_norm": 284.9843302769116, + "learning_rate": 1.0000000000000002e-06, + "loss": 4.1926, + "step": 48 + }, + { + "epoch": 0.019926799511996746, + "grad_norm": 353.92239882789255, + "learning_rate": 1.0208333333333334e-06, + "loss": 5.3454, + "step": 49 + }, + { + "epoch": 0.020333468889792598, + "grad_norm": 326.81707814450243, + "learning_rate": 1.0416666666666667e-06, + "loss": 3.7833, + "step": 50 + }, + { + "epoch": 0.02074013826758845, + "grad_norm": 263.478315565019, + "learning_rate": 1.0625e-06, + "loss": 4.331, + "step": 51 + }, + { + "epoch": 0.0211468076453843, + "grad_norm": 300.227875490734, + "learning_rate": 1.0833333333333335e-06, + "loss": 3.913, + "step": 52 + }, + { + "epoch": 0.021553477023180154, + "grad_norm": 272.5121622293714, + "learning_rate": 1.1041666666666668e-06, + "loss": 1.9239, + "step": 53 + }, + { + "epoch": 0.021960146400976006, + "grad_norm": 273.17343009772725, + "learning_rate": 1.125e-06, + "loss": 1.9901, + "step": 54 + }, + { + "epoch": 0.022366815778771858, + "grad_norm": 288.59508987884027, + "learning_rate": 1.1458333333333333e-06, + "loss": 1.5875, + "step": 55 + }, + { + "epoch": 0.02277348515656771, + "grad_norm": 156.3842171712975, + "learning_rate": 1.1666666666666668e-06, + "loss": 2.73, + "step": 56 + }, + { + "epoch": 0.023180154534363562, + "grad_norm": 167.67631028782296, + "learning_rate": 1.1875e-06, + "loss": 1.3888, + "step": 57 + }, + { + "epoch": 0.023586823912159414, + "grad_norm": 185.2426733094498, + "learning_rate": 1.2083333333333333e-06, + "loss": 3.2537, + "step": 58 + }, + { + "epoch": 0.023993493289955266, + "grad_norm": 180.01234187094116, + "learning_rate": 1.2291666666666666e-06, + "loss": 1.2347, + "step": 59 + }, + { + "epoch": 0.024400162667751118, + "grad_norm": 207.15521493636757, + "learning_rate": 1.25e-06, + "loss": 2.1108, + "step": 60 + }, + { + "epoch": 0.02480683204554697, + "grad_norm": 301.1686620336556, + "learning_rate": 1.2708333333333334e-06, + "loss": 4.6259, + "step": 61 + }, + { + "epoch": 0.025213501423342822, + "grad_norm": 272.54155919649725, + "learning_rate": 1.2916666666666669e-06, + "loss": 4.71, + "step": 62 + }, + { + "epoch": 0.025620170801138674, + "grad_norm": 165.4810577548504, + "learning_rate": 1.3125000000000001e-06, + "loss": 5.1377, + "step": 63 + }, + { + "epoch": 0.026026840178934526, + "grad_norm": 141.5578342755166, + "learning_rate": 1.3333333333333334e-06, + "loss": 0.7269, + "step": 64 + }, + { + "epoch": 0.026433509556730378, + "grad_norm": 197.43273255925106, + "learning_rate": 1.3541666666666667e-06, + "loss": 3.0767, + "step": 65 + }, + { + "epoch": 0.02684017893452623, + "grad_norm": 209.69093912547692, + "learning_rate": 1.3750000000000002e-06, + "loss": 3.4276, + "step": 66 + }, + { + "epoch": 0.027246848312322082, + "grad_norm": 202.77388193327602, + "learning_rate": 1.3958333333333335e-06, + "loss": 2.5599, + "step": 67 + }, + { + "epoch": 0.027653517690117934, + "grad_norm": 316.60132333246383, + "learning_rate": 1.4166666666666667e-06, + "loss": 3.3666, + "step": 68 + }, + { + "epoch": 0.028060187067913786, + "grad_norm": 210.3945914817105, + "learning_rate": 1.4375e-06, + "loss": 3.1803, + "step": 69 + }, + { + "epoch": 0.028466856445709638, 
+ "grad_norm": 188.32345893540264, + "learning_rate": 1.4583333333333335e-06, + "loss": 3.1475, + "step": 70 + }, + { + "epoch": 0.02887352582350549, + "grad_norm": 122.70297796915062, + "learning_rate": 1.4791666666666668e-06, + "loss": 1.2568, + "step": 71 + }, + { + "epoch": 0.029280195201301342, + "grad_norm": 181.43166539833464, + "learning_rate": 1.5e-06, + "loss": 2.155, + "step": 72 + }, + { + "epoch": 0.029686864579097194, + "grad_norm": 175.0293827239928, + "learning_rate": 1.5208333333333333e-06, + "loss": 2.3524, + "step": 73 + }, + { + "epoch": 0.030093533956893046, + "grad_norm": 151.21377503578944, + "learning_rate": 1.5416666666666668e-06, + "loss": 1.8398, + "step": 74 + }, + { + "epoch": 0.030500203334688898, + "grad_norm": 165.7823727094222, + "learning_rate": 1.5625e-06, + "loss": 3.7349, + "step": 75 + }, + { + "epoch": 0.03090687271248475, + "grad_norm": 269.5528505412701, + "learning_rate": 1.5833333333333333e-06, + "loss": 4.5182, + "step": 76 + }, + { + "epoch": 0.0313135420902806, + "grad_norm": 223.53600500876436, + "learning_rate": 1.6041666666666668e-06, + "loss": 2.1544, + "step": 77 + }, + { + "epoch": 0.03172021146807645, + "grad_norm": 172.36151040733938, + "learning_rate": 1.6250000000000001e-06, + "loss": 2.199, + "step": 78 + }, + { + "epoch": 0.032126880845872306, + "grad_norm": 241.63996605621386, + "learning_rate": 1.6458333333333334e-06, + "loss": 1.8479, + "step": 79 + }, + { + "epoch": 0.032533550223668155, + "grad_norm": 115.53245821572042, + "learning_rate": 1.6666666666666667e-06, + "loss": 1.4091, + "step": 80 + }, + { + "epoch": 0.03294021960146401, + "grad_norm": 203.22090071998232, + "learning_rate": 1.6875000000000001e-06, + "loss": 1.7492, + "step": 81 + }, + { + "epoch": 0.03334688897925986, + "grad_norm": 65.28480711559256, + "learning_rate": 1.7083333333333334e-06, + "loss": 0.2028, + "step": 82 + }, + { + "epoch": 0.033753558357055714, + "grad_norm": 110.92471805920985, + "learning_rate": 1.7291666666666667e-06, + "loss": 2.5054, + "step": 83 + }, + { + "epoch": 0.03416022773485156, + "grad_norm": 186.7059000817685, + "learning_rate": 1.75e-06, + "loss": 2.7205, + "step": 84 + }, + { + "epoch": 0.03456689711264742, + "grad_norm": 202.8175983087916, + "learning_rate": 1.7708333333333337e-06, + "loss": 4.1424, + "step": 85 + }, + { + "epoch": 0.03497356649044327, + "grad_norm": 204.73483906672718, + "learning_rate": 1.7916666666666667e-06, + "loss": 3.9714, + "step": 86 + }, + { + "epoch": 0.03538023586823912, + "grad_norm": 215.19972767298603, + "learning_rate": 1.8125e-06, + "loss": 1.8404, + "step": 87 + }, + { + "epoch": 0.03578690524603497, + "grad_norm": 148.10264617391448, + "learning_rate": 1.8333333333333333e-06, + "loss": 2.5163, + "step": 88 + }, + { + "epoch": 0.036193574623830826, + "grad_norm": 136.0083362990631, + "learning_rate": 1.854166666666667e-06, + "loss": 1.4539, + "step": 89 + }, + { + "epoch": 0.036600244001626675, + "grad_norm": 175.22874192237137, + "learning_rate": 1.8750000000000003e-06, + "loss": 3.2634, + "step": 90 + }, + { + "epoch": 0.03700691337942253, + "grad_norm": 129.55690867039286, + "learning_rate": 1.8958333333333333e-06, + "loss": 1.5216, + "step": 91 + }, + { + "epoch": 0.03741358275721838, + "grad_norm": 311.5984363213065, + "learning_rate": 1.916666666666667e-06, + "loss": 2.519, + "step": 92 + }, + { + "epoch": 0.037820252135014235, + "grad_norm": 219.70798902402024, + "learning_rate": 1.9375e-06, + "loss": 2.5045, + "step": 93 + }, + { + "epoch": 0.03822692151281008, + "grad_norm": 
225.83142682874288, + "learning_rate": 1.9583333333333334e-06, + "loss": 2.192, + "step": 94 + }, + { + "epoch": 0.03863359089060594, + "grad_norm": 121.4516719886771, + "learning_rate": 1.9791666666666666e-06, + "loss": 1.5077, + "step": 95 + }, + { + "epoch": 0.03904026026840179, + "grad_norm": 86.25301368258124, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7171, + "step": 96 + }, + { + "epoch": 0.03944692964619764, + "grad_norm": 110.78564663436143, + "learning_rate": 2.0208333333333336e-06, + "loss": 0.5562, + "step": 97 + }, + { + "epoch": 0.03985359902399349, + "grad_norm": 194.0457637861635, + "learning_rate": 2.041666666666667e-06, + "loss": 2.1042, + "step": 98 + }, + { + "epoch": 0.04026026840178935, + "grad_norm": 109.93589172276664, + "learning_rate": 2.0625e-06, + "loss": 0.7602, + "step": 99 + }, + { + "epoch": 0.040666937779585195, + "grad_norm": 108.52637309708655, + "learning_rate": 2.0833333333333334e-06, + "loss": 1.5992, + "step": 100 + }, + { + "epoch": 0.04107360715738105, + "grad_norm": 169.91253554501594, + "learning_rate": 2.1041666666666667e-06, + "loss": 3.1571, + "step": 101 + }, + { + "epoch": 0.0414802765351769, + "grad_norm": 128.5644023728551, + "learning_rate": 2.125e-06, + "loss": 1.7513, + "step": 102 + }, + { + "epoch": 0.041886945912972755, + "grad_norm": 91.56850730015543, + "learning_rate": 2.1458333333333333e-06, + "loss": 0.8185, + "step": 103 + }, + { + "epoch": 0.0422936152907686, + "grad_norm": 77.7991994450966, + "learning_rate": 2.166666666666667e-06, + "loss": 1.431, + "step": 104 + }, + { + "epoch": 0.04270028466856446, + "grad_norm": 153.70946463939563, + "learning_rate": 2.1875000000000002e-06, + "loss": 1.7443, + "step": 105 + }, + { + "epoch": 0.04310695404636031, + "grad_norm": 80.63781007533551, + "learning_rate": 2.2083333333333335e-06, + "loss": 0.5303, + "step": 106 + }, + { + "epoch": 0.04351362342415616, + "grad_norm": 143.32003358453542, + "learning_rate": 2.2291666666666668e-06, + "loss": 3.003, + "step": 107 + }, + { + "epoch": 0.04392029280195201, + "grad_norm": 126.42870760718671, + "learning_rate": 2.25e-06, + "loss": 1.2571, + "step": 108 + }, + { + "epoch": 0.04432696217974787, + "grad_norm": 84.80452473966143, + "learning_rate": 2.2708333333333333e-06, + "loss": 0.3288, + "step": 109 + }, + { + "epoch": 0.044733631557543715, + "grad_norm": 185.4686243129134, + "learning_rate": 2.2916666666666666e-06, + "loss": 2.1091, + "step": 110 + }, + { + "epoch": 0.04514030093533957, + "grad_norm": 87.21573323985257, + "learning_rate": 2.3125000000000003e-06, + "loss": 0.5493, + "step": 111 + }, + { + "epoch": 0.04554697031313542, + "grad_norm": 112.27021628742513, + "learning_rate": 2.3333333333333336e-06, + "loss": 1.6612, + "step": 112 + }, + { + "epoch": 0.045953639690931275, + "grad_norm": 69.92383657421136, + "learning_rate": 2.354166666666667e-06, + "loss": 0.4619, + "step": 113 + }, + { + "epoch": 0.046360309068727124, + "grad_norm": 87.71070073837274, + "learning_rate": 2.375e-06, + "loss": 0.7549, + "step": 114 + }, + { + "epoch": 0.04676697844652298, + "grad_norm": 74.62135698611227, + "learning_rate": 2.395833333333334e-06, + "loss": 1.9222, + "step": 115 + }, + { + "epoch": 0.04717364782431883, + "grad_norm": 54.39472399144978, + "learning_rate": 2.4166666666666667e-06, + "loss": 0.1813, + "step": 116 + }, + { + "epoch": 0.04758031720211468, + "grad_norm": 127.27832700921313, + "learning_rate": 2.4375e-06, + "loss": 2.4776, + "step": 117 + }, + { + "epoch": 0.04798698657991053, + "grad_norm": 220.18104202454296, 
+ "learning_rate": 2.4583333333333332e-06, + "loss": 3.0817, + "step": 118 + }, + { + "epoch": 0.04839365595770639, + "grad_norm": 30.972346013343984, + "learning_rate": 2.479166666666667e-06, + "loss": 0.1563, + "step": 119 + }, + { + "epoch": 0.048800325335502236, + "grad_norm": 92.04801854601959, + "learning_rate": 2.5e-06, + "loss": 0.4842, + "step": 120 + }, + { + "epoch": 0.04920699471329809, + "grad_norm": 165.57546906817353, + "learning_rate": 2.5208333333333335e-06, + "loss": 1.895, + "step": 121 + }, + { + "epoch": 0.04961366409109394, + "grad_norm": 271.0945028104416, + "learning_rate": 2.5416666666666668e-06, + "loss": 3.4618, + "step": 122 + }, + { + "epoch": 0.050020333468889795, + "grad_norm": 111.48880984301083, + "learning_rate": 2.5625e-06, + "loss": 1.2026, + "step": 123 + }, + { + "epoch": 0.050427002846685644, + "grad_norm": 86.90901315761968, + "learning_rate": 2.5833333333333337e-06, + "loss": 1.296, + "step": 124 + }, + { + "epoch": 0.0508336722244815, + "grad_norm": 138.41652616274206, + "learning_rate": 2.604166666666667e-06, + "loss": 1.3158, + "step": 125 + }, + { + "epoch": 0.05124034160227735, + "grad_norm": 124.56331196705514, + "learning_rate": 2.6250000000000003e-06, + "loss": 1.5102, + "step": 126 + }, + { + "epoch": 0.0516470109800732, + "grad_norm": 97.86036510688417, + "learning_rate": 2.6458333333333336e-06, + "loss": 1.0128, + "step": 127 + }, + { + "epoch": 0.05205368035786905, + "grad_norm": 112.38369606580319, + "learning_rate": 2.666666666666667e-06, + "loss": 0.7055, + "step": 128 + }, + { + "epoch": 0.05246034973566491, + "grad_norm": 58.66426403012563, + "learning_rate": 2.6875e-06, + "loss": 0.5273, + "step": 129 + }, + { + "epoch": 0.052867019113460756, + "grad_norm": 100.53701268114106, + "learning_rate": 2.7083333333333334e-06, + "loss": 0.4817, + "step": 130 + }, + { + "epoch": 0.05327368849125661, + "grad_norm": 122.2283262140796, + "learning_rate": 2.7291666666666667e-06, + "loss": 1.5543, + "step": 131 + }, + { + "epoch": 0.05368035786905246, + "grad_norm": 173.07050064180694, + "learning_rate": 2.7500000000000004e-06, + "loss": 2.4653, + "step": 132 + }, + { + "epoch": 0.054087027246848315, + "grad_norm": 131.12277620537202, + "learning_rate": 2.7708333333333336e-06, + "loss": 1.7302, + "step": 133 + }, + { + "epoch": 0.054493696624644164, + "grad_norm": 96.71523198953807, + "learning_rate": 2.791666666666667e-06, + "loss": 1.5493, + "step": 134 + }, + { + "epoch": 0.05490036600244002, + "grad_norm": 49.05370831412189, + "learning_rate": 2.8125e-06, + "loss": 0.1753, + "step": 135 + }, + { + "epoch": 0.05530703538023587, + "grad_norm": 86.38694747352895, + "learning_rate": 2.8333333333333335e-06, + "loss": 0.5342, + "step": 136 + }, + { + "epoch": 0.055713704758031724, + "grad_norm": 38.22227273309076, + "learning_rate": 2.8541666666666667e-06, + "loss": 0.1203, + "step": 137 + }, + { + "epoch": 0.05612037413582757, + "grad_norm": 133.7638530170472, + "learning_rate": 2.875e-06, + "loss": 1.324, + "step": 138 + }, + { + "epoch": 0.05652704351362343, + "grad_norm": 107.73103062153183, + "learning_rate": 2.8958333333333337e-06, + "loss": 1.4278, + "step": 139 + }, + { + "epoch": 0.056933712891419276, + "grad_norm": 41.372529892500054, + "learning_rate": 2.916666666666667e-06, + "loss": 0.1545, + "step": 140 + }, + { + "epoch": 0.057340382269215125, + "grad_norm": 224.95272967862968, + "learning_rate": 2.9375000000000003e-06, + "loss": 2.4372, + "step": 141 + }, + { + "epoch": 0.05774705164701098, + "grad_norm": 144.7628902187281, + 
"learning_rate": 2.9583333333333335e-06, + "loss": 2.0181, + "step": 142 + }, + { + "epoch": 0.05815372102480683, + "grad_norm": 101.38079264088881, + "learning_rate": 2.979166666666667e-06, + "loss": 1.7179, + "step": 143 + }, + { + "epoch": 0.058560390402602684, + "grad_norm": 123.19364113211871, + "learning_rate": 3e-06, + "loss": 1.3426, + "step": 144 + }, + { + "epoch": 0.05896705978039853, + "grad_norm": 120.50148304940744, + "learning_rate": 3.0208333333333334e-06, + "loss": 1.5376, + "step": 145 + }, + { + "epoch": 0.05937372915819439, + "grad_norm": 100.72974853133886, + "learning_rate": 3.0416666666666666e-06, + "loss": 1.0604, + "step": 146 + }, + { + "epoch": 0.05978039853599024, + "grad_norm": 57.30276465073036, + "learning_rate": 3.0625000000000003e-06, + "loss": 0.2944, + "step": 147 + }, + { + "epoch": 0.06018706791378609, + "grad_norm": 143.17969254213594, + "learning_rate": 3.0833333333333336e-06, + "loss": 1.1813, + "step": 148 + }, + { + "epoch": 0.06059373729158194, + "grad_norm": 137.71387095126028, + "learning_rate": 3.104166666666667e-06, + "loss": 1.6455, + "step": 149 + }, + { + "epoch": 0.061000406669377796, + "grad_norm": 134.53582601263398, + "learning_rate": 3.125e-06, + "loss": 3.0437, + "step": 150 + }, + { + "epoch": 0.061407076047173645, + "grad_norm": 57.66145259469052, + "learning_rate": 3.1458333333333334e-06, + "loss": 0.5405, + "step": 151 + }, + { + "epoch": 0.0618137454249695, + "grad_norm": 28.345769291770754, + "learning_rate": 3.1666666666666667e-06, + "loss": 0.1031, + "step": 152 + }, + { + "epoch": 0.06222041480276535, + "grad_norm": 66.62656493601261, + "learning_rate": 3.1875e-06, + "loss": 0.4745, + "step": 153 + }, + { + "epoch": 0.0626270841805612, + "grad_norm": 61.5993083645454, + "learning_rate": 3.2083333333333337e-06, + "loss": 0.3036, + "step": 154 + }, + { + "epoch": 0.06303375355835705, + "grad_norm": 31.866128838927196, + "learning_rate": 3.229166666666667e-06, + "loss": 0.1762, + "step": 155 + }, + { + "epoch": 0.0634404229361529, + "grad_norm": 32.30842902301004, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.3008, + "step": 156 + }, + { + "epoch": 0.06384709231394876, + "grad_norm": 103.69898809491467, + "learning_rate": 3.2708333333333335e-06, + "loss": 1.9546, + "step": 157 + }, + { + "epoch": 0.06425376169174461, + "grad_norm": 61.19176624442112, + "learning_rate": 3.2916666666666668e-06, + "loss": 0.4124, + "step": 158 + }, + { + "epoch": 0.06466043106954046, + "grad_norm": 174.246962286168, + "learning_rate": 3.3125e-06, + "loss": 1.1847, + "step": 159 + }, + { + "epoch": 0.06506710044733631, + "grad_norm": 188.90001914679056, + "learning_rate": 3.3333333333333333e-06, + "loss": 4.1365, + "step": 160 + }, + { + "epoch": 0.06547376982513217, + "grad_norm": 106.78958150966507, + "learning_rate": 3.3541666666666666e-06, + "loss": 1.6035, + "step": 161 + }, + { + "epoch": 0.06588043920292802, + "grad_norm": 139.50008295084083, + "learning_rate": 3.3750000000000003e-06, + "loss": 2.1317, + "step": 162 + }, + { + "epoch": 0.06628710858072387, + "grad_norm": 11.32235910147733, + "learning_rate": 3.3958333333333336e-06, + "loss": 0.033, + "step": 163 + }, + { + "epoch": 0.06669377795851972, + "grad_norm": 100.68586515215864, + "learning_rate": 3.416666666666667e-06, + "loss": 0.6339, + "step": 164 + }, + { + "epoch": 0.06710044733631558, + "grad_norm": 108.46157321690046, + "learning_rate": 3.4375e-06, + "loss": 2.4534, + "step": 165 + }, + { + "epoch": 0.06750711671411143, + "grad_norm": 12.652655165217883, + 
"learning_rate": 3.4583333333333334e-06, + "loss": 0.0447, + "step": 166 + }, + { + "epoch": 0.06791378609190728, + "grad_norm": 53.8367531899667, + "learning_rate": 3.4791666666666667e-06, + "loss": 0.3559, + "step": 167 + }, + { + "epoch": 0.06832045546970313, + "grad_norm": 124.41801079298457, + "learning_rate": 3.5e-06, + "loss": 1.2427, + "step": 168 + }, + { + "epoch": 0.06872712484749899, + "grad_norm": 98.26797021380435, + "learning_rate": 3.520833333333334e-06, + "loss": 1.1727, + "step": 169 + }, + { + "epoch": 0.06913379422529484, + "grad_norm": 107.5522043232857, + "learning_rate": 3.5416666666666673e-06, + "loss": 0.8958, + "step": 170 + }, + { + "epoch": 0.06954046360309069, + "grad_norm": 82.05304959146217, + "learning_rate": 3.5625e-06, + "loss": 1.2963, + "step": 171 + }, + { + "epoch": 0.06994713298088653, + "grad_norm": 83.55434255791826, + "learning_rate": 3.5833333333333335e-06, + "loss": 1.1529, + "step": 172 + }, + { + "epoch": 0.0703538023586824, + "grad_norm": 143.25778712082953, + "learning_rate": 3.6041666666666667e-06, + "loss": 2.2317, + "step": 173 + }, + { + "epoch": 0.07076047173647824, + "grad_norm": 109.29671825209552, + "learning_rate": 3.625e-06, + "loss": 1.9655, + "step": 174 + }, + { + "epoch": 0.0711671411142741, + "grad_norm": 116.77811120816737, + "learning_rate": 3.6458333333333333e-06, + "loss": 2.0665, + "step": 175 + }, + { + "epoch": 0.07157381049206994, + "grad_norm": 153.15428795809927, + "learning_rate": 3.6666666666666666e-06, + "loss": 1.566, + "step": 176 + }, + { + "epoch": 0.0719804798698658, + "grad_norm": 103.43662129342978, + "learning_rate": 3.6875000000000007e-06, + "loss": 0.4309, + "step": 177 + }, + { + "epoch": 0.07238714924766165, + "grad_norm": 88.21179791730988, + "learning_rate": 3.708333333333334e-06, + "loss": 0.4077, + "step": 178 + }, + { + "epoch": 0.0727938186254575, + "grad_norm": 110.37760189743852, + "learning_rate": 3.7291666666666672e-06, + "loss": 1.8419, + "step": 179 + }, + { + "epoch": 0.07320048800325335, + "grad_norm": 139.35460839784542, + "learning_rate": 3.7500000000000005e-06, + "loss": 1.6677, + "step": 180 + }, + { + "epoch": 0.07360715738104921, + "grad_norm": 60.86735999350134, + "learning_rate": 3.7708333333333334e-06, + "loss": 0.6756, + "step": 181 + }, + { + "epoch": 0.07401382675884506, + "grad_norm": 178.15777972116163, + "learning_rate": 3.7916666666666666e-06, + "loss": 0.837, + "step": 182 + }, + { + "epoch": 0.07442049613664091, + "grad_norm": 34.50112026838684, + "learning_rate": 3.8125e-06, + "loss": 0.2794, + "step": 183 + }, + { + "epoch": 0.07482716551443676, + "grad_norm": 142.4696248105672, + "learning_rate": 3.833333333333334e-06, + "loss": 2.1666, + "step": 184 + }, + { + "epoch": 0.07523383489223262, + "grad_norm": 113.05424360133159, + "learning_rate": 3.854166666666667e-06, + "loss": 0.8109, + "step": 185 + }, + { + "epoch": 0.07564050427002847, + "grad_norm": 8.855558876602052, + "learning_rate": 3.875e-06, + "loss": 0.0273, + "step": 186 + }, + { + "epoch": 0.07604717364782432, + "grad_norm": 98.01486713110955, + "learning_rate": 3.8958333333333334e-06, + "loss": 1.1045, + "step": 187 + }, + { + "epoch": 0.07645384302562017, + "grad_norm": 114.37413303901324, + "learning_rate": 3.916666666666667e-06, + "loss": 1.1439, + "step": 188 + }, + { + "epoch": 0.07686051240341603, + "grad_norm": 136.26002163209216, + "learning_rate": 3.9375e-06, + "loss": 1.1163, + "step": 189 + }, + { + "epoch": 0.07726718178121188, + "grad_norm": 88.11455031320419, + "learning_rate": 
3.958333333333333e-06, + "loss": 1.6097, + "step": 190 + }, + { + "epoch": 0.07767385115900773, + "grad_norm": 41.44169198562533, + "learning_rate": 3.9791666666666665e-06, + "loss": 0.4748, + "step": 191 + }, + { + "epoch": 0.07808052053680357, + "grad_norm": 43.73773654313404, + "learning_rate": 4.000000000000001e-06, + "loss": 0.1893, + "step": 192 + }, + { + "epoch": 0.07848718991459944, + "grad_norm": 79.0203510867777, + "learning_rate": 4.020833333333334e-06, + "loss": 1.1746, + "step": 193 + }, + { + "epoch": 0.07889385929239529, + "grad_norm": 92.15150198827509, + "learning_rate": 4.041666666666667e-06, + "loss": 1.7928, + "step": 194 + }, + { + "epoch": 0.07930052867019113, + "grad_norm": 58.74090775938593, + "learning_rate": 4.0625000000000005e-06, + "loss": 1.0084, + "step": 195 + }, + { + "epoch": 0.07970719804798698, + "grad_norm": 34.145010165932085, + "learning_rate": 4.083333333333334e-06, + "loss": 0.2569, + "step": 196 + }, + { + "epoch": 0.08011386742578284, + "grad_norm": 95.47732756613823, + "learning_rate": 4.104166666666667e-06, + "loss": 1.5122, + "step": 197 + }, + { + "epoch": 0.0805205368035787, + "grad_norm": 65.63533295072774, + "learning_rate": 4.125e-06, + "loss": 0.8326, + "step": 198 + }, + { + "epoch": 0.08092720618137454, + "grad_norm": 36.8725336955361, + "learning_rate": 4.145833333333334e-06, + "loss": 0.2286, + "step": 199 + }, + { + "epoch": 0.08133387555917039, + "grad_norm": 144.82500575154663, + "learning_rate": 4.166666666666667e-06, + "loss": 2.697, + "step": 200 + }, + { + "epoch": 0.08174054493696625, + "grad_norm": 144.64036636683812, + "learning_rate": 4.1875e-06, + "loss": 0.7676, + "step": 201 + }, + { + "epoch": 0.0821472143147621, + "grad_norm": 109.47657260079413, + "learning_rate": 4.208333333333333e-06, + "loss": 1.3423, + "step": 202 + }, + { + "epoch": 0.08255388369255795, + "grad_norm": 81.91227908305505, + "learning_rate": 4.229166666666667e-06, + "loss": 0.8615, + "step": 203 + }, + { + "epoch": 0.0829605530703538, + "grad_norm": 152.44295927206045, + "learning_rate": 4.25e-06, + "loss": 2.2964, + "step": 204 + }, + { + "epoch": 0.08336722244814966, + "grad_norm": 83.87290059438382, + "learning_rate": 4.270833333333333e-06, + "loss": 1.2686, + "step": 205 + }, + { + "epoch": 0.08377389182594551, + "grad_norm": 48.00152363134702, + "learning_rate": 4.2916666666666665e-06, + "loss": 0.9284, + "step": 206 + }, + { + "epoch": 0.08418056120374136, + "grad_norm": 81.03629059860732, + "learning_rate": 4.312500000000001e-06, + "loss": 1.7201, + "step": 207 + }, + { + "epoch": 0.0845872305815372, + "grad_norm": 113.35391290756976, + "learning_rate": 4.333333333333334e-06, + "loss": 1.6503, + "step": 208 + }, + { + "epoch": 0.08499389995933307, + "grad_norm": 35.64605517685762, + "learning_rate": 4.354166666666667e-06, + "loss": 0.1895, + "step": 209 + }, + { + "epoch": 0.08540056933712892, + "grad_norm": 79.15831100003598, + "learning_rate": 4.3750000000000005e-06, + "loss": 1.4249, + "step": 210 + }, + { + "epoch": 0.08580723871492477, + "grad_norm": 74.96635090661226, + "learning_rate": 4.395833333333334e-06, + "loss": 1.0456, + "step": 211 + }, + { + "epoch": 0.08621390809272061, + "grad_norm": 41.64431946846148, + "learning_rate": 4.416666666666667e-06, + "loss": 0.1912, + "step": 212 + }, + { + "epoch": 0.08662057747051646, + "grad_norm": 30.969007993329832, + "learning_rate": 4.4375e-06, + "loss": 0.2765, + "step": 213 + }, + { + "epoch": 0.08702724684831233, + "grad_norm": 57.56046201264697, + "learning_rate": 
4.4583333333333336e-06, + "loss": 0.9265, + "step": 214 + }, + { + "epoch": 0.08743391622610817, + "grad_norm": 72.91272761390967, + "learning_rate": 4.479166666666667e-06, + "loss": 0.5754, + "step": 215 + }, + { + "epoch": 0.08784058560390402, + "grad_norm": 54.0372756887612, + "learning_rate": 4.5e-06, + "loss": 0.6329, + "step": 216 + }, + { + "epoch": 0.08824725498169987, + "grad_norm": 49.110077948728645, + "learning_rate": 4.520833333333333e-06, + "loss": 0.5271, + "step": 217 + }, + { + "epoch": 0.08865392435949573, + "grad_norm": 75.83157513174311, + "learning_rate": 4.541666666666667e-06, + "loss": 1.0764, + "step": 218 + }, + { + "epoch": 0.08906059373729158, + "grad_norm": 39.029552544973996, + "learning_rate": 4.5625e-06, + "loss": 0.2696, + "step": 219 + }, + { + "epoch": 0.08946726311508743, + "grad_norm": 103.63092278556039, + "learning_rate": 4.583333333333333e-06, + "loss": 1.5114, + "step": 220 + }, + { + "epoch": 0.08987393249288328, + "grad_norm": 164.24003371175766, + "learning_rate": 4.6041666666666665e-06, + "loss": 3.3725, + "step": 221 + }, + { + "epoch": 0.09028060187067914, + "grad_norm": 62.95108687386291, + "learning_rate": 4.625000000000001e-06, + "loss": 1.1856, + "step": 222 + }, + { + "epoch": 0.09068727124847499, + "grad_norm": 39.44161986302329, + "learning_rate": 4.645833333333334e-06, + "loss": 0.4022, + "step": 223 + }, + { + "epoch": 0.09109394062627084, + "grad_norm": 91.79045848091717, + "learning_rate": 4.666666666666667e-06, + "loss": 1.0348, + "step": 224 + }, + { + "epoch": 0.09150061000406669, + "grad_norm": 29.13723507497965, + "learning_rate": 4.6875000000000004e-06, + "loss": 0.2469, + "step": 225 + }, + { + "epoch": 0.09190727938186255, + "grad_norm": 87.6332090398933, + "learning_rate": 4.708333333333334e-06, + "loss": 1.4219, + "step": 226 + }, + { + "epoch": 0.0923139487596584, + "grad_norm": 97.60355237335281, + "learning_rate": 4.729166666666667e-06, + "loss": 0.5945, + "step": 227 + }, + { + "epoch": 0.09272061813745425, + "grad_norm": 73.38020574024709, + "learning_rate": 4.75e-06, + "loss": 0.9985, + "step": 228 + }, + { + "epoch": 0.0931272875152501, + "grad_norm": 55.22529334524213, + "learning_rate": 4.770833333333334e-06, + "loss": 0.5253, + "step": 229 + }, + { + "epoch": 0.09353395689304596, + "grad_norm": 129.15028048937884, + "learning_rate": 4.791666666666668e-06, + "loss": 2.6744, + "step": 230 + }, + { + "epoch": 0.0939406262708418, + "grad_norm": 54.88152486464927, + "learning_rate": 4.8125e-06, + "loss": 0.7303, + "step": 231 + }, + { + "epoch": 0.09434729564863766, + "grad_norm": 32.75744858400195, + "learning_rate": 4.833333333333333e-06, + "loss": 0.3329, + "step": 232 + }, + { + "epoch": 0.0947539650264335, + "grad_norm": 35.69609690468295, + "learning_rate": 4.854166666666667e-06, + "loss": 0.4551, + "step": 233 + }, + { + "epoch": 0.09516063440422937, + "grad_norm": 52.59579946648724, + "learning_rate": 4.875e-06, + "loss": 0.8841, + "step": 234 + }, + { + "epoch": 0.09556730378202521, + "grad_norm": 42.96709963871628, + "learning_rate": 4.895833333333333e-06, + "loss": 0.5091, + "step": 235 + }, + { + "epoch": 0.09597397315982106, + "grad_norm": 117.3532720331694, + "learning_rate": 4.9166666666666665e-06, + "loss": 0.8106, + "step": 236 + }, + { + "epoch": 0.09638064253761691, + "grad_norm": 30.380432115356587, + "learning_rate": 4.937500000000001e-06, + "loss": 0.3746, + "step": 237 + }, + { + "epoch": 0.09678731191541277, + "grad_norm": 78.28498084339542, + "learning_rate": 4.958333333333334e-06, + "loss": 
1.0227, + "step": 238 + }, + { + "epoch": 0.09719398129320862, + "grad_norm": 59.931311675237005, + "learning_rate": 4.979166666666667e-06, + "loss": 0.5583, + "step": 239 + }, + { + "epoch": 0.09760065067100447, + "grad_norm": 31.547523086720066, + "learning_rate": 5e-06, + "loss": 0.1028, + "step": 240 + }, + { + "epoch": 0.09800732004880032, + "grad_norm": 65.48570675179523, + "learning_rate": 5.020833333333334e-06, + "loss": 0.6768, + "step": 241 + }, + { + "epoch": 0.09841398942659618, + "grad_norm": 71.30443496968672, + "learning_rate": 5.041666666666667e-06, + "loss": 1.0523, + "step": 242 + }, + { + "epoch": 0.09882065880439203, + "grad_norm": 99.47750018070415, + "learning_rate": 5.0625e-06, + "loss": 1.3311, + "step": 243 + }, + { + "epoch": 0.09922732818218788, + "grad_norm": 95.64495912081871, + "learning_rate": 5.0833333333333335e-06, + "loss": 0.9476, + "step": 244 + }, + { + "epoch": 0.09963399755998373, + "grad_norm": 42.40759051028956, + "learning_rate": 5.104166666666667e-06, + "loss": 0.2961, + "step": 245 + }, + { + "epoch": 0.10004066693777959, + "grad_norm": 31.88745015493197, + "learning_rate": 5.125e-06, + "loss": 0.3585, + "step": 246 + }, + { + "epoch": 0.10044733631557544, + "grad_norm": 91.3652673761742, + "learning_rate": 5.145833333333333e-06, + "loss": 1.0672, + "step": 247 + }, + { + "epoch": 0.10085400569337129, + "grad_norm": 44.727096879548704, + "learning_rate": 5.1666666666666675e-06, + "loss": 0.5725, + "step": 248 + }, + { + "epoch": 0.10126067507116714, + "grad_norm": 23.75214906932736, + "learning_rate": 5.187500000000001e-06, + "loss": 0.3062, + "step": 249 + }, + { + "epoch": 0.101667344448963, + "grad_norm": 51.98639855116888, + "learning_rate": 5.208333333333334e-06, + "loss": 0.376, + "step": 250 + }, + { + "epoch": 0.10207401382675885, + "grad_norm": 31.835867777554977, + "learning_rate": 5.229166666666667e-06, + "loss": 0.311, + "step": 251 + }, + { + "epoch": 0.1024806832045547, + "grad_norm": 75.82000026727032, + "learning_rate": 5.2500000000000006e-06, + "loss": 2.2296, + "step": 252 + }, + { + "epoch": 0.10288735258235054, + "grad_norm": 25.05893634474981, + "learning_rate": 5.270833333333334e-06, + "loss": 0.1574, + "step": 253 + }, + { + "epoch": 0.1032940219601464, + "grad_norm": 36.25955191753974, + "learning_rate": 5.291666666666667e-06, + "loss": 0.1218, + "step": 254 + }, + { + "epoch": 0.10370069133794226, + "grad_norm": 44.554035113095345, + "learning_rate": 5.3125e-06, + "loss": 0.2756, + "step": 255 + }, + { + "epoch": 0.1041073607157381, + "grad_norm": 55.77665346157822, + "learning_rate": 5.333333333333334e-06, + "loss": 0.6042, + "step": 256 + }, + { + "epoch": 0.10451403009353395, + "grad_norm": 85.18686186389826, + "learning_rate": 5.354166666666667e-06, + "loss": 1.1703, + "step": 257 + }, + { + "epoch": 0.10492069947132981, + "grad_norm": 148.15365282255286, + "learning_rate": 5.375e-06, + "loss": 1.3501, + "step": 258 + }, + { + "epoch": 0.10532736884912566, + "grad_norm": 75.49548218643676, + "learning_rate": 5.3958333333333335e-06, + "loss": 1.1486, + "step": 259 + }, + { + "epoch": 0.10573403822692151, + "grad_norm": 16.18324770466539, + "learning_rate": 5.416666666666667e-06, + "loss": 0.1081, + "step": 260 + }, + { + "epoch": 0.10614070760471736, + "grad_norm": 16.911997153846773, + "learning_rate": 5.4375e-06, + "loss": 0.2006, + "step": 261 + }, + { + "epoch": 0.10654737698251322, + "grad_norm": 70.0380195025149, + "learning_rate": 5.458333333333333e-06, + "loss": 0.4652, + "step": 262 + }, + { + "epoch": 
0.10695404636030907, + "grad_norm": 122.33862843517788, + "learning_rate": 5.4791666666666674e-06, + "loss": 1.8119, + "step": 263 + }, + { + "epoch": 0.10736071573810492, + "grad_norm": 56.2720655941083, + "learning_rate": 5.500000000000001e-06, + "loss": 1.1933, + "step": 264 + }, + { + "epoch": 0.10776738511590077, + "grad_norm": 20.606633951589085, + "learning_rate": 5.520833333333334e-06, + "loss": 0.1788, + "step": 265 + }, + { + "epoch": 0.10817405449369663, + "grad_norm": 56.71770516581295, + "learning_rate": 5.541666666666667e-06, + "loss": 0.7774, + "step": 266 + }, + { + "epoch": 0.10858072387149248, + "grad_norm": 73.93569281970146, + "learning_rate": 5.5625000000000005e-06, + "loss": 0.8029, + "step": 267 + }, + { + "epoch": 0.10898739324928833, + "grad_norm": 79.82875960137555, + "learning_rate": 5.583333333333334e-06, + "loss": 1.1698, + "step": 268 + }, + { + "epoch": 0.10939406262708418, + "grad_norm": 122.80800254876283, + "learning_rate": 5.604166666666667e-06, + "loss": 2.5283, + "step": 269 + }, + { + "epoch": 0.10980073200488004, + "grad_norm": 37.85700594889011, + "learning_rate": 5.625e-06, + "loss": 0.5778, + "step": 270 + }, + { + "epoch": 0.11020740138267589, + "grad_norm": 60.29863742685712, + "learning_rate": 5.645833333333334e-06, + "loss": 0.9227, + "step": 271 + }, + { + "epoch": 0.11061407076047174, + "grad_norm": 95.31473900588405, + "learning_rate": 5.666666666666667e-06, + "loss": 1.8188, + "step": 272 + }, + { + "epoch": 0.11102074013826758, + "grad_norm": 83.00141053695987, + "learning_rate": 5.6875e-06, + "loss": 1.2457, + "step": 273 + }, + { + "epoch": 0.11142740951606345, + "grad_norm": 67.02902028392347, + "learning_rate": 5.7083333333333335e-06, + "loss": 0.9823, + "step": 274 + }, + { + "epoch": 0.1118340788938593, + "grad_norm": 66.38790736221586, + "learning_rate": 5.729166666666667e-06, + "loss": 0.7616, + "step": 275 + }, + { + "epoch": 0.11224074827165514, + "grad_norm": 36.38148230556286, + "learning_rate": 5.75e-06, + "loss": 0.37, + "step": 276 + }, + { + "epoch": 0.11264741764945099, + "grad_norm": 60.89006099581659, + "learning_rate": 5.770833333333333e-06, + "loss": 0.9572, + "step": 277 + }, + { + "epoch": 0.11305408702724686, + "grad_norm": 54.34772852918405, + "learning_rate": 5.791666666666667e-06, + "loss": 0.8955, + "step": 278 + }, + { + "epoch": 0.1134607564050427, + "grad_norm": 94.5438786766939, + "learning_rate": 5.812500000000001e-06, + "loss": 1.1583, + "step": 279 + }, + { + "epoch": 0.11386742578283855, + "grad_norm": 105.93775533424541, + "learning_rate": 5.833333333333334e-06, + "loss": 2.166, + "step": 280 + }, + { + "epoch": 0.1142740951606344, + "grad_norm": 35.205292270554054, + "learning_rate": 5.854166666666667e-06, + "loss": 0.2873, + "step": 281 + }, + { + "epoch": 0.11468076453843025, + "grad_norm": 71.3055547262556, + "learning_rate": 5.8750000000000005e-06, + "loss": 0.9899, + "step": 282 + }, + { + "epoch": 0.11508743391622611, + "grad_norm": 39.0112834175429, + "learning_rate": 5.895833333333334e-06, + "loss": 0.3953, + "step": 283 + }, + { + "epoch": 0.11549410329402196, + "grad_norm": 30.0492425828629, + "learning_rate": 5.916666666666667e-06, + "loss": 0.6596, + "step": 284 + }, + { + "epoch": 0.11590077267181781, + "grad_norm": 36.41521026895171, + "learning_rate": 5.9375e-06, + "loss": 0.3625, + "step": 285 + }, + { + "epoch": 0.11630744204961366, + "grad_norm": 116.88989614935869, + "learning_rate": 5.958333333333334e-06, + "loss": 1.2302, + "step": 286 + }, + { + "epoch": 0.11671411142740952, + 
"grad_norm": 27.554460588255825, + "learning_rate": 5.979166666666667e-06, + "loss": 0.2859, + "step": 287 + }, + { + "epoch": 0.11712078080520537, + "grad_norm": 49.36733547587855, + "learning_rate": 6e-06, + "loss": 0.6222, + "step": 288 + }, + { + "epoch": 0.11752745018300122, + "grad_norm": 150.25242635637056, + "learning_rate": 6.0208333333333334e-06, + "loss": 1.8661, + "step": 289 + }, + { + "epoch": 0.11793411956079707, + "grad_norm": 30.990221255919867, + "learning_rate": 6.041666666666667e-06, + "loss": 0.6506, + "step": 290 + }, + { + "epoch": 0.11834078893859293, + "grad_norm": 87.76773291038377, + "learning_rate": 6.0625e-06, + "loss": 1.169, + "step": 291 + }, + { + "epoch": 0.11874745831638878, + "grad_norm": 82.18367590764824, + "learning_rate": 6.083333333333333e-06, + "loss": 1.0294, + "step": 292 + }, + { + "epoch": 0.11915412769418463, + "grad_norm": 28.575527021713974, + "learning_rate": 6.104166666666667e-06, + "loss": 0.3058, + "step": 293 + }, + { + "epoch": 0.11956079707198047, + "grad_norm": 40.375478347876566, + "learning_rate": 6.125000000000001e-06, + "loss": 0.2497, + "step": 294 + }, + { + "epoch": 0.11996746644977634, + "grad_norm": 53.76109987465498, + "learning_rate": 6.145833333333334e-06, + "loss": 0.3559, + "step": 295 + }, + { + "epoch": 0.12037413582757218, + "grad_norm": 34.39829075892675, + "learning_rate": 6.166666666666667e-06, + "loss": 0.6336, + "step": 296 + }, + { + "epoch": 0.12078080520536803, + "grad_norm": 7.158999774924311, + "learning_rate": 6.1875000000000005e-06, + "loss": 0.0662, + "step": 297 + }, + { + "epoch": 0.12118747458316388, + "grad_norm": 73.66171062118299, + "learning_rate": 6.208333333333334e-06, + "loss": 1.4739, + "step": 298 + }, + { + "epoch": 0.12159414396095974, + "grad_norm": 115.2100519906905, + "learning_rate": 6.229166666666667e-06, + "loss": 1.179, + "step": 299 + }, + { + "epoch": 0.12200081333875559, + "grad_norm": 61.629063006125115, + "learning_rate": 6.25e-06, + "loss": 1.2592, + "step": 300 + }, + { + "epoch": 0.12240748271655144, + "grad_norm": 145.79089538785945, + "learning_rate": 6.2708333333333336e-06, + "loss": 1.34, + "step": 301 + }, + { + "epoch": 0.12281415209434729, + "grad_norm": 96.31169956377182, + "learning_rate": 6.291666666666667e-06, + "loss": 1.8207, + "step": 302 + }, + { + "epoch": 0.12322082147214315, + "grad_norm": 53.013988645007956, + "learning_rate": 6.3125e-06, + "loss": 1.0485, + "step": 303 + }, + { + "epoch": 0.123627490849939, + "grad_norm": 93.42561819575333, + "learning_rate": 6.333333333333333e-06, + "loss": 1.5606, + "step": 304 + }, + { + "epoch": 0.12403416022773485, + "grad_norm": 25.4649095651461, + "learning_rate": 6.354166666666667e-06, + "loss": 0.2444, + "step": 305 + }, + { + "epoch": 0.1244408296055307, + "grad_norm": 12.288612195350101, + "learning_rate": 6.375e-06, + "loss": 0.101, + "step": 306 + }, + { + "epoch": 0.12484749898332656, + "grad_norm": 68.36924959197758, + "learning_rate": 6.395833333333333e-06, + "loss": 1.5949, + "step": 307 + }, + { + "epoch": 0.1252541683611224, + "grad_norm": 68.18265156418656, + "learning_rate": 6.416666666666667e-06, + "loss": 0.5624, + "step": 308 + }, + { + "epoch": 0.12566083773891826, + "grad_norm": 31.322206367616925, + "learning_rate": 6.437500000000001e-06, + "loss": 0.6888, + "step": 309 + }, + { + "epoch": 0.1260675071167141, + "grad_norm": 35.584167245074305, + "learning_rate": 6.458333333333334e-06, + "loss": 0.1473, + "step": 310 + }, + { + "epoch": 0.12647417649450995, + "grad_norm": 47.61486176341431, + 
"learning_rate": 6.479166666666667e-06, + "loss": 1.0086, + "step": 311 + }, + { + "epoch": 0.1268808458723058, + "grad_norm": 60.55620101500928, + "learning_rate": 6.5000000000000004e-06, + "loss": 1.3934, + "step": 312 + }, + { + "epoch": 0.12728751525010168, + "grad_norm": 42.50716735554995, + "learning_rate": 6.520833333333334e-06, + "loss": 0.7618, + "step": 313 + }, + { + "epoch": 0.12769418462789753, + "grad_norm": 18.327104667419828, + "learning_rate": 6.541666666666667e-06, + "loss": 0.1275, + "step": 314 + }, + { + "epoch": 0.12810085400569338, + "grad_norm": 24.925229891690254, + "learning_rate": 6.5625e-06, + "loss": 0.2641, + "step": 315 + }, + { + "epoch": 0.12850752338348922, + "grad_norm": 66.28867787174816, + "learning_rate": 6.5833333333333335e-06, + "loss": 0.2627, + "step": 316 + }, + { + "epoch": 0.12891419276128507, + "grad_norm": 75.28892087749902, + "learning_rate": 6.604166666666667e-06, + "loss": 0.7671, + "step": 317 + }, + { + "epoch": 0.12932086213908092, + "grad_norm": 29.433141357347342, + "learning_rate": 6.625e-06, + "loss": 0.5362, + "step": 318 + }, + { + "epoch": 0.12972753151687677, + "grad_norm": 65.10730365097598, + "learning_rate": 6.645833333333333e-06, + "loss": 1.5713, + "step": 319 + }, + { + "epoch": 0.13013420089467262, + "grad_norm": 72.9097312622601, + "learning_rate": 6.666666666666667e-06, + "loss": 1.1415, + "step": 320 + }, + { + "epoch": 0.1305408702724685, + "grad_norm": 38.41076485970153, + "learning_rate": 6.6875e-06, + "loss": 0.5391, + "step": 321 + }, + { + "epoch": 0.13094753965026434, + "grad_norm": 33.61837726722102, + "learning_rate": 6.708333333333333e-06, + "loss": 0.7416, + "step": 322 + }, + { + "epoch": 0.1313542090280602, + "grad_norm": 111.71224822431853, + "learning_rate": 6.729166666666667e-06, + "loss": 1.4179, + "step": 323 + }, + { + "epoch": 0.13176087840585604, + "grad_norm": 131.72323986198057, + "learning_rate": 6.750000000000001e-06, + "loss": 1.827, + "step": 324 + }, + { + "epoch": 0.1321675477836519, + "grad_norm": 60.22874183586326, + "learning_rate": 6.770833333333334e-06, + "loss": 1.6732, + "step": 325 + }, + { + "epoch": 0.13257421716144774, + "grad_norm": 72.85418852790401, + "learning_rate": 6.791666666666667e-06, + "loss": 0.8674, + "step": 326 + }, + { + "epoch": 0.1329808865392436, + "grad_norm": 77.6871452722407, + "learning_rate": 6.8125e-06, + "loss": 1.3253, + "step": 327 + }, + { + "epoch": 0.13338755591703944, + "grad_norm": 82.6593074460262, + "learning_rate": 6.833333333333334e-06, + "loss": 2.7005, + "step": 328 + }, + { + "epoch": 0.1337942252948353, + "grad_norm": 40.74800066599511, + "learning_rate": 6.854166666666667e-06, + "loss": 0.6796, + "step": 329 + }, + { + "epoch": 0.13420089467263116, + "grad_norm": 51.70340407622741, + "learning_rate": 6.875e-06, + "loss": 0.7598, + "step": 330 + }, + { + "epoch": 0.134607564050427, + "grad_norm": 60.64983417859017, + "learning_rate": 6.8958333333333335e-06, + "loss": 2.2498, + "step": 331 + }, + { + "epoch": 0.13501423342822286, + "grad_norm": 49.35197635185438, + "learning_rate": 6.916666666666667e-06, + "loss": 0.4918, + "step": 332 + }, + { + "epoch": 0.1354209028060187, + "grad_norm": 53.30484258835198, + "learning_rate": 6.9375e-06, + "loss": 0.5234, + "step": 333 + }, + { + "epoch": 0.13582757218381455, + "grad_norm": 78.67207954478641, + "learning_rate": 6.958333333333333e-06, + "loss": 1.6529, + "step": 334 + }, + { + "epoch": 0.1362342415616104, + "grad_norm": 37.74273674923941, + "learning_rate": 6.979166666666667e-06, + "loss": 
0.3949, + "step": 335 + }, + { + "epoch": 0.13664091093940625, + "grad_norm": 32.44239893306836, + "learning_rate": 7e-06, + "loss": 0.3002, + "step": 336 + }, + { + "epoch": 0.13704758031720213, + "grad_norm": 24.109192812132974, + "learning_rate": 7.020833333333333e-06, + "loss": 0.3334, + "step": 337 + }, + { + "epoch": 0.13745424969499798, + "grad_norm": 44.41708073354437, + "learning_rate": 7.041666666666668e-06, + "loss": 1.4986, + "step": 338 + }, + { + "epoch": 0.13786091907279382, + "grad_norm": 13.589456383778407, + "learning_rate": 7.062500000000001e-06, + "loss": 0.1077, + "step": 339 + }, + { + "epoch": 0.13826758845058967, + "grad_norm": 30.174555933743758, + "learning_rate": 7.083333333333335e-06, + "loss": 0.2073, + "step": 340 + }, + { + "epoch": 0.13867425782838552, + "grad_norm": 47.79995725379288, + "learning_rate": 7.104166666666668e-06, + "loss": 0.4506, + "step": 341 + }, + { + "epoch": 0.13908092720618137, + "grad_norm": 16.403040151627348, + "learning_rate": 7.125e-06, + "loss": 0.1446, + "step": 342 + }, + { + "epoch": 0.13948759658397722, + "grad_norm": 15.000759860936355, + "learning_rate": 7.145833333333334e-06, + "loss": 0.1029, + "step": 343 + }, + { + "epoch": 0.13989426596177307, + "grad_norm": 54.69245824639208, + "learning_rate": 7.166666666666667e-06, + "loss": 1.0567, + "step": 344 + }, + { + "epoch": 0.14030093533956894, + "grad_norm": 81.62955727495354, + "learning_rate": 7.1875e-06, + "loss": 0.3942, + "step": 345 + }, + { + "epoch": 0.1407076047173648, + "grad_norm": 44.382593902485354, + "learning_rate": 7.2083333333333335e-06, + "loss": 0.1282, + "step": 346 + }, + { + "epoch": 0.14111427409516064, + "grad_norm": 25.798317342441333, + "learning_rate": 7.229166666666667e-06, + "loss": 0.3489, + "step": 347 + }, + { + "epoch": 0.1415209434729565, + "grad_norm": 31.921389520531964, + "learning_rate": 7.25e-06, + "loss": 0.5214, + "step": 348 + }, + { + "epoch": 0.14192761285075234, + "grad_norm": 38.48800541383903, + "learning_rate": 7.270833333333333e-06, + "loss": 0.6767, + "step": 349 + }, + { + "epoch": 0.1423342822285482, + "grad_norm": 10.759505686231492, + "learning_rate": 7.291666666666667e-06, + "loss": 0.0626, + "step": 350 + }, + { + "epoch": 0.14274095160634404, + "grad_norm": 43.327109657521476, + "learning_rate": 7.3125e-06, + "loss": 0.3831, + "step": 351 + }, + { + "epoch": 0.14314762098413988, + "grad_norm": 94.23365015571191, + "learning_rate": 7.333333333333333e-06, + "loss": 2.5651, + "step": 352 + }, + { + "epoch": 0.14355429036193573, + "grad_norm": 51.26003486764627, + "learning_rate": 7.354166666666668e-06, + "loss": 0.9502, + "step": 353 + }, + { + "epoch": 0.1439609597397316, + "grad_norm": 11.408693276210691, + "learning_rate": 7.375000000000001e-06, + "loss": 0.0772, + "step": 354 + }, + { + "epoch": 0.14436762911752746, + "grad_norm": 52.27654344130972, + "learning_rate": 7.395833333333335e-06, + "loss": 1.4488, + "step": 355 + }, + { + "epoch": 0.1447742984953233, + "grad_norm": 59.490861851130084, + "learning_rate": 7.416666666666668e-06, + "loss": 0.8236, + "step": 356 + }, + { + "epoch": 0.14518096787311915, + "grad_norm": 30.030771021441996, + "learning_rate": 7.437500000000001e-06, + "loss": 0.5327, + "step": 357 + }, + { + "epoch": 0.145587637250915, + "grad_norm": 59.78467848698099, + "learning_rate": 7.4583333333333345e-06, + "loss": 1.5468, + "step": 358 + }, + { + "epoch": 0.14599430662871085, + "grad_norm": 57.88557834996238, + "learning_rate": 7.479166666666668e-06, + "loss": 1.1804, + "step": 359 + }, + { + 
"epoch": 0.1464009760065067, + "grad_norm": 55.37403484250382, + "learning_rate": 7.500000000000001e-06, + "loss": 0.758, + "step": 360 + }, + { + "epoch": 0.14680764538430255, + "grad_norm": 37.85471595092012, + "learning_rate": 7.5208333333333335e-06, + "loss": 0.3003, + "step": 361 + }, + { + "epoch": 0.14721431476209842, + "grad_norm": 18.300707128287144, + "learning_rate": 7.541666666666667e-06, + "loss": 0.1085, + "step": 362 + }, + { + "epoch": 0.14762098413989427, + "grad_norm": 22.557157401393834, + "learning_rate": 7.5625e-06, + "loss": 0.3753, + "step": 363 + }, + { + "epoch": 0.14802765351769012, + "grad_norm": 13.890189401822564, + "learning_rate": 7.583333333333333e-06, + "loss": 0.1546, + "step": 364 + }, + { + "epoch": 0.14843432289548597, + "grad_norm": 56.405872224826574, + "learning_rate": 7.6041666666666666e-06, + "loss": 1.3458, + "step": 365 + }, + { + "epoch": 0.14884099227328182, + "grad_norm": 54.41470902189611, + "learning_rate": 7.625e-06, + "loss": 1.1994, + "step": 366 + }, + { + "epoch": 0.14924766165107767, + "grad_norm": 40.55709563123388, + "learning_rate": 7.645833333333334e-06, + "loss": 0.7243, + "step": 367 + }, + { + "epoch": 0.14965433102887352, + "grad_norm": 36.43003563793312, + "learning_rate": 7.666666666666667e-06, + "loss": 0.5381, + "step": 368 + }, + { + "epoch": 0.15006100040666936, + "grad_norm": 32.52333253620867, + "learning_rate": 7.6875e-06, + "loss": 0.867, + "step": 369 + }, + { + "epoch": 0.15046766978446524, + "grad_norm": 24.100756303110963, + "learning_rate": 7.708333333333334e-06, + "loss": 0.126, + "step": 370 + }, + { + "epoch": 0.1508743391622611, + "grad_norm": 46.301351571245476, + "learning_rate": 7.729166666666667e-06, + "loss": 0.8926, + "step": 371 + }, + { + "epoch": 0.15128100854005694, + "grad_norm": 13.68026910111389, + "learning_rate": 7.75e-06, + "loss": 0.0901, + "step": 372 + }, + { + "epoch": 0.1516876779178528, + "grad_norm": 27.742740300900774, + "learning_rate": 7.770833333333334e-06, + "loss": 0.4017, + "step": 373 + }, + { + "epoch": 0.15209434729564864, + "grad_norm": 32.69380641759009, + "learning_rate": 7.791666666666667e-06, + "loss": 0.4595, + "step": 374 + }, + { + "epoch": 0.15250101667344448, + "grad_norm": 73.01345271351195, + "learning_rate": 7.8125e-06, + "loss": 3.0641, + "step": 375 + }, + { + "epoch": 0.15290768605124033, + "grad_norm": 30.01186922287125, + "learning_rate": 7.833333333333333e-06, + "loss": 0.3776, + "step": 376 + }, + { + "epoch": 0.15331435542903618, + "grad_norm": 25.422465940909465, + "learning_rate": 7.854166666666667e-06, + "loss": 0.4204, + "step": 377 + }, + { + "epoch": 0.15372102480683206, + "grad_norm": 30.159601329959774, + "learning_rate": 7.875e-06, + "loss": 0.3043, + "step": 378 + }, + { + "epoch": 0.1541276941846279, + "grad_norm": 19.00910347424478, + "learning_rate": 7.895833333333333e-06, + "loss": 0.1801, + "step": 379 + }, + { + "epoch": 0.15453436356242375, + "grad_norm": 71.86792734861932, + "learning_rate": 7.916666666666667e-06, + "loss": 0.9563, + "step": 380 + }, + { + "epoch": 0.1549410329402196, + "grad_norm": 35.02066639338573, + "learning_rate": 7.9375e-06, + "loss": 0.1951, + "step": 381 + }, + { + "epoch": 0.15534770231801545, + "grad_norm": 33.434447114386955, + "learning_rate": 7.958333333333333e-06, + "loss": 0.5159, + "step": 382 + }, + { + "epoch": 0.1557543716958113, + "grad_norm": 34.14801448489325, + "learning_rate": 7.979166666666668e-06, + "loss": 0.6164, + "step": 383 + }, + { + "epoch": 0.15616104107360715, + "grad_norm": 
26.543489022450657, + "learning_rate": 8.000000000000001e-06, + "loss": 0.2668, + "step": 384 + }, + { + "epoch": 0.156567710451403, + "grad_norm": 8.180506167254736, + "learning_rate": 8.020833333333335e-06, + "loss": 0.0973, + "step": 385 + }, + { + "epoch": 0.15697437982919887, + "grad_norm": 56.71276967911715, + "learning_rate": 8.041666666666668e-06, + "loss": 0.6445, + "step": 386 + }, + { + "epoch": 0.15738104920699472, + "grad_norm": 6.439701164382684, + "learning_rate": 8.062500000000001e-06, + "loss": 0.0413, + "step": 387 + }, + { + "epoch": 0.15778771858479057, + "grad_norm": 27.916440431665038, + "learning_rate": 8.083333333333334e-06, + "loss": 0.1844, + "step": 388 + }, + { + "epoch": 0.15819438796258642, + "grad_norm": 28.12388691500953, + "learning_rate": 8.104166666666668e-06, + "loss": 0.3658, + "step": 389 + }, + { + "epoch": 0.15860105734038227, + "grad_norm": 124.89643505970646, + "learning_rate": 8.125000000000001e-06, + "loss": 1.3604, + "step": 390 + }, + { + "epoch": 0.15900772671817812, + "grad_norm": 50.09373358406754, + "learning_rate": 8.145833333333334e-06, + "loss": 0.7342, + "step": 391 + }, + { + "epoch": 0.15941439609597396, + "grad_norm": 9.436296533293145, + "learning_rate": 8.166666666666668e-06, + "loss": 0.0797, + "step": 392 + }, + { + "epoch": 0.1598210654737698, + "grad_norm": 30.668395667821052, + "learning_rate": 8.1875e-06, + "loss": 0.6796, + "step": 393 + }, + { + "epoch": 0.1602277348515657, + "grad_norm": 52.52196593310497, + "learning_rate": 8.208333333333334e-06, + "loss": 1.2321, + "step": 394 + }, + { + "epoch": 0.16063440422936154, + "grad_norm": 28.7546602465631, + "learning_rate": 8.229166666666667e-06, + "loss": 0.1619, + "step": 395 + }, + { + "epoch": 0.1610410736071574, + "grad_norm": 6.239691116970831, + "learning_rate": 8.25e-06, + "loss": 0.0153, + "step": 396 + }, + { + "epoch": 0.16144774298495324, + "grad_norm": 45.95688911380342, + "learning_rate": 8.270833333333334e-06, + "loss": 1.0343, + "step": 397 + }, + { + "epoch": 0.16185441236274908, + "grad_norm": 58.58529605150606, + "learning_rate": 8.291666666666667e-06, + "loss": 0.8245, + "step": 398 + }, + { + "epoch": 0.16226108174054493, + "grad_norm": 34.74303531491524, + "learning_rate": 8.3125e-06, + "loss": 0.4856, + "step": 399 + }, + { + "epoch": 0.16266775111834078, + "grad_norm": 12.18603219705827, + "learning_rate": 8.333333333333334e-06, + "loss": 0.0745, + "step": 400 + }, + { + "epoch": 0.16307442049613663, + "grad_norm": 44.06043755603017, + "learning_rate": 8.354166666666667e-06, + "loss": 0.8958, + "step": 401 + }, + { + "epoch": 0.1634810898739325, + "grad_norm": 7.758736583587454, + "learning_rate": 8.375e-06, + "loss": 0.0554, + "step": 402 + }, + { + "epoch": 0.16388775925172835, + "grad_norm": 41.30990457172979, + "learning_rate": 8.395833333333334e-06, + "loss": 0.7035, + "step": 403 + }, + { + "epoch": 0.1642944286295242, + "grad_norm": 25.644110798929905, + "learning_rate": 8.416666666666667e-06, + "loss": 0.2607, + "step": 404 + }, + { + "epoch": 0.16470109800732005, + "grad_norm": 42.98116183188003, + "learning_rate": 8.4375e-06, + "loss": 0.7718, + "step": 405 + }, + { + "epoch": 0.1651077673851159, + "grad_norm": 62.61598146244513, + "learning_rate": 8.458333333333333e-06, + "loss": 1.1736, + "step": 406 + }, + { + "epoch": 0.16551443676291175, + "grad_norm": 33.12252793534539, + "learning_rate": 8.479166666666667e-06, + "loss": 0.8806, + "step": 407 + }, + { + "epoch": 0.1659211061407076, + "grad_norm": 60.959032448648856, + "learning_rate": 
8.5e-06, + "loss": 1.3954, + "step": 408 + }, + { + "epoch": 0.16632777551850345, + "grad_norm": 34.95027640033073, + "learning_rate": 8.520833333333333e-06, + "loss": 0.4315, + "step": 409 + }, + { + "epoch": 0.16673444489629932, + "grad_norm": 84.36002960558739, + "learning_rate": 8.541666666666666e-06, + "loss": 2.3702, + "step": 410 + }, + { + "epoch": 0.16714111427409517, + "grad_norm": 67.11631343264855, + "learning_rate": 8.5625e-06, + "loss": 1.8927, + "step": 411 + }, + { + "epoch": 0.16754778365189102, + "grad_norm": 20.549604294069994, + "learning_rate": 8.583333333333333e-06, + "loss": 0.2486, + "step": 412 + }, + { + "epoch": 0.16795445302968687, + "grad_norm": 44.94495180311174, + "learning_rate": 8.604166666666668e-06, + "loss": 0.5333, + "step": 413 + }, + { + "epoch": 0.16836112240748272, + "grad_norm": 47.46909176112974, + "learning_rate": 8.625000000000001e-06, + "loss": 0.7567, + "step": 414 + }, + { + "epoch": 0.16876779178527856, + "grad_norm": 34.19008097999648, + "learning_rate": 8.645833333333335e-06, + "loss": 0.7881, + "step": 415 + }, + { + "epoch": 0.1691744611630744, + "grad_norm": 37.304108785564274, + "learning_rate": 8.666666666666668e-06, + "loss": 0.4425, + "step": 416 + }, + { + "epoch": 0.16958113054087026, + "grad_norm": 19.385833113174005, + "learning_rate": 8.687500000000001e-06, + "loss": 0.2055, + "step": 417 + }, + { + "epoch": 0.16998779991866614, + "grad_norm": 44.3906108221067, + "learning_rate": 8.708333333333334e-06, + "loss": 0.3695, + "step": 418 + }, + { + "epoch": 0.170394469296462, + "grad_norm": 30.01036721808023, + "learning_rate": 8.729166666666668e-06, + "loss": 0.5968, + "step": 419 + }, + { + "epoch": 0.17080113867425784, + "grad_norm": 26.678460429445447, + "learning_rate": 8.750000000000001e-06, + "loss": 0.3436, + "step": 420 + }, + { + "epoch": 0.17120780805205368, + "grad_norm": 16.3924881448555, + "learning_rate": 8.770833333333334e-06, + "loss": 0.1498, + "step": 421 + }, + { + "epoch": 0.17161447742984953, + "grad_norm": 6.540939956795041, + "learning_rate": 8.791666666666667e-06, + "loss": 0.0629, + "step": 422 + }, + { + "epoch": 0.17202114680764538, + "grad_norm": 54.126735780559216, + "learning_rate": 8.8125e-06, + "loss": 0.8589, + "step": 423 + }, + { + "epoch": 0.17242781618544123, + "grad_norm": 38.7581281827909, + "learning_rate": 8.833333333333334e-06, + "loss": 0.5509, + "step": 424 + }, + { + "epoch": 0.17283448556323708, + "grad_norm": 44.86530710710729, + "learning_rate": 8.854166666666667e-06, + "loss": 0.9497, + "step": 425 + }, + { + "epoch": 0.17324115494103293, + "grad_norm": 19.19707506000619, + "learning_rate": 8.875e-06, + "loss": 0.4398, + "step": 426 + }, + { + "epoch": 0.1736478243188288, + "grad_norm": 9.710957386580253, + "learning_rate": 8.895833333333334e-06, + "loss": 0.1879, + "step": 427 + }, + { + "epoch": 0.17405449369662465, + "grad_norm": 6.634160800454023, + "learning_rate": 8.916666666666667e-06, + "loss": 0.0549, + "step": 428 + }, + { + "epoch": 0.1744611630744205, + "grad_norm": 28.16656003729693, + "learning_rate": 8.9375e-06, + "loss": 0.4961, + "step": 429 + }, + { + "epoch": 0.17486783245221635, + "grad_norm": 17.032369630270797, + "learning_rate": 8.958333333333334e-06, + "loss": 0.1927, + "step": 430 + }, + { + "epoch": 0.1752745018300122, + "grad_norm": 23.71086138944021, + "learning_rate": 8.979166666666667e-06, + "loss": 0.4375, + "step": 431 + }, + { + "epoch": 0.17568117120780805, + "grad_norm": 21.525148728706398, + "learning_rate": 9e-06, + "loss": 0.2138, + "step": 432 
+ }, + { + "epoch": 0.1760878405856039, + "grad_norm": 5.162480601525476, + "learning_rate": 9.020833333333334e-06, + "loss": 0.1084, + "step": 433 + }, + { + "epoch": 0.17649450996339974, + "grad_norm": 36.95372311751226, + "learning_rate": 9.041666666666667e-06, + "loss": 0.2927, + "step": 434 + }, + { + "epoch": 0.17690117934119562, + "grad_norm": 4.101208222889586, + "learning_rate": 9.0625e-06, + "loss": 0.0266, + "step": 435 + }, + { + "epoch": 0.17730784871899147, + "grad_norm": 38.63530388986449, + "learning_rate": 9.083333333333333e-06, + "loss": 0.8194, + "step": 436 + }, + { + "epoch": 0.17771451809678732, + "grad_norm": 48.0170590248312, + "learning_rate": 9.104166666666667e-06, + "loss": 1.6001, + "step": 437 + }, + { + "epoch": 0.17812118747458316, + "grad_norm": 9.117275507557029, + "learning_rate": 9.125e-06, + "loss": 0.0989, + "step": 438 + }, + { + "epoch": 0.178527856852379, + "grad_norm": 34.389557392477016, + "learning_rate": 9.145833333333333e-06, + "loss": 0.6701, + "step": 439 + }, + { + "epoch": 0.17893452623017486, + "grad_norm": 52.917868125120386, + "learning_rate": 9.166666666666666e-06, + "loss": 0.7575, + "step": 440 + }, + { + "epoch": 0.1793411956079707, + "grad_norm": 58.335625789305595, + "learning_rate": 9.1875e-06, + "loss": 0.9067, + "step": 441 + }, + { + "epoch": 0.17974786498576656, + "grad_norm": 50.22865480644807, + "learning_rate": 9.208333333333333e-06, + "loss": 1.4591, + "step": 442 + }, + { + "epoch": 0.18015453436356244, + "grad_norm": 40.67189162389919, + "learning_rate": 9.229166666666668e-06, + "loss": 0.8591, + "step": 443 + }, + { + "epoch": 0.18056120374135828, + "grad_norm": 39.52336395069826, + "learning_rate": 9.250000000000001e-06, + "loss": 0.5256, + "step": 444 + }, + { + "epoch": 0.18096787311915413, + "grad_norm": 8.13983696934899, + "learning_rate": 9.270833333333334e-06, + "loss": 0.0704, + "step": 445 + }, + { + "epoch": 0.18137454249694998, + "grad_norm": 61.03657481554626, + "learning_rate": 9.291666666666668e-06, + "loss": 0.7396, + "step": 446 + }, + { + "epoch": 0.18178121187474583, + "grad_norm": 26.914056464860344, + "learning_rate": 9.312500000000001e-06, + "loss": 0.1642, + "step": 447 + }, + { + "epoch": 0.18218788125254168, + "grad_norm": 44.8026067674658, + "learning_rate": 9.333333333333334e-06, + "loss": 1.4291, + "step": 448 + }, + { + "epoch": 0.18259455063033753, + "grad_norm": 24.61707773539013, + "learning_rate": 9.354166666666668e-06, + "loss": 0.6202, + "step": 449 + }, + { + "epoch": 0.18300122000813338, + "grad_norm": 55.66130971841137, + "learning_rate": 9.375000000000001e-06, + "loss": 1.6294, + "step": 450 + }, + { + "epoch": 0.18340788938592925, + "grad_norm": 27.91256581497078, + "learning_rate": 9.395833333333334e-06, + "loss": 0.6849, + "step": 451 + }, + { + "epoch": 0.1838145587637251, + "grad_norm": 19.99742889441409, + "learning_rate": 9.416666666666667e-06, + "loss": 0.1054, + "step": 452 + }, + { + "epoch": 0.18422122814152095, + "grad_norm": 52.763103422032465, + "learning_rate": 9.4375e-06, + "loss": 1.4763, + "step": 453 + }, + { + "epoch": 0.1846278975193168, + "grad_norm": 35.02278501220648, + "learning_rate": 9.458333333333334e-06, + "loss": 1.1615, + "step": 454 + }, + { + "epoch": 0.18503456689711265, + "grad_norm": 74.16814563746401, + "learning_rate": 9.479166666666667e-06, + "loss": 2.0158, + "step": 455 + }, + { + "epoch": 0.1854412362749085, + "grad_norm": 15.310656250676013, + "learning_rate": 9.5e-06, + "loss": 0.34, + "step": 456 + }, + { + "epoch": 0.18584790565270434, + 
"grad_norm": 23.734443075862192, + "learning_rate": 9.520833333333334e-06, + "loss": 0.351, + "step": 457 + }, + { + "epoch": 0.1862545750305002, + "grad_norm": 53.93324819429116, + "learning_rate": 9.541666666666669e-06, + "loss": 1.5211, + "step": 458 + }, + { + "epoch": 0.18666124440829607, + "grad_norm": 74.6588472367239, + "learning_rate": 9.562500000000002e-06, + "loss": 2.0629, + "step": 459 + }, + { + "epoch": 0.18706791378609192, + "grad_norm": 64.01622843600533, + "learning_rate": 9.583333333333335e-06, + "loss": 1.4337, + "step": 460 + }, + { + "epoch": 0.18747458316388776, + "grad_norm": 35.90277196018464, + "learning_rate": 9.604166666666669e-06, + "loss": 0.8892, + "step": 461 + }, + { + "epoch": 0.1878812525416836, + "grad_norm": 29.374725009767364, + "learning_rate": 9.625e-06, + "loss": 0.791, + "step": 462 + }, + { + "epoch": 0.18828792191947946, + "grad_norm": 22.986214161750883, + "learning_rate": 9.645833333333333e-06, + "loss": 0.7196, + "step": 463 + }, + { + "epoch": 0.1886945912972753, + "grad_norm": 30.941209258106326, + "learning_rate": 9.666666666666667e-06, + "loss": 0.8221, + "step": 464 + }, + { + "epoch": 0.18910126067507116, + "grad_norm": 42.011515049283155, + "learning_rate": 9.6875e-06, + "loss": 1.0127, + "step": 465 + }, + { + "epoch": 0.189507930052867, + "grad_norm": 46.83937106364435, + "learning_rate": 9.708333333333333e-06, + "loss": 1.1371, + "step": 466 + }, + { + "epoch": 0.18991459943066288, + "grad_norm": 22.804729842904308, + "learning_rate": 9.729166666666667e-06, + "loss": 0.5198, + "step": 467 + }, + { + "epoch": 0.19032126880845873, + "grad_norm": 38.93437700315232, + "learning_rate": 9.75e-06, + "loss": 0.8445, + "step": 468 + }, + { + "epoch": 0.19072793818625458, + "grad_norm": 22.709737780796512, + "learning_rate": 9.770833333333333e-06, + "loss": 0.1431, + "step": 469 + }, + { + "epoch": 0.19113460756405043, + "grad_norm": 16.53488756821469, + "learning_rate": 9.791666666666666e-06, + "loss": 0.3786, + "step": 470 + }, + { + "epoch": 0.19154127694184628, + "grad_norm": 8.698087774672596, + "learning_rate": 9.8125e-06, + "loss": 0.1101, + "step": 471 + }, + { + "epoch": 0.19194794631964213, + "grad_norm": 21.01327137378615, + "learning_rate": 9.833333333333333e-06, + "loss": 0.464, + "step": 472 + }, + { + "epoch": 0.19235461569743798, + "grad_norm": 40.325850590741354, + "learning_rate": 9.854166666666668e-06, + "loss": 0.2629, + "step": 473 + }, + { + "epoch": 0.19276128507523382, + "grad_norm": 35.458288730867096, + "learning_rate": 9.875000000000001e-06, + "loss": 0.9404, + "step": 474 + }, + { + "epoch": 0.1931679544530297, + "grad_norm": 23.32078120033786, + "learning_rate": 9.895833333333334e-06, + "loss": 0.3589, + "step": 475 + }, + { + "epoch": 0.19357462383082555, + "grad_norm": 17.35372462278276, + "learning_rate": 9.916666666666668e-06, + "loss": 0.1159, + "step": 476 + }, + { + "epoch": 0.1939812932086214, + "grad_norm": 42.085411853102954, + "learning_rate": 9.937500000000001e-06, + "loss": 1.5124, + "step": 477 + }, + { + "epoch": 0.19438796258641725, + "grad_norm": 22.12548908793643, + "learning_rate": 9.958333333333334e-06, + "loss": 0.696, + "step": 478 + }, + { + "epoch": 0.1947946319642131, + "grad_norm": 23.626064074107497, + "learning_rate": 9.979166666666668e-06, + "loss": 0.5169, + "step": 479 + }, + { + "epoch": 0.19520130134200894, + "grad_norm": 19.092224418335917, + "learning_rate": 1e-05, + "loss": 0.2752, + "step": 480 + }, + { + "epoch": 0.1956079707198048, + "grad_norm": 27.51165768562954, + 
"learning_rate": 1.0020833333333336e-05, + "loss": 0.7622, + "step": 481 + }, + { + "epoch": 0.19601464009760064, + "grad_norm": 28.275262379788597, + "learning_rate": 1.0041666666666667e-05, + "loss": 0.6185, + "step": 482 + }, + { + "epoch": 0.19642130947539652, + "grad_norm": 6.05590107438126, + "learning_rate": 1.0062500000000002e-05, + "loss": 0.0523, + "step": 483 + }, + { + "epoch": 0.19682797885319236, + "grad_norm": 29.771137150811338, + "learning_rate": 1.0083333333333334e-05, + "loss": 0.5641, + "step": 484 + }, + { + "epoch": 0.1972346482309882, + "grad_norm": 22.338869053908304, + "learning_rate": 1.0104166666666669e-05, + "loss": 0.5363, + "step": 485 + }, + { + "epoch": 0.19764131760878406, + "grad_norm": 40.10135348369766, + "learning_rate": 1.0125e-05, + "loss": 0.9278, + "step": 486 + }, + { + "epoch": 0.1980479869865799, + "grad_norm": 22.853853877414288, + "learning_rate": 1.0145833333333335e-05, + "loss": 0.3429, + "step": 487 + }, + { + "epoch": 0.19845465636437576, + "grad_norm": 29.100257390218825, + "learning_rate": 1.0166666666666667e-05, + "loss": 0.8602, + "step": 488 + }, + { + "epoch": 0.1988613257421716, + "grad_norm": 36.861381611976995, + "learning_rate": 1.0187500000000002e-05, + "loss": 1.1773, + "step": 489 + }, + { + "epoch": 0.19926799511996746, + "grad_norm": 14.343701552607127, + "learning_rate": 1.0208333333333334e-05, + "loss": 0.4846, + "step": 490 + }, + { + "epoch": 0.19967466449776333, + "grad_norm": 26.321476844595, + "learning_rate": 1.0229166666666669e-05, + "loss": 0.4101, + "step": 491 + }, + { + "epoch": 0.20008133387555918, + "grad_norm": 26.117891402433987, + "learning_rate": 1.025e-05, + "loss": 0.7093, + "step": 492 + }, + { + "epoch": 0.20048800325335503, + "grad_norm": 28.56583902580133, + "learning_rate": 1.0270833333333335e-05, + "loss": 0.7006, + "step": 493 + }, + { + "epoch": 0.20089467263115088, + "grad_norm": 13.522075052479405, + "learning_rate": 1.0291666666666667e-05, + "loss": 0.1691, + "step": 494 + }, + { + "epoch": 0.20130134200894673, + "grad_norm": 39.06900711449438, + "learning_rate": 1.0312500000000002e-05, + "loss": 0.8686, + "step": 495 + }, + { + "epoch": 0.20170801138674258, + "grad_norm": 23.29992623115392, + "learning_rate": 1.0333333333333335e-05, + "loss": 0.3809, + "step": 496 + }, + { + "epoch": 0.20211468076453842, + "grad_norm": 32.57783771234993, + "learning_rate": 1.0354166666666668e-05, + "loss": 0.7055, + "step": 497 + }, + { + "epoch": 0.20252135014233427, + "grad_norm": 24.420390996947777, + "learning_rate": 1.0375000000000001e-05, + "loss": 0.5236, + "step": 498 + }, + { + "epoch": 0.20292801952013012, + "grad_norm": 22.45403291883152, + "learning_rate": 1.0395833333333333e-05, + "loss": 0.5956, + "step": 499 + }, + { + "epoch": 0.203334688897926, + "grad_norm": 28.26541641948113, + "learning_rate": 1.0416666666666668e-05, + "loss": 0.3753, + "step": 500 + }, + { + "epoch": 0.20374135827572185, + "grad_norm": 38.4928671260241, + "learning_rate": 1.04375e-05, + "loss": 0.8777, + "step": 501 + }, + { + "epoch": 0.2041480276535177, + "grad_norm": 3.8659378488289162, + "learning_rate": 1.0458333333333335e-05, + "loss": 0.0348, + "step": 502 + }, + { + "epoch": 0.20455469703131354, + "grad_norm": 19.654179832476192, + "learning_rate": 1.0479166666666666e-05, + "loss": 0.2644, + "step": 503 + }, + { + "epoch": 0.2049613664091094, + "grad_norm": 51.42248731438115, + "learning_rate": 1.0500000000000001e-05, + "loss": 1.8288, + "step": 504 + }, + { + "epoch": 0.20536803578690524, + "grad_norm": 
7.897496381893316, + "learning_rate": 1.0520833333333333e-05, + "loss": 0.0605, + "step": 505 + }, + { + "epoch": 0.2057747051647011, + "grad_norm": 34.50884128301799, + "learning_rate": 1.0541666666666668e-05, + "loss": 0.8239, + "step": 506 + }, + { + "epoch": 0.20618137454249694, + "grad_norm": 28.857071123395897, + "learning_rate": 1.05625e-05, + "loss": 0.2667, + "step": 507 + }, + { + "epoch": 0.2065880439202928, + "grad_norm": 13.93792380934274, + "learning_rate": 1.0583333333333334e-05, + "loss": 0.1338, + "step": 508 + }, + { + "epoch": 0.20699471329808866, + "grad_norm": 71.90002657763158, + "learning_rate": 1.0604166666666666e-05, + "loss": 0.6414, + "step": 509 + }, + { + "epoch": 0.2074013826758845, + "grad_norm": 28.60432562229816, + "learning_rate": 1.0625e-05, + "loss": 0.6281, + "step": 510 + }, + { + "epoch": 0.20780805205368036, + "grad_norm": 14.37255498600301, + "learning_rate": 1.0645833333333336e-05, + "loss": 0.2737, + "step": 511 + }, + { + "epoch": 0.2082147214314762, + "grad_norm": 13.004615373461853, + "learning_rate": 1.0666666666666667e-05, + "loss": 0.2042, + "step": 512 + }, + { + "epoch": 0.20862139080927206, + "grad_norm": 43.08445062051723, + "learning_rate": 1.0687500000000002e-05, + "loss": 0.309, + "step": 513 + }, + { + "epoch": 0.2090280601870679, + "grad_norm": 82.92641985255314, + "learning_rate": 1.0708333333333334e-05, + "loss": 0.5244, + "step": 514 + }, + { + "epoch": 0.20943472956486375, + "grad_norm": 71.29290626008549, + "learning_rate": 1.0729166666666669e-05, + "loss": 0.6057, + "step": 515 + }, + { + "epoch": 0.20984139894265963, + "grad_norm": 24.19757370547137, + "learning_rate": 1.075e-05, + "loss": 0.287, + "step": 516 + }, + { + "epoch": 0.21024806832045548, + "grad_norm": 34.024674027963115, + "learning_rate": 1.0770833333333335e-05, + "loss": 0.7027, + "step": 517 + }, + { + "epoch": 0.21065473769825133, + "grad_norm": 33.55787038461967, + "learning_rate": 1.0791666666666667e-05, + "loss": 0.7832, + "step": 518 + }, + { + "epoch": 0.21106140707604717, + "grad_norm": 32.2218120810686, + "learning_rate": 1.0812500000000002e-05, + "loss": 0.557, + "step": 519 + }, + { + "epoch": 0.21146807645384302, + "grad_norm": 42.89864975010613, + "learning_rate": 1.0833333333333334e-05, + "loss": 1.1267, + "step": 520 + }, + { + "epoch": 0.21187474583163887, + "grad_norm": 49.19080140062996, + "learning_rate": 1.0854166666666668e-05, + "loss": 0.4454, + "step": 521 + }, + { + "epoch": 0.21228141520943472, + "grad_norm": 57.463951602306736, + "learning_rate": 1.0875e-05, + "loss": 1.1022, + "step": 522 + }, + { + "epoch": 0.21268808458723057, + "grad_norm": 11.002520306867682, + "learning_rate": 1.0895833333333335e-05, + "loss": 0.1026, + "step": 523 + }, + { + "epoch": 0.21309475396502645, + "grad_norm": 36.21714554957098, + "learning_rate": 1.0916666666666667e-05, + "loss": 0.4352, + "step": 524 + }, + { + "epoch": 0.2135014233428223, + "grad_norm": 22.604357703983055, + "learning_rate": 1.0937500000000002e-05, + "loss": 0.3185, + "step": 525 + }, + { + "epoch": 0.21390809272061814, + "grad_norm": 35.522353957601744, + "learning_rate": 1.0958333333333335e-05, + "loss": 0.5551, + "step": 526 + }, + { + "epoch": 0.214314762098414, + "grad_norm": 41.10153632062562, + "learning_rate": 1.0979166666666668e-05, + "loss": 0.5481, + "step": 527 + }, + { + "epoch": 0.21472143147620984, + "grad_norm": 15.675397559526626, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.1793, + "step": 528 + }, + { + "epoch": 0.2151281008540057, + "grad_norm": 
45.0175218377818, + "learning_rate": 1.1020833333333335e-05, + "loss": 1.2463, + "step": 529 + }, + { + "epoch": 0.21553477023180154, + "grad_norm": 36.183158128140846, + "learning_rate": 1.1041666666666668e-05, + "loss": 0.2623, + "step": 530 + }, + { + "epoch": 0.21594143960959739, + "grad_norm": 19.665814143149014, + "learning_rate": 1.1062500000000001e-05, + "loss": 0.2985, + "step": 531 + }, + { + "epoch": 0.21634810898739326, + "grad_norm": 19.45839157721603, + "learning_rate": 1.1083333333333335e-05, + "loss": 0.0798, + "step": 532 + }, + { + "epoch": 0.2167547783651891, + "grad_norm": 36.3415450651554, + "learning_rate": 1.1104166666666668e-05, + "loss": 0.2952, + "step": 533 + }, + { + "epoch": 0.21716144774298496, + "grad_norm": 19.001550926612445, + "learning_rate": 1.1125000000000001e-05, + "loss": 0.2787, + "step": 534 + }, + { + "epoch": 0.2175681171207808, + "grad_norm": 35.041570659162716, + "learning_rate": 1.1145833333333334e-05, + "loss": 0.4279, + "step": 535 + }, + { + "epoch": 0.21797478649857666, + "grad_norm": 12.966408549063388, + "learning_rate": 1.1166666666666668e-05, + "loss": 0.0991, + "step": 536 + }, + { + "epoch": 0.2183814558763725, + "grad_norm": 35.1262813076481, + "learning_rate": 1.11875e-05, + "loss": 0.3632, + "step": 537 + }, + { + "epoch": 0.21878812525416835, + "grad_norm": 2.666378554855736, + "learning_rate": 1.1208333333333334e-05, + "loss": 0.0246, + "step": 538 + }, + { + "epoch": 0.2191947946319642, + "grad_norm": 52.52182058705012, + "learning_rate": 1.1229166666666666e-05, + "loss": 0.6804, + "step": 539 + }, + { + "epoch": 0.21960146400976008, + "grad_norm": 51.91549051012835, + "learning_rate": 1.125e-05, + "loss": 0.8946, + "step": 540 + }, + { + "epoch": 0.22000813338755593, + "grad_norm": 40.23710403186817, + "learning_rate": 1.1270833333333336e-05, + "loss": 1.063, + "step": 541 + }, + { + "epoch": 0.22041480276535177, + "grad_norm": 85.62413947141128, + "learning_rate": 1.1291666666666667e-05, + "loss": 0.5114, + "step": 542 + }, + { + "epoch": 0.22082147214314762, + "grad_norm": 39.14326846338795, + "learning_rate": 1.1312500000000002e-05, + "loss": 0.757, + "step": 543 + }, + { + "epoch": 0.22122814152094347, + "grad_norm": 29.947060508625164, + "learning_rate": 1.1333333333333334e-05, + "loss": 0.7722, + "step": 544 + }, + { + "epoch": 0.22163481089873932, + "grad_norm": 23.692613788769908, + "learning_rate": 1.1354166666666669e-05, + "loss": 0.4085, + "step": 545 + }, + { + "epoch": 0.22204148027653517, + "grad_norm": 44.579837965386275, + "learning_rate": 1.1375e-05, + "loss": 1.1877, + "step": 546 + }, + { + "epoch": 0.22244814965433102, + "grad_norm": 13.031468966096504, + "learning_rate": 1.1395833333333335e-05, + "loss": 0.1311, + "step": 547 + }, + { + "epoch": 0.2228548190321269, + "grad_norm": 68.45242987434148, + "learning_rate": 1.1416666666666667e-05, + "loss": 0.7715, + "step": 548 + }, + { + "epoch": 0.22326148840992274, + "grad_norm": 84.51860069640279, + "learning_rate": 1.1437500000000002e-05, + "loss": 0.5924, + "step": 549 + }, + { + "epoch": 0.2236681577877186, + "grad_norm": 1.0130943153296754, + "learning_rate": 1.1458333333333333e-05, + "loss": 0.0064, + "step": 550 + }, + { + "epoch": 0.22407482716551444, + "grad_norm": 37.96467807661799, + "learning_rate": 1.1479166666666668e-05, + "loss": 1.188, + "step": 551 + }, + { + "epoch": 0.2244814965433103, + "grad_norm": 28.605408999370106, + "learning_rate": 1.15e-05, + "loss": 0.6079, + "step": 552 + }, + { + "epoch": 0.22488816592110614, + "grad_norm": 
38.23240121015085, + "learning_rate": 1.1520833333333335e-05, + "loss": 1.4017, + "step": 553 + }, + { + "epoch": 0.22529483529890199, + "grad_norm": 12.131829882745153, + "learning_rate": 1.1541666666666667e-05, + "loss": 0.0344, + "step": 554 + }, + { + "epoch": 0.22570150467669783, + "grad_norm": 8.796762844914529, + "learning_rate": 1.1562500000000002e-05, + "loss": 0.1061, + "step": 555 + }, + { + "epoch": 0.2261081740544937, + "grad_norm": 43.960637870168796, + "learning_rate": 1.1583333333333335e-05, + "loss": 0.9581, + "step": 556 + }, + { + "epoch": 0.22651484343228956, + "grad_norm": 11.127458384811957, + "learning_rate": 1.1604166666666668e-05, + "loss": 0.0997, + "step": 557 + }, + { + "epoch": 0.2269215128100854, + "grad_norm": 34.51774307027663, + "learning_rate": 1.1625000000000001e-05, + "loss": 0.3822, + "step": 558 + }, + { + "epoch": 0.22732818218788126, + "grad_norm": 28.168383968616716, + "learning_rate": 1.1645833333333335e-05, + "loss": 0.4753, + "step": 559 + }, + { + "epoch": 0.2277348515656771, + "grad_norm": 17.20859636393854, + "learning_rate": 1.1666666666666668e-05, + "loss": 0.292, + "step": 560 + }, + { + "epoch": 0.22814152094347295, + "grad_norm": 17.085095996600863, + "learning_rate": 1.1687500000000001e-05, + "loss": 0.4844, + "step": 561 + }, + { + "epoch": 0.2285481903212688, + "grad_norm": 44.81951051539436, + "learning_rate": 1.1708333333333334e-05, + "loss": 1.3026, + "step": 562 + }, + { + "epoch": 0.22895485969906465, + "grad_norm": 42.27554040020032, + "learning_rate": 1.1729166666666668e-05, + "loss": 0.8495, + "step": 563 + }, + { + "epoch": 0.2293615290768605, + "grad_norm": 63.15817456377932, + "learning_rate": 1.1750000000000001e-05, + "loss": 2.3297, + "step": 564 + }, + { + "epoch": 0.22976819845465637, + "grad_norm": 26.44383767521884, + "learning_rate": 1.1770833333333334e-05, + "loss": 0.3882, + "step": 565 + }, + { + "epoch": 0.23017486783245222, + "grad_norm": 31.639535573995122, + "learning_rate": 1.1791666666666668e-05, + "loss": 1.1136, + "step": 566 + }, + { + "epoch": 0.23058153721024807, + "grad_norm": 50.18029391560642, + "learning_rate": 1.18125e-05, + "loss": 1.2113, + "step": 567 + }, + { + "epoch": 0.23098820658804392, + "grad_norm": 6.810159979850739, + "learning_rate": 1.1833333333333334e-05, + "loss": 0.0544, + "step": 568 + }, + { + "epoch": 0.23139487596583977, + "grad_norm": 42.316213099037896, + "learning_rate": 1.1854166666666667e-05, + "loss": 1.3549, + "step": 569 + }, + { + "epoch": 0.23180154534363562, + "grad_norm": 10.643855998927842, + "learning_rate": 1.1875e-05, + "loss": 0.059, + "step": 570 + }, + { + "epoch": 0.23220821472143147, + "grad_norm": 14.220641300187264, + "learning_rate": 1.1895833333333336e-05, + "loss": 0.3902, + "step": 571 + }, + { + "epoch": 0.23261488409922731, + "grad_norm": 8.354263643904424, + "learning_rate": 1.1916666666666667e-05, + "loss": 0.0594, + "step": 572 + }, + { + "epoch": 0.2330215534770232, + "grad_norm": 40.34594756072348, + "learning_rate": 1.1937500000000002e-05, + "loss": 1.1923, + "step": 573 + }, + { + "epoch": 0.23342822285481904, + "grad_norm": 26.856108100112362, + "learning_rate": 1.1958333333333334e-05, + "loss": 0.8053, + "step": 574 + }, + { + "epoch": 0.2338348922326149, + "grad_norm": 15.624921014466917, + "learning_rate": 1.1979166666666669e-05, + "loss": 0.4528, + "step": 575 + }, + { + "epoch": 0.23424156161041074, + "grad_norm": 18.74793575685916, + "learning_rate": 1.2e-05, + "loss": 0.3287, + "step": 576 + }, + { + "epoch": 0.23464823098820659, + 
"grad_norm": 47.91536030270494, + "learning_rate": 1.2020833333333335e-05, + "loss": 1.1877, + "step": 577 + }, + { + "epoch": 0.23505490036600243, + "grad_norm": 25.451256704156144, + "learning_rate": 1.2041666666666667e-05, + "loss": 0.4921, + "step": 578 + }, + { + "epoch": 0.23546156974379828, + "grad_norm": 42.63094432135677, + "learning_rate": 1.2062500000000002e-05, + "loss": 0.2576, + "step": 579 + }, + { + "epoch": 0.23586823912159413, + "grad_norm": 17.229687785759594, + "learning_rate": 1.2083333333333333e-05, + "loss": 0.3123, + "step": 580 + }, + { + "epoch": 0.23627490849939, + "grad_norm": 63.4970000179766, + "learning_rate": 1.2104166666666668e-05, + "loss": 0.7229, + "step": 581 + }, + { + "epoch": 0.23668157787718586, + "grad_norm": 29.140258492473727, + "learning_rate": 1.2125e-05, + "loss": 0.576, + "step": 582 + }, + { + "epoch": 0.2370882472549817, + "grad_norm": 18.343742466001817, + "learning_rate": 1.2145833333333335e-05, + "loss": 0.17, + "step": 583 + }, + { + "epoch": 0.23749491663277755, + "grad_norm": 15.678608381089793, + "learning_rate": 1.2166666666666667e-05, + "loss": 0.1955, + "step": 584 + }, + { + "epoch": 0.2379015860105734, + "grad_norm": 105.9394704269761, + "learning_rate": 1.2187500000000001e-05, + "loss": 1.8582, + "step": 585 + }, + { + "epoch": 0.23830825538836925, + "grad_norm": 25.722627884075077, + "learning_rate": 1.2208333333333335e-05, + "loss": 0.4679, + "step": 586 + }, + { + "epoch": 0.2387149247661651, + "grad_norm": 19.273329373440244, + "learning_rate": 1.2229166666666668e-05, + "loss": 0.2022, + "step": 587 + }, + { + "epoch": 0.23912159414396095, + "grad_norm": 30.259590466782193, + "learning_rate": 1.2250000000000001e-05, + "loss": 0.704, + "step": 588 + }, + { + "epoch": 0.23952826352175682, + "grad_norm": 30.457950771965386, + "learning_rate": 1.2270833333333335e-05, + "loss": 0.4917, + "step": 589 + }, + { + "epoch": 0.23993493289955267, + "grad_norm": 20.11819955965163, + "learning_rate": 1.2291666666666668e-05, + "loss": 0.3791, + "step": 590 + }, + { + "epoch": 0.24034160227734852, + "grad_norm": 60.756925783165265, + "learning_rate": 1.2312500000000001e-05, + "loss": 1.9618, + "step": 591 + }, + { + "epoch": 0.24074827165514437, + "grad_norm": 66.55406395415993, + "learning_rate": 1.2333333333333334e-05, + "loss": 1.5512, + "step": 592 + }, + { + "epoch": 0.24115494103294022, + "grad_norm": 43.746896015818635, + "learning_rate": 1.2354166666666668e-05, + "loss": 0.5395, + "step": 593 + }, + { + "epoch": 0.24156161041073607, + "grad_norm": 20.677438163275138, + "learning_rate": 1.2375000000000001e-05, + "loss": 0.217, + "step": 594 + }, + { + "epoch": 0.24196827978853191, + "grad_norm": 13.242636676079501, + "learning_rate": 1.2395833333333334e-05, + "loss": 0.2832, + "step": 595 + }, + { + "epoch": 0.24237494916632776, + "grad_norm": 34.76900440121621, + "learning_rate": 1.2416666666666667e-05, + "loss": 0.4939, + "step": 596 + }, + { + "epoch": 0.24278161854412364, + "grad_norm": 11.545365101735838, + "learning_rate": 1.24375e-05, + "loss": 0.2629, + "step": 597 + }, + { + "epoch": 0.2431882879219195, + "grad_norm": 30.21019019216705, + "learning_rate": 1.2458333333333334e-05, + "loss": 0.1443, + "step": 598 + }, + { + "epoch": 0.24359495729971534, + "grad_norm": 57.99944839338558, + "learning_rate": 1.2479166666666667e-05, + "loss": 0.6735, + "step": 599 + }, + { + "epoch": 0.24400162667751119, + "grad_norm": 20.413064456343132, + "learning_rate": 1.25e-05, + "loss": 0.5264, + "step": 600 + }, + { + "epoch": 
0.24440829605530703, + "grad_norm": 42.960144128079556, + "learning_rate": 1.2520833333333336e-05, + "loss": 1.0719, + "step": 601 + }, + { + "epoch": 0.24481496543310288, + "grad_norm": 35.670058838671636, + "learning_rate": 1.2541666666666667e-05, + "loss": 1.0154, + "step": 602 + }, + { + "epoch": 0.24522163481089873, + "grad_norm": 42.34639784849753, + "learning_rate": 1.2562500000000002e-05, + "loss": 0.9278, + "step": 603 + }, + { + "epoch": 0.24562830418869458, + "grad_norm": 39.71534236948768, + "learning_rate": 1.2583333333333334e-05, + "loss": 1.2497, + "step": 604 + }, + { + "epoch": 0.24603497356649046, + "grad_norm": 17.293333710437757, + "learning_rate": 1.2604166666666669e-05, + "loss": 0.4288, + "step": 605 + }, + { + "epoch": 0.2464416429442863, + "grad_norm": 29.996960713490296, + "learning_rate": 1.2625e-05, + "loss": 0.6795, + "step": 606 + }, + { + "epoch": 0.24684831232208215, + "grad_norm": 15.529412702997577, + "learning_rate": 1.2645833333333335e-05, + "loss": 0.3096, + "step": 607 + }, + { + "epoch": 0.247254981699878, + "grad_norm": 37.942205775102224, + "learning_rate": 1.2666666666666667e-05, + "loss": 0.4133, + "step": 608 + }, + { + "epoch": 0.24766165107767385, + "grad_norm": 5.693422293029101, + "learning_rate": 1.2687500000000002e-05, + "loss": 0.0486, + "step": 609 + }, + { + "epoch": 0.2480683204554697, + "grad_norm": 9.422893782137692, + "learning_rate": 1.2708333333333333e-05, + "loss": 0.0776, + "step": 610 + }, + { + "epoch": 0.24847498983326555, + "grad_norm": 7.166191630324573, + "learning_rate": 1.2729166666666668e-05, + "loss": 0.1306, + "step": 611 + }, + { + "epoch": 0.2488816592110614, + "grad_norm": 30.44374803943663, + "learning_rate": 1.275e-05, + "loss": 0.3887, + "step": 612 + }, + { + "epoch": 0.24928832858885727, + "grad_norm": 15.84916097000231, + "learning_rate": 1.2770833333333335e-05, + "loss": 0.3682, + "step": 613 + }, + { + "epoch": 0.24969499796665312, + "grad_norm": 8.532439971638615, + "learning_rate": 1.2791666666666666e-05, + "loss": 0.1503, + "step": 614 + }, + { + "epoch": 0.25010166734444894, + "grad_norm": 66.5951537494938, + "learning_rate": 1.2812500000000001e-05, + "loss": 0.6339, + "step": 615 + }, + { + "epoch": 0.2505083367222448, + "grad_norm": 13.128847332529665, + "learning_rate": 1.2833333333333335e-05, + "loss": 0.4374, + "step": 616 + }, + { + "epoch": 0.2509150061000407, + "grad_norm": 12.793925454215145, + "learning_rate": 1.2854166666666668e-05, + "loss": 0.1128, + "step": 617 + }, + { + "epoch": 0.2513216754778365, + "grad_norm": 27.785584079762813, + "learning_rate": 1.2875000000000001e-05, + "loss": 0.9156, + "step": 618 + }, + { + "epoch": 0.2517283448556324, + "grad_norm": 20.133667174971944, + "learning_rate": 1.2895833333333335e-05, + "loss": 0.3801, + "step": 619 + }, + { + "epoch": 0.2521350142334282, + "grad_norm": 19.630824803208146, + "learning_rate": 1.2916666666666668e-05, + "loss": 0.1838, + "step": 620 + }, + { + "epoch": 0.2525416836112241, + "grad_norm": 27.913178692824964, + "learning_rate": 1.2937500000000001e-05, + "loss": 0.7707, + "step": 621 + }, + { + "epoch": 0.2529483529890199, + "grad_norm": 11.472535241501612, + "learning_rate": 1.2958333333333334e-05, + "loss": 0.1157, + "step": 622 + }, + { + "epoch": 0.2533550223668158, + "grad_norm": 5.338378079461059, + "learning_rate": 1.2979166666666668e-05, + "loss": 0.0608, + "step": 623 + }, + { + "epoch": 0.2537616917446116, + "grad_norm": 41.54473009487125, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.4167, + "step": 624 + 
}, + { + "epoch": 0.2541683611224075, + "grad_norm": 12.124925381744859, + "learning_rate": 1.3020833333333334e-05, + "loss": 0.1563, + "step": 625 + }, + { + "epoch": 0.25457503050020336, + "grad_norm": 1.3016371157487872, + "learning_rate": 1.3041666666666667e-05, + "loss": 0.0075, + "step": 626 + }, + { + "epoch": 0.2549816998779992, + "grad_norm": 17.890051151842396, + "learning_rate": 1.30625e-05, + "loss": 0.2747, + "step": 627 + }, + { + "epoch": 0.25538836925579506, + "grad_norm": 32.61816044281452, + "learning_rate": 1.3083333333333334e-05, + "loss": 1.0142, + "step": 628 + }, + { + "epoch": 0.2557950386335909, + "grad_norm": 26.722959409579722, + "learning_rate": 1.3104166666666667e-05, + "loss": 0.6955, + "step": 629 + }, + { + "epoch": 0.25620170801138675, + "grad_norm": 4.899686824295484, + "learning_rate": 1.3125e-05, + "loss": 0.0415, + "step": 630 + }, + { + "epoch": 0.2566083773891826, + "grad_norm": 31.180995836290627, + "learning_rate": 1.3145833333333336e-05, + "loss": 0.7432, + "step": 631 + }, + { + "epoch": 0.25701504676697845, + "grad_norm": 54.806962245681135, + "learning_rate": 1.3166666666666667e-05, + "loss": 0.4867, + "step": 632 + }, + { + "epoch": 0.25742171614477427, + "grad_norm": 1.7368269274391661, + "learning_rate": 1.3187500000000002e-05, + "loss": 0.0211, + "step": 633 + }, + { + "epoch": 0.25782838552257015, + "grad_norm": 33.36665713675395, + "learning_rate": 1.3208333333333334e-05, + "loss": 0.7702, + "step": 634 + }, + { + "epoch": 0.258235054900366, + "grad_norm": 22.579906553174734, + "learning_rate": 1.3229166666666669e-05, + "loss": 0.2785, + "step": 635 + }, + { + "epoch": 0.25864172427816184, + "grad_norm": 11.078409255801043, + "learning_rate": 1.325e-05, + "loss": 0.1349, + "step": 636 + }, + { + "epoch": 0.2590483936559577, + "grad_norm": 90.82343208287641, + "learning_rate": 1.3270833333333335e-05, + "loss": 1.3511, + "step": 637 + }, + { + "epoch": 0.25945506303375354, + "grad_norm": 138.68664614512141, + "learning_rate": 1.3291666666666667e-05, + "loss": 0.5263, + "step": 638 + }, + { + "epoch": 0.2598617324115494, + "grad_norm": 43.02854774741215, + "learning_rate": 1.3312500000000002e-05, + "loss": 0.619, + "step": 639 + }, + { + "epoch": 0.26026840178934524, + "grad_norm": 39.69243762021546, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.1137, + "step": 640 + }, + { + "epoch": 0.2606750711671411, + "grad_norm": 67.44187441259311, + "learning_rate": 1.3354166666666668e-05, + "loss": 1.5924, + "step": 641 + }, + { + "epoch": 0.261081740544937, + "grad_norm": 11.470156093591642, + "learning_rate": 1.3375e-05, + "loss": 0.18, + "step": 642 + }, + { + "epoch": 0.2614884099227328, + "grad_norm": 32.79379932574068, + "learning_rate": 1.3395833333333335e-05, + "loss": 0.7547, + "step": 643 + }, + { + "epoch": 0.2618950793005287, + "grad_norm": 23.78413895047669, + "learning_rate": 1.3416666666666666e-05, + "loss": 0.3297, + "step": 644 + }, + { + "epoch": 0.2623017486783245, + "grad_norm": 17.990133887409858, + "learning_rate": 1.3437500000000001e-05, + "loss": 0.3059, + "step": 645 + }, + { + "epoch": 0.2627084180561204, + "grad_norm": 38.336330535895904, + "learning_rate": 1.3458333333333335e-05, + "loss": 1.0298, + "step": 646 + }, + { + "epoch": 0.2631150874339162, + "grad_norm": 20.57656737437666, + "learning_rate": 1.3479166666666668e-05, + "loss": 0.2198, + "step": 647 + }, + { + "epoch": 0.2635217568117121, + "grad_norm": 26.405061484569377, + "learning_rate": 1.3500000000000001e-05, + "loss": 0.4306, + "step": 648 + }, + { + 
"epoch": 0.2639284261895079, + "grad_norm": 66.55079503682545, + "learning_rate": 1.3520833333333334e-05, + "loss": 1.8236, + "step": 649 + }, + { + "epoch": 0.2643350955673038, + "grad_norm": 0.5837177164476234, + "learning_rate": 1.3541666666666668e-05, + "loss": 0.006, + "step": 650 + }, + { + "epoch": 0.26474176494509966, + "grad_norm": 28.474163455665057, + "learning_rate": 1.3562500000000001e-05, + "loss": 0.3254, + "step": 651 + }, + { + "epoch": 0.2651484343228955, + "grad_norm": 34.998829575748964, + "learning_rate": 1.3583333333333334e-05, + "loss": 0.5639, + "step": 652 + }, + { + "epoch": 0.26555510370069135, + "grad_norm": 31.434298015008725, + "learning_rate": 1.3604166666666668e-05, + "loss": 1.3791, + "step": 653 + }, + { + "epoch": 0.2659617730784872, + "grad_norm": 42.594437090215024, + "learning_rate": 1.3625e-05, + "loss": 0.7557, + "step": 654 + }, + { + "epoch": 0.26636844245628305, + "grad_norm": 16.84736626196609, + "learning_rate": 1.3645833333333334e-05, + "loss": 0.1232, + "step": 655 + }, + { + "epoch": 0.26677511183407887, + "grad_norm": 29.455172828798794, + "learning_rate": 1.3666666666666667e-05, + "loss": 0.911, + "step": 656 + }, + { + "epoch": 0.26718178121187475, + "grad_norm": 23.08083827024407, + "learning_rate": 1.36875e-05, + "loss": 0.3567, + "step": 657 + }, + { + "epoch": 0.2675884505896706, + "grad_norm": 30.974542809675906, + "learning_rate": 1.3708333333333334e-05, + "loss": 1.2048, + "step": 658 + }, + { + "epoch": 0.26799511996746644, + "grad_norm": 45.93023628365112, + "learning_rate": 1.3729166666666667e-05, + "loss": 0.3501, + "step": 659 + }, + { + "epoch": 0.2684017893452623, + "grad_norm": 9.624753255282295, + "learning_rate": 1.375e-05, + "loss": 0.2336, + "step": 660 + }, + { + "epoch": 0.26880845872305814, + "grad_norm": 23.029834422246154, + "learning_rate": 1.3770833333333335e-05, + "loss": 1.0576, + "step": 661 + }, + { + "epoch": 0.269215128100854, + "grad_norm": 24.33857802827044, + "learning_rate": 1.3791666666666667e-05, + "loss": 0.4346, + "step": 662 + }, + { + "epoch": 0.26962179747864984, + "grad_norm": 34.91296560622003, + "learning_rate": 1.3812500000000002e-05, + "loss": 0.8914, + "step": 663 + }, + { + "epoch": 0.2700284668564457, + "grad_norm": 8.650083527779078, + "learning_rate": 1.3833333333333334e-05, + "loss": 0.0644, + "step": 664 + }, + { + "epoch": 0.27043513623424154, + "grad_norm": 26.866528103627054, + "learning_rate": 1.3854166666666669e-05, + "loss": 0.7018, + "step": 665 + }, + { + "epoch": 0.2708418056120374, + "grad_norm": 63.56389415200401, + "learning_rate": 1.3875e-05, + "loss": 0.6621, + "step": 666 + }, + { + "epoch": 0.2712484749898333, + "grad_norm": 25.271390367995405, + "learning_rate": 1.3895833333333335e-05, + "loss": 0.5509, + "step": 667 + }, + { + "epoch": 0.2716551443676291, + "grad_norm": 12.862252004525079, + "learning_rate": 1.3916666666666667e-05, + "loss": 0.3454, + "step": 668 + }, + { + "epoch": 0.272061813745425, + "grad_norm": 76.93788204516605, + "learning_rate": 1.3937500000000002e-05, + "loss": 1.3981, + "step": 669 + }, + { + "epoch": 0.2724684831232208, + "grad_norm": 47.75259547020182, + "learning_rate": 1.3958333333333333e-05, + "loss": 0.8707, + "step": 670 + }, + { + "epoch": 0.2728751525010167, + "grad_norm": 17.77935762168726, + "learning_rate": 1.3979166666666668e-05, + "loss": 0.3355, + "step": 671 + }, + { + "epoch": 0.2732818218788125, + "grad_norm": 19.300875858926382, + "learning_rate": 1.4e-05, + "loss": 0.3206, + "step": 672 + }, + { + "epoch": 
0.2736884912566084, + "grad_norm": 19.05079661339959, + "learning_rate": 1.4020833333333335e-05, + "loss": 0.5653, + "step": 673 + }, + { + "epoch": 0.27409516063440426, + "grad_norm": 75.82422874316589, + "learning_rate": 1.4041666666666666e-05, + "loss": 1.2229, + "step": 674 + }, + { + "epoch": 0.2745018300122001, + "grad_norm": 19.426276424616958, + "learning_rate": 1.4062500000000001e-05, + "loss": 0.3822, + "step": 675 + }, + { + "epoch": 0.27490849938999595, + "grad_norm": 21.17054555315456, + "learning_rate": 1.4083333333333336e-05, + "loss": 0.1623, + "step": 676 + }, + { + "epoch": 0.2753151687677918, + "grad_norm": 20.51756990052194, + "learning_rate": 1.4104166666666668e-05, + "loss": 0.2943, + "step": 677 + }, + { + "epoch": 0.27572183814558765, + "grad_norm": 22.185999781971635, + "learning_rate": 1.4125000000000003e-05, + "loss": 0.245, + "step": 678 + }, + { + "epoch": 0.27612850752338347, + "grad_norm": 31.574226204421613, + "learning_rate": 1.4145833333333334e-05, + "loss": 0.5405, + "step": 679 + }, + { + "epoch": 0.27653517690117935, + "grad_norm": 30.511965081091446, + "learning_rate": 1.416666666666667e-05, + "loss": 0.9643, + "step": 680 + }, + { + "epoch": 0.27694184627897517, + "grad_norm": 21.753067301819023, + "learning_rate": 1.4187500000000001e-05, + "loss": 0.4422, + "step": 681 + }, + { + "epoch": 0.27734851565677104, + "grad_norm": 16.90707682053713, + "learning_rate": 1.4208333333333336e-05, + "loss": 0.3132, + "step": 682 + }, + { + "epoch": 0.2777551850345669, + "grad_norm": 39.89482856594132, + "learning_rate": 1.4229166666666668e-05, + "loss": 1.6008, + "step": 683 + }, + { + "epoch": 0.27816185441236274, + "grad_norm": 40.62456328230653, + "learning_rate": 1.425e-05, + "loss": 1.8422, + "step": 684 + }, + { + "epoch": 0.2785685237901586, + "grad_norm": 9.807158022878404, + "learning_rate": 1.4270833333333334e-05, + "loss": 0.1271, + "step": 685 + }, + { + "epoch": 0.27897519316795444, + "grad_norm": 43.11270307668371, + "learning_rate": 1.4291666666666667e-05, + "loss": 0.8961, + "step": 686 + }, + { + "epoch": 0.2793818625457503, + "grad_norm": 36.911755192415484, + "learning_rate": 1.43125e-05, + "loss": 0.884, + "step": 687 + }, + { + "epoch": 0.27978853192354614, + "grad_norm": 39.58916562425996, + "learning_rate": 1.4333333333333334e-05, + "loss": 1.0291, + "step": 688 + }, + { + "epoch": 0.280195201301342, + "grad_norm": 57.28789362696225, + "learning_rate": 1.4354166666666667e-05, + "loss": 0.2941, + "step": 689 + }, + { + "epoch": 0.2806018706791379, + "grad_norm": 20.054689324872623, + "learning_rate": 1.4375e-05, + "loss": 0.2293, + "step": 690 + }, + { + "epoch": 0.2810085400569337, + "grad_norm": 33.46694756622272, + "learning_rate": 1.4395833333333335e-05, + "loss": 0.8085, + "step": 691 + }, + { + "epoch": 0.2814152094347296, + "grad_norm": 35.34157677250144, + "learning_rate": 1.4416666666666667e-05, + "loss": 0.9158, + "step": 692 + }, + { + "epoch": 0.2818218788125254, + "grad_norm": 3.6031336546706165, + "learning_rate": 1.4437500000000002e-05, + "loss": 0.0281, + "step": 693 + }, + { + "epoch": 0.2822285481903213, + "grad_norm": 20.31576451102355, + "learning_rate": 1.4458333333333334e-05, + "loss": 0.8165, + "step": 694 + }, + { + "epoch": 0.2826352175681171, + "grad_norm": 19.5204643893304, + "learning_rate": 1.4479166666666669e-05, + "loss": 0.4852, + "step": 695 + }, + { + "epoch": 0.283041886945913, + "grad_norm": 29.91964957487296, + "learning_rate": 1.45e-05, + "loss": 0.4742, + "step": 696 + }, + { + "epoch": 
0.2834485563237088, + "grad_norm": 15.065056574077131, + "learning_rate": 1.4520833333333335e-05, + "loss": 0.1713, + "step": 697 + }, + { + "epoch": 0.2838552257015047, + "grad_norm": 29.55334525825441, + "learning_rate": 1.4541666666666667e-05, + "loss": 0.8462, + "step": 698 + }, + { + "epoch": 0.28426189507930055, + "grad_norm": 19.62309213646185, + "learning_rate": 1.4562500000000002e-05, + "loss": 0.4289, + "step": 699 + }, + { + "epoch": 0.2846685644570964, + "grad_norm": 25.82491870543747, + "learning_rate": 1.4583333333333333e-05, + "loss": 0.3836, + "step": 700 + }, + { + "epoch": 0.28507523383489225, + "grad_norm": 34.26371183851328, + "learning_rate": 1.4604166666666668e-05, + "loss": 0.6558, + "step": 701 + }, + { + "epoch": 0.28548190321268807, + "grad_norm": 21.84646066862202, + "learning_rate": 1.4625e-05, + "loss": 0.4318, + "step": 702 + }, + { + "epoch": 0.28588857259048395, + "grad_norm": 29.796299652281967, + "learning_rate": 1.4645833333333335e-05, + "loss": 0.3314, + "step": 703 + }, + { + "epoch": 0.28629524196827977, + "grad_norm": 24.973557813694608, + "learning_rate": 1.4666666666666666e-05, + "loss": 0.4548, + "step": 704 + }, + { + "epoch": 0.28670191134607564, + "grad_norm": 26.429077194952395, + "learning_rate": 1.4687500000000001e-05, + "loss": 0.983, + "step": 705 + }, + { + "epoch": 0.28710858072387146, + "grad_norm": 58.528013976480274, + "learning_rate": 1.4708333333333336e-05, + "loss": 0.7745, + "step": 706 + }, + { + "epoch": 0.28751525010166734, + "grad_norm": 39.67220720175149, + "learning_rate": 1.4729166666666668e-05, + "loss": 0.9487, + "step": 707 + }, + { + "epoch": 0.2879219194794632, + "grad_norm": 9.21016619659569, + "learning_rate": 1.4750000000000003e-05, + "loss": 0.1437, + "step": 708 + }, + { + "epoch": 0.28832858885725904, + "grad_norm": 1.5079148699343548, + "learning_rate": 1.4770833333333334e-05, + "loss": 0.0106, + "step": 709 + }, + { + "epoch": 0.2887352582350549, + "grad_norm": 30.529608509783568, + "learning_rate": 1.479166666666667e-05, + "loss": 0.5511, + "step": 710 + }, + { + "epoch": 0.28914192761285074, + "grad_norm": 11.982917430302363, + "learning_rate": 1.4812500000000001e-05, + "loss": 0.1461, + "step": 711 + }, + { + "epoch": 0.2895485969906466, + "grad_norm": 15.877396310298257, + "learning_rate": 1.4833333333333336e-05, + "loss": 0.8682, + "step": 712 + }, + { + "epoch": 0.28995526636844243, + "grad_norm": 15.193640052097196, + "learning_rate": 1.4854166666666667e-05, + "loss": 0.3137, + "step": 713 + }, + { + "epoch": 0.2903619357462383, + "grad_norm": 15.949634844144956, + "learning_rate": 1.4875000000000002e-05, + "loss": 0.149, + "step": 714 + }, + { + "epoch": 0.2907686051240342, + "grad_norm": 58.793574730964764, + "learning_rate": 1.4895833333333334e-05, + "loss": 0.361, + "step": 715 + }, + { + "epoch": 0.29117527450183, + "grad_norm": 28.496548049631183, + "learning_rate": 1.4916666666666669e-05, + "loss": 1.0549, + "step": 716 + }, + { + "epoch": 0.2915819438796259, + "grad_norm": 25.84190251517152, + "learning_rate": 1.49375e-05, + "loss": 0.3179, + "step": 717 + }, + { + "epoch": 0.2919886132574217, + "grad_norm": 32.98390925126924, + "learning_rate": 1.4958333333333336e-05, + "loss": 0.9969, + "step": 718 + }, + { + "epoch": 0.2923952826352176, + "grad_norm": 16.314101829894483, + "learning_rate": 1.4979166666666667e-05, + "loss": 0.3616, + "step": 719 + }, + { + "epoch": 0.2928019520130134, + "grad_norm": 35.77139334952156, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.8743, + "step": 720 + 
}, + { + "epoch": 0.2932086213908093, + "grad_norm": 50.12386869687257, + "learning_rate": 1.5020833333333335e-05, + "loss": 1.6486, + "step": 721 + }, + { + "epoch": 0.2936152907686051, + "grad_norm": 28.386066051836075, + "learning_rate": 1.5041666666666667e-05, + "loss": 0.8967, + "step": 722 + }, + { + "epoch": 0.294021960146401, + "grad_norm": 37.429875391880884, + "learning_rate": 1.5062500000000002e-05, + "loss": 0.7662, + "step": 723 + }, + { + "epoch": 0.29442862952419685, + "grad_norm": 43.51537597283654, + "learning_rate": 1.5083333333333333e-05, + "loss": 1.2163, + "step": 724 + }, + { + "epoch": 0.29483529890199267, + "grad_norm": 72.15367828815228, + "learning_rate": 1.5104166666666668e-05, + "loss": 0.351, + "step": 725 + }, + { + "epoch": 0.29524196827978855, + "grad_norm": 59.29543042339444, + "learning_rate": 1.5125e-05, + "loss": 1.5614, + "step": 726 + }, + { + "epoch": 0.29564863765758437, + "grad_norm": 49.11128756919122, + "learning_rate": 1.5145833333333335e-05, + "loss": 1.312, + "step": 727 + }, + { + "epoch": 0.29605530703538024, + "grad_norm": 8.621752588464973, + "learning_rate": 1.5166666666666667e-05, + "loss": 0.096, + "step": 728 + }, + { + "epoch": 0.29646197641317606, + "grad_norm": 62.54791477700888, + "learning_rate": 1.5187500000000002e-05, + "loss": 0.6915, + "step": 729 + }, + { + "epoch": 0.29686864579097194, + "grad_norm": 18.377680071142947, + "learning_rate": 1.5208333333333333e-05, + "loss": 0.7056, + "step": 730 + }, + { + "epoch": 0.2972753151687678, + "grad_norm": 30.673828445259684, + "learning_rate": 1.5229166666666668e-05, + "loss": 0.9273, + "step": 731 + }, + { + "epoch": 0.29768198454656364, + "grad_norm": 7.352354103754619, + "learning_rate": 1.525e-05, + "loss": 0.0342, + "step": 732 + }, + { + "epoch": 0.2980886539243595, + "grad_norm": 20.55573648009311, + "learning_rate": 1.5270833333333336e-05, + "loss": 0.7127, + "step": 733 + }, + { + "epoch": 0.29849532330215534, + "grad_norm": 12.054731622885267, + "learning_rate": 1.5291666666666668e-05, + "loss": 0.189, + "step": 734 + }, + { + "epoch": 0.2989019926799512, + "grad_norm": 31.84893823291875, + "learning_rate": 1.5312500000000003e-05, + "loss": 1.1273, + "step": 735 + }, + { + "epoch": 0.29930866205774703, + "grad_norm": 19.515210081023607, + "learning_rate": 1.5333333333333334e-05, + "loss": 0.5691, + "step": 736 + }, + { + "epoch": 0.2997153314355429, + "grad_norm": 38.53500465328445, + "learning_rate": 1.535416666666667e-05, + "loss": 1.0477, + "step": 737 + }, + { + "epoch": 0.30012200081333873, + "grad_norm": 20.622638176662523, + "learning_rate": 1.5375e-05, + "loss": 0.3925, + "step": 738 + }, + { + "epoch": 0.3005286701911346, + "grad_norm": 20.812203476593435, + "learning_rate": 1.5395833333333333e-05, + "loss": 0.3637, + "step": 739 + }, + { + "epoch": 0.3009353395689305, + "grad_norm": 19.244019971318508, + "learning_rate": 1.5416666666666668e-05, + "loss": 0.6349, + "step": 740 + }, + { + "epoch": 0.3013420089467263, + "grad_norm": 5.757947484412791, + "learning_rate": 1.54375e-05, + "loss": 0.074, + "step": 741 + }, + { + "epoch": 0.3017486783245222, + "grad_norm": 74.85505971013986, + "learning_rate": 1.5458333333333334e-05, + "loss": 0.8284, + "step": 742 + }, + { + "epoch": 0.302155347702318, + "grad_norm": 24.207728962894773, + "learning_rate": 1.5479166666666666e-05, + "loss": 0.8045, + "step": 743 + }, + { + "epoch": 0.3025620170801139, + "grad_norm": 62.941333181920854, + "learning_rate": 1.55e-05, + "loss": 1.1529, + "step": 744 + }, + { + "epoch": 
0.3029686864579097, + "grad_norm": 16.233122274292608, + "learning_rate": 1.5520833333333332e-05, + "loss": 0.1952, + "step": 745 + }, + { + "epoch": 0.3033753558357056, + "grad_norm": 25.032281867587397, + "learning_rate": 1.5541666666666667e-05, + "loss": 0.6604, + "step": 746 + }, + { + "epoch": 0.30378202521350145, + "grad_norm": 19.29550845890531, + "learning_rate": 1.55625e-05, + "loss": 0.3545, + "step": 747 + }, + { + "epoch": 0.30418869459129727, + "grad_norm": 7.7522429231477625, + "learning_rate": 1.5583333333333334e-05, + "loss": 0.0913, + "step": 748 + }, + { + "epoch": 0.30459536396909315, + "grad_norm": 44.038748371003116, + "learning_rate": 1.5604166666666665e-05, + "loss": 0.9601, + "step": 749 + }, + { + "epoch": 0.30500203334688897, + "grad_norm": 36.331836416101275, + "learning_rate": 1.5625e-05, + "loss": 1.9224, + "step": 750 + }, + { + "epoch": 0.30540870272468484, + "grad_norm": 8.687415803957558, + "learning_rate": 1.5645833333333335e-05, + "loss": 0.069, + "step": 751 + }, + { + "epoch": 0.30581537210248066, + "grad_norm": 1.6265715765543078, + "learning_rate": 1.5666666666666667e-05, + "loss": 0.0175, + "step": 752 + }, + { + "epoch": 0.30622204148027654, + "grad_norm": 24.234848276395397, + "learning_rate": 1.5687500000000002e-05, + "loss": 0.5433, + "step": 753 + }, + { + "epoch": 0.30662871085807236, + "grad_norm": 44.0703043022209, + "learning_rate": 1.5708333333333333e-05, + "loss": 0.879, + "step": 754 + }, + { + "epoch": 0.30703538023586824, + "grad_norm": 23.85805349150188, + "learning_rate": 1.572916666666667e-05, + "loss": 0.7213, + "step": 755 + }, + { + "epoch": 0.3074420496136641, + "grad_norm": 7.527645440782168, + "learning_rate": 1.575e-05, + "loss": 0.0913, + "step": 756 + }, + { + "epoch": 0.30784871899145994, + "grad_norm": 32.13268227090561, + "learning_rate": 1.5770833333333335e-05, + "loss": 0.3531, + "step": 757 + }, + { + "epoch": 0.3082553883692558, + "grad_norm": 29.77590622973149, + "learning_rate": 1.5791666666666667e-05, + "loss": 0.5616, + "step": 758 + }, + { + "epoch": 0.30866205774705163, + "grad_norm": 33.90138728544883, + "learning_rate": 1.58125e-05, + "loss": 1.232, + "step": 759 + }, + { + "epoch": 0.3090687271248475, + "grad_norm": 19.036712863773424, + "learning_rate": 1.5833333333333333e-05, + "loss": 0.5109, + "step": 760 + }, + { + "epoch": 0.30947539650264333, + "grad_norm": 14.381197538846425, + "learning_rate": 1.5854166666666668e-05, + "loss": 0.3814, + "step": 761 + }, + { + "epoch": 0.3098820658804392, + "grad_norm": 66.08360239591899, + "learning_rate": 1.5875e-05, + "loss": 2.1499, + "step": 762 + }, + { + "epoch": 0.3102887352582351, + "grad_norm": 19.772569002018304, + "learning_rate": 1.5895833333333335e-05, + "loss": 0.4666, + "step": 763 + }, + { + "epoch": 0.3106954046360309, + "grad_norm": 43.01627608477516, + "learning_rate": 1.5916666666666666e-05, + "loss": 0.8954, + "step": 764 + }, + { + "epoch": 0.3111020740138268, + "grad_norm": 29.28028454812334, + "learning_rate": 1.59375e-05, + "loss": 1.2867, + "step": 765 + }, + { + "epoch": 0.3115087433916226, + "grad_norm": 32.95858916312733, + "learning_rate": 1.5958333333333336e-05, + "loss": 0.9579, + "step": 766 + }, + { + "epoch": 0.3119154127694185, + "grad_norm": 40.04396403181655, + "learning_rate": 1.5979166666666668e-05, + "loss": 1.5277, + "step": 767 + }, + { + "epoch": 0.3123220821472143, + "grad_norm": 15.045933901490667, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.2479, + "step": 768 + }, + { + "epoch": 0.3127287515250102, + 
"grad_norm": 12.706084070810501, + "learning_rate": 1.6020833333333334e-05, + "loss": 0.1032, + "step": 769 + }, + { + "epoch": 0.313135420902806, + "grad_norm": 39.57199687052436, + "learning_rate": 1.604166666666667e-05, + "loss": 0.3035, + "step": 770 + }, + { + "epoch": 0.31354209028060187, + "grad_norm": 12.734582612548085, + "learning_rate": 1.60625e-05, + "loss": 0.1603, + "step": 771 + }, + { + "epoch": 0.31394875965839775, + "grad_norm": 28.011572432624604, + "learning_rate": 1.6083333333333336e-05, + "loss": 0.2779, + "step": 772 + }, + { + "epoch": 0.31435542903619357, + "grad_norm": 17.633524747440298, + "learning_rate": 1.6104166666666667e-05, + "loss": 0.2853, + "step": 773 + }, + { + "epoch": 0.31476209841398944, + "grad_norm": 23.921407790950568, + "learning_rate": 1.6125000000000002e-05, + "loss": 0.2764, + "step": 774 + }, + { + "epoch": 0.31516876779178526, + "grad_norm": 49.85858218871157, + "learning_rate": 1.6145833333333334e-05, + "loss": 1.1821, + "step": 775 + }, + { + "epoch": 0.31557543716958114, + "grad_norm": 13.847263959256473, + "learning_rate": 1.616666666666667e-05, + "loss": 0.1423, + "step": 776 + }, + { + "epoch": 0.31598210654737696, + "grad_norm": 23.147276580572726, + "learning_rate": 1.61875e-05, + "loss": 0.4817, + "step": 777 + }, + { + "epoch": 0.31638877592517284, + "grad_norm": 26.174499288339383, + "learning_rate": 1.6208333333333335e-05, + "loss": 0.9725, + "step": 778 + }, + { + "epoch": 0.31679544530296866, + "grad_norm": 142.78846019524548, + "learning_rate": 1.6229166666666667e-05, + "loss": 0.847, + "step": 779 + }, + { + "epoch": 0.31720211468076454, + "grad_norm": 21.04331652791357, + "learning_rate": 1.6250000000000002e-05, + "loss": 0.4813, + "step": 780 + }, + { + "epoch": 0.3176087840585604, + "grad_norm": 11.783130255394324, + "learning_rate": 1.6270833333333337e-05, + "loss": 0.2646, + "step": 781 + }, + { + "epoch": 0.31801545343635623, + "grad_norm": 8.490395360477418, + "learning_rate": 1.629166666666667e-05, + "loss": 0.0547, + "step": 782 + }, + { + "epoch": 0.3184221228141521, + "grad_norm": 56.25464741899556, + "learning_rate": 1.6312500000000003e-05, + "loss": 1.2, + "step": 783 + }, + { + "epoch": 0.31882879219194793, + "grad_norm": 16.521119615294012, + "learning_rate": 1.6333333333333335e-05, + "loss": 0.3312, + "step": 784 + }, + { + "epoch": 0.3192354615697438, + "grad_norm": 8.655646457884904, + "learning_rate": 1.635416666666667e-05, + "loss": 0.0582, + "step": 785 + }, + { + "epoch": 0.3196421309475396, + "grad_norm": 21.28313548175431, + "learning_rate": 1.6375e-05, + "loss": 0.3397, + "step": 786 + }, + { + "epoch": 0.3200488003253355, + "grad_norm": 36.78322269413047, + "learning_rate": 1.6395833333333337e-05, + "loss": 1.1305, + "step": 787 + }, + { + "epoch": 0.3204554697031314, + "grad_norm": 1.2917952199751346, + "learning_rate": 1.6416666666666668e-05, + "loss": 0.0088, + "step": 788 + }, + { + "epoch": 0.3208621390809272, + "grad_norm": 57.30785613865568, + "learning_rate": 1.6437500000000003e-05, + "loss": 0.9587, + "step": 789 + }, + { + "epoch": 0.3212688084587231, + "grad_norm": 15.695497505882264, + "learning_rate": 1.6458333333333335e-05, + "loss": 0.1983, + "step": 790 + }, + { + "epoch": 0.3216754778365189, + "grad_norm": 18.992709249434657, + "learning_rate": 1.647916666666667e-05, + "loss": 0.3383, + "step": 791 + }, + { + "epoch": 0.3220821472143148, + "grad_norm": 117.38027279738883, + "learning_rate": 1.65e-05, + "loss": 1.4404, + "step": 792 + }, + { + "epoch": 0.3224888165921106, + 
"grad_norm": 13.59362523251858, + "learning_rate": 1.6520833333333336e-05, + "loss": 0.2738, + "step": 793 + }, + { + "epoch": 0.32289548596990647, + "grad_norm": 31.691556128504004, + "learning_rate": 1.6541666666666668e-05, + "loss": 0.8934, + "step": 794 + }, + { + "epoch": 0.3233021553477023, + "grad_norm": 45.53989776984679, + "learning_rate": 1.6562500000000003e-05, + "loss": 2.7383, + "step": 795 + }, + { + "epoch": 0.32370882472549817, + "grad_norm": 17.19231111315982, + "learning_rate": 1.6583333333333334e-05, + "loss": 0.4986, + "step": 796 + }, + { + "epoch": 0.32411549410329404, + "grad_norm": 19.622325177161457, + "learning_rate": 1.660416666666667e-05, + "loss": 0.4254, + "step": 797 + }, + { + "epoch": 0.32452216348108986, + "grad_norm": 41.80575839036054, + "learning_rate": 1.6625e-05, + "loss": 1.9313, + "step": 798 + }, + { + "epoch": 0.32492883285888574, + "grad_norm": 17.636432387898786, + "learning_rate": 1.6645833333333336e-05, + "loss": 0.6006, + "step": 799 + }, + { + "epoch": 0.32533550223668156, + "grad_norm": 28.95594577666545, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.7032, + "step": 800 + }, + { + "epoch": 0.32574217161447744, + "grad_norm": 9.625264858496895, + "learning_rate": 1.6687500000000002e-05, + "loss": 0.087, + "step": 801 + }, + { + "epoch": 0.32614884099227326, + "grad_norm": 19.312064443405703, + "learning_rate": 1.6708333333333334e-05, + "loss": 0.4334, + "step": 802 + }, + { + "epoch": 0.32655551037006914, + "grad_norm": 23.77429918217163, + "learning_rate": 1.672916666666667e-05, + "loss": 0.3797, + "step": 803 + }, + { + "epoch": 0.326962179747865, + "grad_norm": 18.03040619536664, + "learning_rate": 1.675e-05, + "loss": 0.5212, + "step": 804 + }, + { + "epoch": 0.32736884912566083, + "grad_norm": 21.33971859249161, + "learning_rate": 1.6770833333333336e-05, + "loss": 0.406, + "step": 805 + }, + { + "epoch": 0.3277755185034567, + "grad_norm": 28.139462298059424, + "learning_rate": 1.6791666666666667e-05, + "loss": 0.5072, + "step": 806 + }, + { + "epoch": 0.32818218788125253, + "grad_norm": 27.225176885694037, + "learning_rate": 1.6812500000000002e-05, + "loss": 0.9873, + "step": 807 + }, + { + "epoch": 0.3285888572590484, + "grad_norm": 51.485707686314434, + "learning_rate": 1.6833333333333334e-05, + "loss": 2.4391, + "step": 808 + }, + { + "epoch": 0.3289955266368442, + "grad_norm": 20.590211739433308, + "learning_rate": 1.685416666666667e-05, + "loss": 0.1926, + "step": 809 + }, + { + "epoch": 0.3294021960146401, + "grad_norm": 20.22629545400362, + "learning_rate": 1.6875e-05, + "loss": 0.5057, + "step": 810 + }, + { + "epoch": 0.3298088653924359, + "grad_norm": 4.481355579763829, + "learning_rate": 1.6895833333333335e-05, + "loss": 0.0534, + "step": 811 + }, + { + "epoch": 0.3302155347702318, + "grad_norm": 4.4388276653726635, + "learning_rate": 1.6916666666666667e-05, + "loss": 0.0855, + "step": 812 + }, + { + "epoch": 0.3306222041480277, + "grad_norm": 39.592861148624266, + "learning_rate": 1.6937500000000002e-05, + "loss": 0.7964, + "step": 813 + }, + { + "epoch": 0.3310288735258235, + "grad_norm": 121.67367747152204, + "learning_rate": 1.6958333333333333e-05, + "loss": 0.2783, + "step": 814 + }, + { + "epoch": 0.3314355429036194, + "grad_norm": 36.66395716405141, + "learning_rate": 1.6979166666666668e-05, + "loss": 1.5581, + "step": 815 + }, + { + "epoch": 0.3318422122814152, + "grad_norm": 21.76692427872134, + "learning_rate": 1.7e-05, + "loss": 0.3355, + "step": 816 + }, + { + "epoch": 0.33224888165921107, + "grad_norm": 
17.057585022214013, + "learning_rate": 1.7020833333333335e-05, + "loss": 0.4349, + "step": 817 + }, + { + "epoch": 0.3326555510370069, + "grad_norm": 12.156199713897308, + "learning_rate": 1.7041666666666666e-05, + "loss": 0.3298, + "step": 818 + }, + { + "epoch": 0.33306222041480277, + "grad_norm": 6.785459087064286, + "learning_rate": 1.70625e-05, + "loss": 0.0694, + "step": 819 + }, + { + "epoch": 0.33346888979259864, + "grad_norm": 47.15388795025716, + "learning_rate": 1.7083333333333333e-05, + "loss": 1.4489, + "step": 820 + }, + { + "epoch": 0.33387555917039446, + "grad_norm": 15.92150967554841, + "learning_rate": 1.7104166666666668e-05, + "loss": 0.2719, + "step": 821 + }, + { + "epoch": 0.33428222854819034, + "grad_norm": 8.61611465676801, + "learning_rate": 1.7125e-05, + "loss": 0.2258, + "step": 822 + }, + { + "epoch": 0.33468889792598616, + "grad_norm": 6.841059784565834, + "learning_rate": 1.7145833333333334e-05, + "loss": 0.1869, + "step": 823 + }, + { + "epoch": 0.33509556730378204, + "grad_norm": 12.683424483808665, + "learning_rate": 1.7166666666666666e-05, + "loss": 0.1308, + "step": 824 + }, + { + "epoch": 0.33550223668157786, + "grad_norm": 27.0360110174918, + "learning_rate": 1.71875e-05, + "loss": 0.3335, + "step": 825 + }, + { + "epoch": 0.33590890605937374, + "grad_norm": 11.50789116750323, + "learning_rate": 1.7208333333333336e-05, + "loss": 0.1394, + "step": 826 + }, + { + "epoch": 0.33631557543716956, + "grad_norm": 23.53624448943247, + "learning_rate": 1.7229166666666668e-05, + "loss": 0.7743, + "step": 827 + }, + { + "epoch": 0.33672224481496543, + "grad_norm": 56.32459790149605, + "learning_rate": 1.7250000000000003e-05, + "loss": 1.7822, + "step": 828 + }, + { + "epoch": 0.3371289141927613, + "grad_norm": 9.267247291082466, + "learning_rate": 1.7270833333333334e-05, + "loss": 0.1847, + "step": 829 + }, + { + "epoch": 0.33753558357055713, + "grad_norm": 16.696161253664418, + "learning_rate": 1.729166666666667e-05, + "loss": 0.3645, + "step": 830 + }, + { + "epoch": 0.337942252948353, + "grad_norm": 140.99899452946684, + "learning_rate": 1.73125e-05, + "loss": 0.6508, + "step": 831 + }, + { + "epoch": 0.3383489223261488, + "grad_norm": 55.54084949969597, + "learning_rate": 1.7333333333333336e-05, + "loss": 1.6041, + "step": 832 + }, + { + "epoch": 0.3387555917039447, + "grad_norm": 143.93219045883146, + "learning_rate": 1.7354166666666667e-05, + "loss": 1.5666, + "step": 833 + }, + { + "epoch": 0.3391622610817405, + "grad_norm": 36.246783402389106, + "learning_rate": 1.7375000000000002e-05, + "loss": 0.6739, + "step": 834 + }, + { + "epoch": 0.3395689304595364, + "grad_norm": 23.575006254391027, + "learning_rate": 1.7395833333333334e-05, + "loss": 0.9791, + "step": 835 + }, + { + "epoch": 0.3399755998373323, + "grad_norm": 11.346839713233745, + "learning_rate": 1.741666666666667e-05, + "loss": 0.1794, + "step": 836 + }, + { + "epoch": 0.3403822692151281, + "grad_norm": 33.20896937466378, + "learning_rate": 1.74375e-05, + "loss": 1.0828, + "step": 837 + }, + { + "epoch": 0.340788938592924, + "grad_norm": 16.002026807665914, + "learning_rate": 1.7458333333333335e-05, + "loss": 0.2296, + "step": 838 + }, + { + "epoch": 0.3411956079707198, + "grad_norm": 10.046417529816734, + "learning_rate": 1.7479166666666667e-05, + "loss": 0.2531, + "step": 839 + }, + { + "epoch": 0.34160227734851567, + "grad_norm": 27.49872896605833, + "learning_rate": 1.7500000000000002e-05, + "loss": 1.1563, + "step": 840 + }, + { + "epoch": 0.3420089467263115, + "grad_norm": 
12.68362718017644, + "learning_rate": 1.7520833333333337e-05, + "loss": 0.1682, + "step": 841 + }, + { + "epoch": 0.34241561610410737, + "grad_norm": 17.299469478308634, + "learning_rate": 1.754166666666667e-05, + "loss": 0.5194, + "step": 842 + }, + { + "epoch": 0.3428222854819032, + "grad_norm": 88.39886539648974, + "learning_rate": 1.7562500000000003e-05, + "loss": 1.2372, + "step": 843 + }, + { + "epoch": 0.34322895485969906, + "grad_norm": 13.844199374441617, + "learning_rate": 1.7583333333333335e-05, + "loss": 0.2641, + "step": 844 + }, + { + "epoch": 0.34363562423749494, + "grad_norm": 24.203069499969878, + "learning_rate": 1.760416666666667e-05, + "loss": 0.4894, + "step": 845 + }, + { + "epoch": 0.34404229361529076, + "grad_norm": 21.54882207362314, + "learning_rate": 1.7625e-05, + "loss": 0.2323, + "step": 846 + }, + { + "epoch": 0.34444896299308664, + "grad_norm": 36.85650044989957, + "learning_rate": 1.7645833333333336e-05, + "loss": 0.7388, + "step": 847 + }, + { + "epoch": 0.34485563237088246, + "grad_norm": 18.77508096043745, + "learning_rate": 1.7666666666666668e-05, + "loss": 0.2014, + "step": 848 + }, + { + "epoch": 0.34526230174867834, + "grad_norm": 12.59636103928448, + "learning_rate": 1.7687500000000003e-05, + "loss": 0.3002, + "step": 849 + }, + { + "epoch": 0.34566897112647416, + "grad_norm": 37.22743010515622, + "learning_rate": 1.7708333333333335e-05, + "loss": 1.7174, + "step": 850 + }, + { + "epoch": 0.34607564050427003, + "grad_norm": 14.317511112766734, + "learning_rate": 1.772916666666667e-05, + "loss": 0.4019, + "step": 851 + }, + { + "epoch": 0.34648230988206585, + "grad_norm": 57.878113475134896, + "learning_rate": 1.775e-05, + "loss": 2.426, + "step": 852 + }, + { + "epoch": 0.34688897925986173, + "grad_norm": 19.11261869857422, + "learning_rate": 1.7770833333333336e-05, + "loss": 0.4111, + "step": 853 + }, + { + "epoch": 0.3472956486376576, + "grad_norm": 23.348474913849824, + "learning_rate": 1.7791666666666668e-05, + "loss": 1.2167, + "step": 854 + }, + { + "epoch": 0.3477023180154534, + "grad_norm": 29.00282010713089, + "learning_rate": 1.7812500000000003e-05, + "loss": 0.7188, + "step": 855 + }, + { + "epoch": 0.3481089873932493, + "grad_norm": 73.59602459350653, + "learning_rate": 1.7833333333333334e-05, + "loss": 1.0761, + "step": 856 + }, + { + "epoch": 0.3485156567710451, + "grad_norm": 40.696183959687545, + "learning_rate": 1.785416666666667e-05, + "loss": 0.7594, + "step": 857 + }, + { + "epoch": 0.348922326148841, + "grad_norm": 22.394379260364005, + "learning_rate": 1.7875e-05, + "loss": 0.9656, + "step": 858 + }, + { + "epoch": 0.3493289955266368, + "grad_norm": 37.07911508759816, + "learning_rate": 1.7895833333333336e-05, + "loss": 0.6012, + "step": 859 + }, + { + "epoch": 0.3497356649044327, + "grad_norm": 42.10588637659006, + "learning_rate": 1.7916666666666667e-05, + "loss": 1.9944, + "step": 860 + }, + { + "epoch": 0.3501423342822286, + "grad_norm": 19.30968120536164, + "learning_rate": 1.7937500000000002e-05, + "loss": 0.6896, + "step": 861 + }, + { + "epoch": 0.3505490036600244, + "grad_norm": 11.823812395430705, + "learning_rate": 1.7958333333333334e-05, + "loss": 0.1673, + "step": 862 + }, + { + "epoch": 0.35095567303782027, + "grad_norm": 30.123551670255623, + "learning_rate": 1.797916666666667e-05, + "loss": 1.0428, + "step": 863 + }, + { + "epoch": 0.3513623424156161, + "grad_norm": 22.93768031259912, + "learning_rate": 1.8e-05, + "loss": 1.1523, + "step": 864 + }, + { + "epoch": 0.35176901179341197, + "grad_norm": 
20.78210258939857, + "learning_rate": 1.8020833333333335e-05, + "loss": 0.5424, + "step": 865 + }, + { + "epoch": 0.3521756811712078, + "grad_norm": 24.53761843570633, + "learning_rate": 1.8041666666666667e-05, + "loss": 0.9138, + "step": 866 + }, + { + "epoch": 0.35258235054900366, + "grad_norm": 56.767241746059085, + "learning_rate": 1.8062500000000002e-05, + "loss": 0.9155, + "step": 867 + }, + { + "epoch": 0.3529890199267995, + "grad_norm": 23.536267745415216, + "learning_rate": 1.8083333333333334e-05, + "loss": 0.4945, + "step": 868 + }, + { + "epoch": 0.35339568930459536, + "grad_norm": 11.385035311135042, + "learning_rate": 1.810416666666667e-05, + "loss": 0.1873, + "step": 869 + }, + { + "epoch": 0.35380235868239124, + "grad_norm": 35.845726688692984, + "learning_rate": 1.8125e-05, + "loss": 0.7886, + "step": 870 + }, + { + "epoch": 0.35420902806018706, + "grad_norm": 20.674165974755258, + "learning_rate": 1.8145833333333335e-05, + "loss": 0.451, + "step": 871 + }, + { + "epoch": 0.35461569743798294, + "grad_norm": 17.359704352292365, + "learning_rate": 1.8166666666666667e-05, + "loss": 0.3491, + "step": 872 + }, + { + "epoch": 0.35502236681577876, + "grad_norm": 10.51900060428506, + "learning_rate": 1.81875e-05, + "loss": 0.1834, + "step": 873 + }, + { + "epoch": 0.35542903619357463, + "grad_norm": 23.629980799741446, + "learning_rate": 1.8208333333333333e-05, + "loss": 0.7109, + "step": 874 + }, + { + "epoch": 0.35583570557137045, + "grad_norm": 4.781534533978903, + "learning_rate": 1.8229166666666668e-05, + "loss": 0.0493, + "step": 875 + }, + { + "epoch": 0.35624237494916633, + "grad_norm": 10.982613418680417, + "learning_rate": 1.825e-05, + "loss": 0.2061, + "step": 876 + }, + { + "epoch": 0.3566490443269622, + "grad_norm": 8.666672755748532, + "learning_rate": 1.8270833333333335e-05, + "loss": 0.0658, + "step": 877 + }, + { + "epoch": 0.357055713704758, + "grad_norm": 26.555722497108356, + "learning_rate": 1.8291666666666666e-05, + "loss": 1.1526, + "step": 878 + }, + { + "epoch": 0.3574623830825539, + "grad_norm": 34.86108479822837, + "learning_rate": 1.83125e-05, + "loss": 1.5283, + "step": 879 + }, + { + "epoch": 0.3578690524603497, + "grad_norm": 15.39901319898035, + "learning_rate": 1.8333333333333333e-05, + "loss": 0.134, + "step": 880 + }, + { + "epoch": 0.3582757218381456, + "grad_norm": 19.5315612066271, + "learning_rate": 1.8354166666666668e-05, + "loss": 0.2941, + "step": 881 + }, + { + "epoch": 0.3586823912159414, + "grad_norm": 33.57123930328434, + "learning_rate": 1.8375e-05, + "loss": 1.1959, + "step": 882 + }, + { + "epoch": 0.3590890605937373, + "grad_norm": 48.41375342373992, + "learning_rate": 1.8395833333333334e-05, + "loss": 1.2209, + "step": 883 + }, + { + "epoch": 0.3594957299715331, + "grad_norm": 20.081975313890457, + "learning_rate": 1.8416666666666666e-05, + "loss": 0.2882, + "step": 884 + }, + { + "epoch": 0.359902399349329, + "grad_norm": 38.32094702744563, + "learning_rate": 1.84375e-05, + "loss": 0.5561, + "step": 885 + }, + { + "epoch": 0.36030906872712487, + "grad_norm": 40.29026453389809, + "learning_rate": 1.8458333333333336e-05, + "loss": 1.2235, + "step": 886 + }, + { + "epoch": 0.3607157381049207, + "grad_norm": 15.973701239899503, + "learning_rate": 1.8479166666666667e-05, + "loss": 0.7208, + "step": 887 + }, + { + "epoch": 0.36112240748271657, + "grad_norm": 17.027802491372313, + "learning_rate": 1.8500000000000002e-05, + "loss": 0.6163, + "step": 888 + }, + { + "epoch": 0.3615290768605124, + "grad_norm": 9.275032601649295, + 
"learning_rate": 1.8520833333333334e-05, + "loss": 0.1175, + "step": 889 + }, + { + "epoch": 0.36193574623830826, + "grad_norm": 11.475316562368787, + "learning_rate": 1.854166666666667e-05, + "loss": 0.1778, + "step": 890 + }, + { + "epoch": 0.3623424156161041, + "grad_norm": 26.916989970794457, + "learning_rate": 1.85625e-05, + "loss": 0.5478, + "step": 891 + }, + { + "epoch": 0.36274908499389996, + "grad_norm": 29.03659720131749, + "learning_rate": 1.8583333333333336e-05, + "loss": 1.1601, + "step": 892 + }, + { + "epoch": 0.36315575437169584, + "grad_norm": 13.90062247828384, + "learning_rate": 1.8604166666666667e-05, + "loss": 0.3724, + "step": 893 + }, + { + "epoch": 0.36356242374949166, + "grad_norm": 18.674583610989913, + "learning_rate": 1.8625000000000002e-05, + "loss": 0.5021, + "step": 894 + }, + { + "epoch": 0.36396909312728754, + "grad_norm": 18.278833127543493, + "learning_rate": 1.8645833333333334e-05, + "loss": 0.2939, + "step": 895 + }, + { + "epoch": 0.36437576250508336, + "grad_norm": 23.31187027177845, + "learning_rate": 1.866666666666667e-05, + "loss": 0.4425, + "step": 896 + }, + { + "epoch": 0.36478243188287923, + "grad_norm": 10.318965061112635, + "learning_rate": 1.86875e-05, + "loss": 0.1094, + "step": 897 + }, + { + "epoch": 0.36518910126067505, + "grad_norm": 27.54037043515936, + "learning_rate": 1.8708333333333335e-05, + "loss": 0.821, + "step": 898 + }, + { + "epoch": 0.36559577063847093, + "grad_norm": 20.015076988557126, + "learning_rate": 1.8729166666666667e-05, + "loss": 0.5196, + "step": 899 + }, + { + "epoch": 0.36600244001626675, + "grad_norm": 44.12517923835892, + "learning_rate": 1.8750000000000002e-05, + "loss": 1.462, + "step": 900 + }, + { + "epoch": 0.3664091093940626, + "grad_norm": 38.983524292587155, + "learning_rate": 1.8770833333333337e-05, + "loss": 0.239, + "step": 901 + }, + { + "epoch": 0.3668157787718585, + "grad_norm": 32.60802699610339, + "learning_rate": 1.8791666666666668e-05, + "loss": 1.1927, + "step": 902 + }, + { + "epoch": 0.3672224481496543, + "grad_norm": 33.11861943746338, + "learning_rate": 1.8812500000000003e-05, + "loss": 0.9628, + "step": 903 + }, + { + "epoch": 0.3676291175274502, + "grad_norm": 30.80884525853718, + "learning_rate": 1.8833333333333335e-05, + "loss": 0.9874, + "step": 904 + }, + { + "epoch": 0.368035786905246, + "grad_norm": 29.117050129121413, + "learning_rate": 1.885416666666667e-05, + "loss": 1.2299, + "step": 905 + }, + { + "epoch": 0.3684424562830419, + "grad_norm": 21.643105634641948, + "learning_rate": 1.8875e-05, + "loss": 1.015, + "step": 906 + }, + { + "epoch": 0.3688491256608377, + "grad_norm": 8.77609415227581, + "learning_rate": 1.8895833333333336e-05, + "loss": 0.0879, + "step": 907 + }, + { + "epoch": 0.3692557950386336, + "grad_norm": 10.727579330942612, + "learning_rate": 1.8916666666666668e-05, + "loss": 0.1765, + "step": 908 + }, + { + "epoch": 0.36966246441642947, + "grad_norm": 5.014966982328815, + "learning_rate": 1.8937500000000003e-05, + "loss": 0.0379, + "step": 909 + }, + { + "epoch": 0.3700691337942253, + "grad_norm": 40.20092952766238, + "learning_rate": 1.8958333333333334e-05, + "loss": 1.2693, + "step": 910 + }, + { + "epoch": 0.37047580317202117, + "grad_norm": 2.104429729977538, + "learning_rate": 1.897916666666667e-05, + "loss": 0.0209, + "step": 911 + }, + { + "epoch": 0.370882472549817, + "grad_norm": 12.014743150869341, + "learning_rate": 1.9e-05, + "loss": 0.1176, + "step": 912 + }, + { + "epoch": 0.37128914192761286, + "grad_norm": 65.6073831944966, + 
"learning_rate": 1.9020833333333336e-05, + "loss": 1.3484, + "step": 913 + }, + { + "epoch": 0.3716958113054087, + "grad_norm": 23.446627898641538, + "learning_rate": 1.9041666666666668e-05, + "loss": 0.4603, + "step": 914 + }, + { + "epoch": 0.37210248068320456, + "grad_norm": 4.50356747594059, + "learning_rate": 1.9062500000000003e-05, + "loss": 0.082, + "step": 915 + }, + { + "epoch": 0.3725091500610004, + "grad_norm": 19.80184722976524, + "learning_rate": 1.9083333333333338e-05, + "loss": 0.4724, + "step": 916 + }, + { + "epoch": 0.37291581943879626, + "grad_norm": 8.160185641827704, + "learning_rate": 1.910416666666667e-05, + "loss": 0.2142, + "step": 917 + }, + { + "epoch": 0.37332248881659214, + "grad_norm": 19.48760600247772, + "learning_rate": 1.9125000000000004e-05, + "loss": 0.5005, + "step": 918 + }, + { + "epoch": 0.37372915819438796, + "grad_norm": 15.506664622429431, + "learning_rate": 1.9145833333333336e-05, + "loss": 0.2152, + "step": 919 + }, + { + "epoch": 0.37413582757218383, + "grad_norm": 11.39601404692353, + "learning_rate": 1.916666666666667e-05, + "loss": 0.108, + "step": 920 + }, + { + "epoch": 0.37454249694997965, + "grad_norm": 79.11517963854385, + "learning_rate": 1.9187500000000002e-05, + "loss": 0.6314, + "step": 921 + }, + { + "epoch": 0.37494916632777553, + "grad_norm": 1.3633339158827367, + "learning_rate": 1.9208333333333337e-05, + "loss": 0.0172, + "step": 922 + }, + { + "epoch": 0.37535583570557135, + "grad_norm": 48.996244871144754, + "learning_rate": 1.922916666666667e-05, + "loss": 1.9077, + "step": 923 + }, + { + "epoch": 0.3757625050833672, + "grad_norm": 45.73333394059348, + "learning_rate": 1.925e-05, + "loss": 1.0706, + "step": 924 + }, + { + "epoch": 0.37616917446116305, + "grad_norm": 25.217798236343842, + "learning_rate": 1.9270833333333335e-05, + "loss": 0.5198, + "step": 925 + }, + { + "epoch": 0.3765758438389589, + "grad_norm": 15.098428587571377, + "learning_rate": 1.9291666666666667e-05, + "loss": 0.368, + "step": 926 + }, + { + "epoch": 0.3769825132167548, + "grad_norm": 8.094867110427238, + "learning_rate": 1.9312500000000002e-05, + "loss": 0.1984, + "step": 927 + }, + { + "epoch": 0.3773891825945506, + "grad_norm": 19.05512988557243, + "learning_rate": 1.9333333333333333e-05, + "loss": 0.3633, + "step": 928 + }, + { + "epoch": 0.3777958519723465, + "grad_norm": 4.576289114161826, + "learning_rate": 1.935416666666667e-05, + "loss": 0.0355, + "step": 929 + }, + { + "epoch": 0.3782025213501423, + "grad_norm": 31.760798584751598, + "learning_rate": 1.9375e-05, + "loss": 1.3844, + "step": 930 + }, + { + "epoch": 0.3786091907279382, + "grad_norm": 160.54220008188074, + "learning_rate": 1.9395833333333335e-05, + "loss": 1.7659, + "step": 931 + }, + { + "epoch": 0.379015860105734, + "grad_norm": 23.299162514121175, + "learning_rate": 1.9416666666666667e-05, + "loss": 0.7987, + "step": 932 + }, + { + "epoch": 0.3794225294835299, + "grad_norm": 45.949434487600826, + "learning_rate": 1.94375e-05, + "loss": 1.1733, + "step": 933 + }, + { + "epoch": 0.37982919886132577, + "grad_norm": 30.749054030176833, + "learning_rate": 1.9458333333333333e-05, + "loss": 0.8106, + "step": 934 + }, + { + "epoch": 0.3802358682391216, + "grad_norm": 29.089364021866693, + "learning_rate": 1.9479166666666668e-05, + "loss": 0.4439, + "step": 935 + }, + { + "epoch": 0.38064253761691746, + "grad_norm": 17.911627817382747, + "learning_rate": 1.95e-05, + "loss": 0.4455, + "step": 936 + }, + { + "epoch": 0.3810492069947133, + "grad_norm": 13.965763473701834, + 
"learning_rate": 1.9520833333333335e-05, + "loss": 0.2523, + "step": 937 + }, + { + "epoch": 0.38145587637250916, + "grad_norm": 13.823091632110046, + "learning_rate": 1.9541666666666666e-05, + "loss": 0.3562, + "step": 938 + }, + { + "epoch": 0.381862545750305, + "grad_norm": 86.46906543090991, + "learning_rate": 1.95625e-05, + "loss": 0.4013, + "step": 939 + }, + { + "epoch": 0.38226921512810086, + "grad_norm": 18.443542676392696, + "learning_rate": 1.9583333333333333e-05, + "loss": 0.8542, + "step": 940 + }, + { + "epoch": 0.3826758845058967, + "grad_norm": 50.40939154475388, + "learning_rate": 1.9604166666666668e-05, + "loss": 2.1517, + "step": 941 + }, + { + "epoch": 0.38308255388369256, + "grad_norm": 24.238279030497754, + "learning_rate": 1.9625e-05, + "loss": 0.6027, + "step": 942 + }, + { + "epoch": 0.38348922326148843, + "grad_norm": 11.239859582285046, + "learning_rate": 1.9645833333333334e-05, + "loss": 0.1681, + "step": 943 + }, + { + "epoch": 0.38389589263928425, + "grad_norm": 8.092107220334439, + "learning_rate": 1.9666666666666666e-05, + "loss": 0.1266, + "step": 944 + }, + { + "epoch": 0.38430256201708013, + "grad_norm": 32.46607231814565, + "learning_rate": 1.96875e-05, + "loss": 0.6589, + "step": 945 + }, + { + "epoch": 0.38470923139487595, + "grad_norm": 17.798805541071594, + "learning_rate": 1.9708333333333336e-05, + "loss": 0.325, + "step": 946 + }, + { + "epoch": 0.3851159007726718, + "grad_norm": 19.85083320728467, + "learning_rate": 1.9729166666666667e-05, + "loss": 0.5353, + "step": 947 + }, + { + "epoch": 0.38552257015046765, + "grad_norm": 36.910585326564025, + "learning_rate": 1.9750000000000002e-05, + "loss": 1.8061, + "step": 948 + }, + { + "epoch": 0.3859292395282635, + "grad_norm": 51.778332046464584, + "learning_rate": 1.9770833333333334e-05, + "loss": 0.9497, + "step": 949 + }, + { + "epoch": 0.3863359089060594, + "grad_norm": 34.434969106763226, + "learning_rate": 1.979166666666667e-05, + "loss": 0.9589, + "step": 950 + }, + { + "epoch": 0.3867425782838552, + "grad_norm": 23.914293418300275, + "learning_rate": 1.98125e-05, + "loss": 0.5023, + "step": 951 + }, + { + "epoch": 0.3871492476616511, + "grad_norm": 17.40565268250999, + "learning_rate": 1.9833333333333335e-05, + "loss": 0.3562, + "step": 952 + }, + { + "epoch": 0.3875559170394469, + "grad_norm": 15.975218194259087, + "learning_rate": 1.9854166666666667e-05, + "loss": 0.2939, + "step": 953 + }, + { + "epoch": 0.3879625864172428, + "grad_norm": 38.78928243799464, + "learning_rate": 1.9875000000000002e-05, + "loss": 0.7557, + "step": 954 + }, + { + "epoch": 0.3883692557950386, + "grad_norm": 16.402502981268352, + "learning_rate": 1.9895833333333334e-05, + "loss": 0.4861, + "step": 955 + }, + { + "epoch": 0.3887759251728345, + "grad_norm": 31.103661765169957, + "learning_rate": 1.991666666666667e-05, + "loss": 1.1684, + "step": 956 + }, + { + "epoch": 0.3891825945506303, + "grad_norm": 24.288176702001795, + "learning_rate": 1.99375e-05, + "loss": 0.4304, + "step": 957 + }, + { + "epoch": 0.3895892639284262, + "grad_norm": 14.043056185878573, + "learning_rate": 1.9958333333333335e-05, + "loss": 0.3208, + "step": 958 + }, + { + "epoch": 0.38999593330622206, + "grad_norm": 42.8455135412387, + "learning_rate": 1.9979166666666667e-05, + "loss": 1.5578, + "step": 959 + }, + { + "epoch": 0.3904026026840179, + "grad_norm": 49.91466778120852, + "learning_rate": 2e-05, + "loss": 1.9492, + "step": 960 + }, + { + "epoch": 0.39080927206181376, + "grad_norm": 17.763950327472852, + "learning_rate": 
1.9999999948672485e-05, + "loss": 0.3808, + "step": 961 + }, + { + "epoch": 0.3912159414396096, + "grad_norm": 6.517087159378999, + "learning_rate": 1.9999999794689935e-05, + "loss": 0.2035, + "step": 962 + }, + { + "epoch": 0.39162261081740546, + "grad_norm": 50.97603916323433, + "learning_rate": 1.9999999538052356e-05, + "loss": 1.3043, + "step": 963 + }, + { + "epoch": 0.3920292801952013, + "grad_norm": 13.921442521781435, + "learning_rate": 1.999999917875975e-05, + "loss": 0.1971, + "step": 964 + }, + { + "epoch": 0.39243594957299716, + "grad_norm": 5.542427913009786, + "learning_rate": 1.9999998716812117e-05, + "loss": 0.056, + "step": 965 + }, + { + "epoch": 0.39284261895079303, + "grad_norm": 9.53540682526906, + "learning_rate": 1.9999998152209462e-05, + "loss": 0.1258, + "step": 966 + }, + { + "epoch": 0.39324928832858885, + "grad_norm": 31.954930464794707, + "learning_rate": 1.9999997484951797e-05, + "loss": 1.9525, + "step": 967 + }, + { + "epoch": 0.39365595770638473, + "grad_norm": 16.314977630364037, + "learning_rate": 1.9999996715039126e-05, + "loss": 0.4836, + "step": 968 + }, + { + "epoch": 0.39406262708418055, + "grad_norm": 3.050686541004119, + "learning_rate": 1.9999995842471453e-05, + "loss": 0.041, + "step": 969 + }, + { + "epoch": 0.3944692964619764, + "grad_norm": 31.29891567545294, + "learning_rate": 1.999999486724879e-05, + "loss": 0.6053, + "step": 970 + }, + { + "epoch": 0.39487596583977225, + "grad_norm": 17.559819243917385, + "learning_rate": 1.999999378937115e-05, + "loss": 0.514, + "step": 971 + }, + { + "epoch": 0.3952826352175681, + "grad_norm": 0.6127150196146419, + "learning_rate": 1.9999992608838537e-05, + "loss": 0.0089, + "step": 972 + }, + { + "epoch": 0.39568930459536394, + "grad_norm": 10.079290106958343, + "learning_rate": 1.9999991325650968e-05, + "loss": 0.1775, + "step": 973 + }, + { + "epoch": 0.3960959739731598, + "grad_norm": 17.15238987322294, + "learning_rate": 1.9999989939808454e-05, + "loss": 0.2808, + "step": 974 + }, + { + "epoch": 0.3965026433509557, + "grad_norm": 9.421210444329333, + "learning_rate": 1.9999988451311015e-05, + "loss": 0.072, + "step": 975 + }, + { + "epoch": 0.3969093127287515, + "grad_norm": 22.165910659800836, + "learning_rate": 1.9999986860158658e-05, + "loss": 0.524, + "step": 976 + }, + { + "epoch": 0.3973159821065474, + "grad_norm": 11.851150913979748, + "learning_rate": 1.9999985166351403e-05, + "loss": 0.322, + "step": 977 + }, + { + "epoch": 0.3977226514843432, + "grad_norm": 33.94897151493128, + "learning_rate": 1.9999983369889268e-05, + "loss": 0.7012, + "step": 978 + }, + { + "epoch": 0.3981293208621391, + "grad_norm": 13.30837393659418, + "learning_rate": 1.9999981470772272e-05, + "loss": 0.1871, + "step": 979 + }, + { + "epoch": 0.3985359902399349, + "grad_norm": 6.631281431538193, + "learning_rate": 1.999997946900043e-05, + "loss": 0.2947, + "step": 980 + }, + { + "epoch": 0.3989426596177308, + "grad_norm": 22.595654880849064, + "learning_rate": 1.9999977364573767e-05, + "loss": 0.3056, + "step": 981 + }, + { + "epoch": 0.39934932899552666, + "grad_norm": 32.39860259929984, + "learning_rate": 1.9999975157492305e-05, + "loss": 0.6006, + "step": 982 + }, + { + "epoch": 0.3997559983733225, + "grad_norm": 34.75449310043937, + "learning_rate": 1.9999972847756064e-05, + "loss": 0.3459, + "step": 983 + }, + { + "epoch": 0.40016266775111836, + "grad_norm": 31.9093030037573, + "learning_rate": 1.999997043536507e-05, + "loss": 1.608, + "step": 984 + }, + { + "epoch": 0.4005693371289142, + "grad_norm": 
28.047925394413365, + "learning_rate": 1.9999967920319347e-05, + "loss": 0.8924, + "step": 985 + }, + { + "epoch": 0.40097600650671006, + "grad_norm": 38.31149163249332, + "learning_rate": 1.9999965302618918e-05, + "loss": 0.789, + "step": 986 + }, + { + "epoch": 0.4013826758845059, + "grad_norm": 17.840994721134958, + "learning_rate": 1.9999962582263815e-05, + "loss": 0.4938, + "step": 987 + }, + { + "epoch": 0.40178934526230176, + "grad_norm": 77.43763941605266, + "learning_rate": 1.9999959759254062e-05, + "loss": 0.4017, + "step": 988 + }, + { + "epoch": 0.4021960146400976, + "grad_norm": 28.126721948786575, + "learning_rate": 1.999995683358969e-05, + "loss": 0.913, + "step": 989 + }, + { + "epoch": 0.40260268401789345, + "grad_norm": 27.177483328380443, + "learning_rate": 1.9999953805270723e-05, + "loss": 0.5096, + "step": 990 + }, + { + "epoch": 0.40300935339568933, + "grad_norm": 40.63421367645364, + "learning_rate": 1.9999950674297205e-05, + "loss": 2.013, + "step": 991 + }, + { + "epoch": 0.40341602277348515, + "grad_norm": 6.405283565138103, + "learning_rate": 1.9999947440669155e-05, + "loss": 0.1017, + "step": 992 + }, + { + "epoch": 0.403822692151281, + "grad_norm": 14.008142952497108, + "learning_rate": 1.9999944104386614e-05, + "loss": 0.1862, + "step": 993 + }, + { + "epoch": 0.40422936152907685, + "grad_norm": 32.979307746015635, + "learning_rate": 1.9999940665449613e-05, + "loss": 0.3462, + "step": 994 + }, + { + "epoch": 0.4046360309068727, + "grad_norm": 35.71602281829013, + "learning_rate": 1.9999937123858188e-05, + "loss": 0.8928, + "step": 995 + }, + { + "epoch": 0.40504270028466854, + "grad_norm": 17.693479959279347, + "learning_rate": 1.9999933479612377e-05, + "loss": 0.2731, + "step": 996 + }, + { + "epoch": 0.4054493696624644, + "grad_norm": 12.727944335069745, + "learning_rate": 1.9999929732712218e-05, + "loss": 0.1428, + "step": 997 + }, + { + "epoch": 0.40585603904026024, + "grad_norm": 12.177014017652665, + "learning_rate": 1.9999925883157743e-05, + "loss": 0.5724, + "step": 998 + }, + { + "epoch": 0.4062627084180561, + "grad_norm": 30.80286118545562, + "learning_rate": 1.9999921930948998e-05, + "loss": 1.4276, + "step": 999 + }, + { + "epoch": 0.406669377795852, + "grad_norm": 11.6473201168249, + "learning_rate": 1.9999917876086023e-05, + "loss": 0.2745, + "step": 1000 + }, + { + "epoch": 0.4070760471736478, + "grad_norm": 19.291198655446184, + "learning_rate": 1.9999913718568854e-05, + "loss": 0.5197, + "step": 1001 + }, + { + "epoch": 0.4074827165514437, + "grad_norm": 9.267357794741043, + "learning_rate": 1.999990945839754e-05, + "loss": 0.1216, + "step": 1002 + }, + { + "epoch": 0.4078893859292395, + "grad_norm": 17.280943274039405, + "learning_rate": 1.9999905095572125e-05, + "loss": 0.5319, + "step": 1003 + }, + { + "epoch": 0.4082960553070354, + "grad_norm": 31.303142991341186, + "learning_rate": 1.999990063009265e-05, + "loss": 1.2422, + "step": 1004 + }, + { + "epoch": 0.4087027246848312, + "grad_norm": 23.78755346955345, + "learning_rate": 1.999989606195916e-05, + "loss": 0.7553, + "step": 1005 + }, + { + "epoch": 0.4091093940626271, + "grad_norm": 25.713646215600786, + "learning_rate": 1.9999891391171704e-05, + "loss": 0.9969, + "step": 1006 + }, + { + "epoch": 0.40951606344042296, + "grad_norm": 11.62444223904008, + "learning_rate": 1.999988661773033e-05, + "loss": 0.3676, + "step": 1007 + }, + { + "epoch": 0.4099227328182188, + "grad_norm": 42.315232614274564, + "learning_rate": 1.999988174163509e-05, + "loss": 1.1811, + "step": 1008 + }, + { + 
"epoch": 0.41032940219601466, + "grad_norm": 39.80230794729946, + "learning_rate": 1.999987676288603e-05, + "loss": 2.3046, + "step": 1009 + }, + { + "epoch": 0.4107360715738105, + "grad_norm": 21.30168607830263, + "learning_rate": 1.9999871681483202e-05, + "loss": 0.5647, + "step": 1010 + }, + { + "epoch": 0.41114274095160636, + "grad_norm": 15.034847129019917, + "learning_rate": 1.999986649742666e-05, + "loss": 0.248, + "step": 1011 + }, + { + "epoch": 0.4115494103294022, + "grad_norm": 28.380544192781215, + "learning_rate": 1.9999861210716452e-05, + "loss": 1.15, + "step": 1012 + }, + { + "epoch": 0.41195607970719805, + "grad_norm": 0.942622722355126, + "learning_rate": 1.9999855821352635e-05, + "loss": 0.0085, + "step": 1013 + }, + { + "epoch": 0.4123627490849939, + "grad_norm": 10.508971058425487, + "learning_rate": 1.9999850329335268e-05, + "loss": 0.1918, + "step": 1014 + }, + { + "epoch": 0.41276941846278975, + "grad_norm": 29.45526684301631, + "learning_rate": 1.9999844734664403e-05, + "loss": 0.6518, + "step": 1015 + }, + { + "epoch": 0.4131760878405856, + "grad_norm": 24.948362541378792, + "learning_rate": 1.9999839037340104e-05, + "loss": 0.8537, + "step": 1016 + }, + { + "epoch": 0.41358275721838145, + "grad_norm": 31.503082279278146, + "learning_rate": 1.9999833237362418e-05, + "loss": 0.7236, + "step": 1017 + }, + { + "epoch": 0.4139894265961773, + "grad_norm": 25.13021800760798, + "learning_rate": 1.9999827334731417e-05, + "loss": 0.9765, + "step": 1018 + }, + { + "epoch": 0.41439609597397314, + "grad_norm": 9.549411770816032, + "learning_rate": 1.999982132944715e-05, + "loss": 0.2566, + "step": 1019 + }, + { + "epoch": 0.414802765351769, + "grad_norm": 7.411409688064201, + "learning_rate": 1.9999815221509686e-05, + "loss": 0.2224, + "step": 1020 + }, + { + "epoch": 0.41520943472956484, + "grad_norm": 49.674347939840814, + "learning_rate": 1.9999809010919087e-05, + "loss": 1.7278, + "step": 1021 + }, + { + "epoch": 0.4156161041073607, + "grad_norm": 32.42621056580737, + "learning_rate": 1.9999802697675418e-05, + "loss": 0.7514, + "step": 1022 + }, + { + "epoch": 0.4160227734851566, + "grad_norm": 14.158693675045523, + "learning_rate": 1.9999796281778737e-05, + "loss": 0.3275, + "step": 1023 + }, + { + "epoch": 0.4164294428629524, + "grad_norm": 12.788337649693467, + "learning_rate": 1.9999789763229115e-05, + "loss": 0.2668, + "step": 1024 + }, + { + "epoch": 0.4168361122407483, + "grad_norm": 20.005814502818062, + "learning_rate": 1.999978314202662e-05, + "loss": 0.5331, + "step": 1025 + }, + { + "epoch": 0.4172427816185441, + "grad_norm": 1.2123422422944643, + "learning_rate": 1.999977641817132e-05, + "loss": 0.0061, + "step": 1026 + }, + { + "epoch": 0.41764945099634, + "grad_norm": 8.802529417804314, + "learning_rate": 1.9999769591663277e-05, + "loss": 0.2904, + "step": 1027 + }, + { + "epoch": 0.4180561203741358, + "grad_norm": 20.974052913796065, + "learning_rate": 1.9999762662502568e-05, + "loss": 0.7623, + "step": 1028 + }, + { + "epoch": 0.4184627897519317, + "grad_norm": 7.604180677325983, + "learning_rate": 1.9999755630689265e-05, + "loss": 0.1063, + "step": 1029 + }, + { + "epoch": 0.4188694591297275, + "grad_norm": 4.812390971371122, + "learning_rate": 1.9999748496223436e-05, + "loss": 0.066, + "step": 1030 + }, + { + "epoch": 0.4192761285075234, + "grad_norm": 51.104901666431864, + "learning_rate": 1.9999741259105156e-05, + "loss": 1.2973, + "step": 1031 + }, + { + "epoch": 0.41968279788531926, + "grad_norm": 33.78222893911817, + "learning_rate": 
1.9999733919334502e-05, + "loss": 1.3769, + "step": 1032 + }, + { + "epoch": 0.4200894672631151, + "grad_norm": 11.974496023753481, + "learning_rate": 1.9999726476911542e-05, + "loss": 0.4893, + "step": 1033 + }, + { + "epoch": 0.42049613664091096, + "grad_norm": 14.911997691325197, + "learning_rate": 1.999971893183636e-05, + "loss": 0.5415, + "step": 1034 + }, + { + "epoch": 0.4209028060187068, + "grad_norm": 19.329619029048104, + "learning_rate": 1.9999711284109028e-05, + "loss": 0.4841, + "step": 1035 + }, + { + "epoch": 0.42130947539650265, + "grad_norm": 35.0491400418096, + "learning_rate": 1.999970353372963e-05, + "loss": 1.128, + "step": 1036 + }, + { + "epoch": 0.4217161447742985, + "grad_norm": 87.2316603015265, + "learning_rate": 1.999969568069824e-05, + "loss": 1.1093, + "step": 1037 + }, + { + "epoch": 0.42212281415209435, + "grad_norm": 11.657807767807261, + "learning_rate": 1.9999687725014943e-05, + "loss": 0.177, + "step": 1038 + }, + { + "epoch": 0.4225294835298902, + "grad_norm": 8.293954960480432, + "learning_rate": 1.9999679666679816e-05, + "loss": 0.2231, + "step": 1039 + }, + { + "epoch": 0.42293615290768605, + "grad_norm": 32.13041167167305, + "learning_rate": 1.999967150569295e-05, + "loss": 1.6349, + "step": 1040 + }, + { + "epoch": 0.4233428222854819, + "grad_norm": 26.340814248731597, + "learning_rate": 1.9999663242054416e-05, + "loss": 0.9225, + "step": 1041 + }, + { + "epoch": 0.42374949166327774, + "grad_norm": 11.239879555046357, + "learning_rate": 1.999965487576431e-05, + "loss": 0.1383, + "step": 1042 + }, + { + "epoch": 0.4241561610410736, + "grad_norm": 38.155403179360036, + "learning_rate": 1.9999646406822715e-05, + "loss": 1.0051, + "step": 1043 + }, + { + "epoch": 0.42456283041886944, + "grad_norm": 19.094007896825467, + "learning_rate": 1.9999637835229715e-05, + "loss": 0.888, + "step": 1044 + }, + { + "epoch": 0.4249694997966653, + "grad_norm": 15.891843897658974, + "learning_rate": 1.9999629160985398e-05, + "loss": 0.4772, + "step": 1045 + }, + { + "epoch": 0.42537616917446114, + "grad_norm": 9.634370073908782, + "learning_rate": 1.999962038408986e-05, + "loss": 0.3453, + "step": 1046 + }, + { + "epoch": 0.425782838552257, + "grad_norm": 26.006435977862232, + "learning_rate": 1.9999611504543185e-05, + "loss": 0.7317, + "step": 1047 + }, + { + "epoch": 0.4261895079300529, + "grad_norm": 29.7526240270981, + "learning_rate": 1.9999602522345462e-05, + "loss": 0.7813, + "step": 1048 + }, + { + "epoch": 0.4265961773078487, + "grad_norm": 21.276610641077653, + "learning_rate": 1.9999593437496788e-05, + "loss": 0.6834, + "step": 1049 + }, + { + "epoch": 0.4270028466856446, + "grad_norm": 18.845020989514257, + "learning_rate": 1.9999584249997255e-05, + "loss": 0.2998, + "step": 1050 + }, + { + "epoch": 0.4274095160634404, + "grad_norm": 4.28174945377569, + "learning_rate": 1.9999574959846954e-05, + "loss": 0.0704, + "step": 1051 + }, + { + "epoch": 0.4278161854412363, + "grad_norm": 8.578194550541852, + "learning_rate": 1.9999565567045988e-05, + "loss": 0.1083, + "step": 1052 + }, + { + "epoch": 0.4282228548190321, + "grad_norm": 19.4715807522194, + "learning_rate": 1.9999556071594447e-05, + "loss": 0.6953, + "step": 1053 + }, + { + "epoch": 0.428629524196828, + "grad_norm": 20.613601848861745, + "learning_rate": 1.999954647349243e-05, + "loss": 0.9973, + "step": 1054 + }, + { + "epoch": 0.4290361935746238, + "grad_norm": 21.321126392201553, + "learning_rate": 1.9999536772740035e-05, + "loss": 1.0152, + "step": 1055 + }, + { + "epoch": 0.4294428629524197, + 
"grad_norm": 16.495560704741802, + "learning_rate": 1.9999526969337364e-05, + "loss": 0.6773, + "step": 1056 + }, + { + "epoch": 0.42984953233021556, + "grad_norm": 19.509325192435405, + "learning_rate": 1.9999517063284515e-05, + "loss": 0.4421, + "step": 1057 + }, + { + "epoch": 0.4302562017080114, + "grad_norm": 30.051285620410894, + "learning_rate": 1.9999507054581593e-05, + "loss": 1.1966, + "step": 1058 + }, + { + "epoch": 0.43066287108580725, + "grad_norm": 27.305518132783668, + "learning_rate": 1.9999496943228696e-05, + "loss": 1.2571, + "step": 1059 + }, + { + "epoch": 0.4310695404636031, + "grad_norm": 27.553520646317835, + "learning_rate": 1.9999486729225927e-05, + "loss": 0.9643, + "step": 1060 + }, + { + "epoch": 0.43147620984139895, + "grad_norm": 23.18611624411215, + "learning_rate": 1.99994764125734e-05, + "loss": 0.561, + "step": 1061 + }, + { + "epoch": 0.43188287921919477, + "grad_norm": 20.164838284814717, + "learning_rate": 1.9999465993271212e-05, + "loss": 0.4998, + "step": 1062 + }, + { + "epoch": 0.43228954859699065, + "grad_norm": 17.972145781059105, + "learning_rate": 1.9999455471319473e-05, + "loss": 0.5568, + "step": 1063 + }, + { + "epoch": 0.4326962179747865, + "grad_norm": 46.03518564727453, + "learning_rate": 1.9999444846718292e-05, + "loss": 0.7446, + "step": 1064 + }, + { + "epoch": 0.43310288735258234, + "grad_norm": 18.803796869662076, + "learning_rate": 1.9999434119467777e-05, + "loss": 0.6967, + "step": 1065 + }, + { + "epoch": 0.4335095567303782, + "grad_norm": 53.012846800001014, + "learning_rate": 1.999942328956804e-05, + "loss": 0.6118, + "step": 1066 + }, + { + "epoch": 0.43391622610817404, + "grad_norm": 19.76677672051922, + "learning_rate": 1.9999412357019186e-05, + "loss": 0.7915, + "step": 1067 + }, + { + "epoch": 0.4343228954859699, + "grad_norm": 22.667409879114658, + "learning_rate": 1.9999401321821335e-05, + "loss": 0.9216, + "step": 1068 + }, + { + "epoch": 0.43472956486376574, + "grad_norm": 28.240145920397886, + "learning_rate": 1.9999390183974594e-05, + "loss": 0.6022, + "step": 1069 + }, + { + "epoch": 0.4351362342415616, + "grad_norm": 92.78295682363095, + "learning_rate": 1.9999378943479083e-05, + "loss": 1.581, + "step": 1070 + }, + { + "epoch": 0.43554290361935744, + "grad_norm": 24.957931370369394, + "learning_rate": 1.9999367600334913e-05, + "loss": 0.7959, + "step": 1071 + }, + { + "epoch": 0.4359495729971533, + "grad_norm": 37.69440206031077, + "learning_rate": 1.99993561545422e-05, + "loss": 1.0166, + "step": 1072 + }, + { + "epoch": 0.4363562423749492, + "grad_norm": 11.384264799592161, + "learning_rate": 1.9999344606101067e-05, + "loss": 0.3128, + "step": 1073 + }, + { + "epoch": 0.436762911752745, + "grad_norm": 3.0721875081506815, + "learning_rate": 1.9999332955011628e-05, + "loss": 0.0499, + "step": 1074 + }, + { + "epoch": 0.4371695811305409, + "grad_norm": 27.486351445251255, + "learning_rate": 1.9999321201274006e-05, + "loss": 0.7404, + "step": 1075 + }, + { + "epoch": 0.4375762505083367, + "grad_norm": 11.982119222495038, + "learning_rate": 1.9999309344888314e-05, + "loss": 0.2111, + "step": 1076 + }, + { + "epoch": 0.4379829198861326, + "grad_norm": 6.2573677989935685, + "learning_rate": 1.9999297385854684e-05, + "loss": 0.1012, + "step": 1077 + }, + { + "epoch": 0.4383895892639284, + "grad_norm": 36.2248991534407, + "learning_rate": 1.999928532417323e-05, + "loss": 1.4823, + "step": 1078 + }, + { + "epoch": 0.4387962586417243, + "grad_norm": 24.776916450906725, + "learning_rate": 1.9999273159844082e-05, + "loss": 
0.3172, + "step": 1079 + }, + { + "epoch": 0.43920292801952016, + "grad_norm": 16.979610257065413, + "learning_rate": 1.999926089286736e-05, + "loss": 0.7789, + "step": 1080 + }, + { + "epoch": 0.439609597397316, + "grad_norm": 22.33632368755952, + "learning_rate": 1.9999248523243193e-05, + "loss": 1.2347, + "step": 1081 + }, + { + "epoch": 0.44001626677511185, + "grad_norm": 21.817532613684026, + "learning_rate": 1.999923605097171e-05, + "loss": 0.8394, + "step": 1082 + }, + { + "epoch": 0.4404229361529077, + "grad_norm": 24.684082627539325, + "learning_rate": 1.9999223476053034e-05, + "loss": 0.3862, + "step": 1083 + }, + { + "epoch": 0.44082960553070355, + "grad_norm": 14.94241014685891, + "learning_rate": 1.9999210798487298e-05, + "loss": 0.3638, + "step": 1084 + }, + { + "epoch": 0.44123627490849937, + "grad_norm": 21.29161738192782, + "learning_rate": 1.9999198018274632e-05, + "loss": 0.1507, + "step": 1085 + }, + { + "epoch": 0.44164294428629525, + "grad_norm": 22.43742039395399, + "learning_rate": 1.9999185135415163e-05, + "loss": 0.7322, + "step": 1086 + }, + { + "epoch": 0.44204961366409107, + "grad_norm": 37.55926953053442, + "learning_rate": 1.9999172149909026e-05, + "loss": 1.0309, + "step": 1087 + }, + { + "epoch": 0.44245628304188694, + "grad_norm": 27.821548297481343, + "learning_rate": 1.9999159061756357e-05, + "loss": 1.1042, + "step": 1088 + }, + { + "epoch": 0.4428629524196828, + "grad_norm": 29.499908022044515, + "learning_rate": 1.9999145870957287e-05, + "loss": 1.0999, + "step": 1089 + }, + { + "epoch": 0.44326962179747864, + "grad_norm": 29.01984834208943, + "learning_rate": 1.999913257751195e-05, + "loss": 1.1389, + "step": 1090 + }, + { + "epoch": 0.4436762911752745, + "grad_norm": 47.12264397091942, + "learning_rate": 1.9999119181420486e-05, + "loss": 0.8475, + "step": 1091 + }, + { + "epoch": 0.44408296055307034, + "grad_norm": 13.80978772223101, + "learning_rate": 1.999910568268303e-05, + "loss": 0.4628, + "step": 1092 + }, + { + "epoch": 0.4444896299308662, + "grad_norm": 11.6779001233251, + "learning_rate": 1.999909208129972e-05, + "loss": 0.2524, + "step": 1093 + }, + { + "epoch": 0.44489629930866204, + "grad_norm": 44.67667852811322, + "learning_rate": 1.9999078377270704e-05, + "loss": 1.0778, + "step": 1094 + }, + { + "epoch": 0.4453029686864579, + "grad_norm": 18.56985575158798, + "learning_rate": 1.9999064570596113e-05, + "loss": 0.4723, + "step": 1095 + }, + { + "epoch": 0.4457096380642538, + "grad_norm": 23.517826965595777, + "learning_rate": 1.999905066127609e-05, + "loss": 0.7073, + "step": 1096 + }, + { + "epoch": 0.4461163074420496, + "grad_norm": 28.94119170144519, + "learning_rate": 1.9999036649310783e-05, + "loss": 0.7941, + "step": 1097 + }, + { + "epoch": 0.4465229768198455, + "grad_norm": 8.62525759951504, + "learning_rate": 1.999902253470033e-05, + "loss": 0.1654, + "step": 1098 + }, + { + "epoch": 0.4469296461976413, + "grad_norm": 18.341659934534288, + "learning_rate": 1.9999008317444873e-05, + "loss": 0.7666, + "step": 1099 + }, + { + "epoch": 0.4473363155754372, + "grad_norm": 10.103078719818592, + "learning_rate": 1.999899399754457e-05, + "loss": 0.1963, + "step": 1100 + }, + { + "epoch": 0.447742984953233, + "grad_norm": 10.45873770632186, + "learning_rate": 1.9998979574999563e-05, + "loss": 0.1153, + "step": 1101 + }, + { + "epoch": 0.4481496543310289, + "grad_norm": 75.88190386689719, + "learning_rate": 1.9998965049809993e-05, + "loss": 2.1074, + "step": 1102 + }, + { + "epoch": 0.4485563237088247, + "grad_norm": 18.26445823174301, + 
"learning_rate": 1.9998950421976015e-05, + "loss": 0.5448, + "step": 1103 + }, + { + "epoch": 0.4489629930866206, + "grad_norm": 10.128207811011453, + "learning_rate": 1.9998935691497784e-05, + "loss": 0.107, + "step": 1104 + }, + { + "epoch": 0.44936966246441645, + "grad_norm": 21.52915796442128, + "learning_rate": 1.999892085837544e-05, + "loss": 0.8772, + "step": 1105 + }, + { + "epoch": 0.4497763318422123, + "grad_norm": 18.339823165324425, + "learning_rate": 1.9998905922609143e-05, + "loss": 0.5828, + "step": 1106 + }, + { + "epoch": 0.45018300122000815, + "grad_norm": 33.570886102902286, + "learning_rate": 1.9998890884199044e-05, + "loss": 1.0873, + "step": 1107 + }, + { + "epoch": 0.45058967059780397, + "grad_norm": 5.386303916949442, + "learning_rate": 1.9998875743145296e-05, + "loss": 0.0391, + "step": 1108 + }, + { + "epoch": 0.45099633997559985, + "grad_norm": 13.374403665431032, + "learning_rate": 1.9998860499448058e-05, + "loss": 0.4242, + "step": 1109 + }, + { + "epoch": 0.45140300935339567, + "grad_norm": 5.284339377155168, + "learning_rate": 1.9998845153107486e-05, + "loss": 0.0677, + "step": 1110 + }, + { + "epoch": 0.45180967873119154, + "grad_norm": 18.357364750717537, + "learning_rate": 1.9998829704123735e-05, + "loss": 0.4813, + "step": 1111 + }, + { + "epoch": 0.4522163481089874, + "grad_norm": 20.91405312926603, + "learning_rate": 1.9998814152496963e-05, + "loss": 0.7707, + "step": 1112 + }, + { + "epoch": 0.45262301748678324, + "grad_norm": 17.23240364898034, + "learning_rate": 1.9998798498227334e-05, + "loss": 0.4953, + "step": 1113 + }, + { + "epoch": 0.4530296868645791, + "grad_norm": 20.857933954403517, + "learning_rate": 1.9998782741315005e-05, + "loss": 0.921, + "step": 1114 + }, + { + "epoch": 0.45343635624237494, + "grad_norm": 16.212942739811243, + "learning_rate": 1.999876688176014e-05, + "loss": 0.3082, + "step": 1115 + }, + { + "epoch": 0.4538430256201708, + "grad_norm": 11.194564817786445, + "learning_rate": 1.9998750919562897e-05, + "loss": 0.2153, + "step": 1116 + }, + { + "epoch": 0.45424969499796664, + "grad_norm": 23.966635918389585, + "learning_rate": 1.9998734854723446e-05, + "loss": 0.7016, + "step": 1117 + }, + { + "epoch": 0.4546563643757625, + "grad_norm": 32.00047710566409, + "learning_rate": 1.9998718687241952e-05, + "loss": 0.1835, + "step": 1118 + }, + { + "epoch": 0.45506303375355833, + "grad_norm": 17.03676055567296, + "learning_rate": 1.9998702417118577e-05, + "loss": 0.4956, + "step": 1119 + }, + { + "epoch": 0.4554697031313542, + "grad_norm": 40.159233460196575, + "learning_rate": 1.9998686044353488e-05, + "loss": 0.2612, + "step": 1120 + }, + { + "epoch": 0.4558763725091501, + "grad_norm": 17.670152871814704, + "learning_rate": 1.9998669568946854e-05, + "loss": 0.6172, + "step": 1121 + }, + { + "epoch": 0.4562830418869459, + "grad_norm": 10.098354846549178, + "learning_rate": 1.9998652990898845e-05, + "loss": 0.2542, + "step": 1122 + }, + { + "epoch": 0.4566897112647418, + "grad_norm": 23.10090573552521, + "learning_rate": 1.9998636310209632e-05, + "loss": 0.4081, + "step": 1123 + }, + { + "epoch": 0.4570963806425376, + "grad_norm": 15.464535230510993, + "learning_rate": 1.9998619526879386e-05, + "loss": 0.082, + "step": 1124 + }, + { + "epoch": 0.4575030500203335, + "grad_norm": 18.71419221796546, + "learning_rate": 1.9998602640908276e-05, + "loss": 1.0161, + "step": 1125 + }, + { + "epoch": 0.4579097193981293, + "grad_norm": 6.567050061145172, + "learning_rate": 1.999858565229648e-05, + "loss": 0.0887, + "step": 1126 + }, + { + 
"epoch": 0.4583163887759252, + "grad_norm": 8.031714181804732, + "learning_rate": 1.9998568561044166e-05, + "loss": 0.2815, + "step": 1127 + }, + { + "epoch": 0.458723058153721, + "grad_norm": 45.742144587730394, + "learning_rate": 1.999855136715152e-05, + "loss": 1.7394, + "step": 1128 + }, + { + "epoch": 0.4591297275315169, + "grad_norm": 2.0123274726068523, + "learning_rate": 1.9998534070618707e-05, + "loss": 0.0346, + "step": 1129 + }, + { + "epoch": 0.45953639690931275, + "grad_norm": 21.178343457965035, + "learning_rate": 1.999851667144591e-05, + "loss": 0.4171, + "step": 1130 + }, + { + "epoch": 0.45994306628710857, + "grad_norm": 20.38823284230698, + "learning_rate": 1.999849916963331e-05, + "loss": 0.5155, + "step": 1131 + }, + { + "epoch": 0.46034973566490445, + "grad_norm": 99.71186393122699, + "learning_rate": 1.999848156518108e-05, + "loss": 1.0139, + "step": 1132 + }, + { + "epoch": 0.46075640504270027, + "grad_norm": 5.966724335960012, + "learning_rate": 1.9998463858089408e-05, + "loss": 0.1262, + "step": 1133 + }, + { + "epoch": 0.46116307442049614, + "grad_norm": 5.59314125400446, + "learning_rate": 1.999844604835847e-05, + "loss": 0.0766, + "step": 1134 + }, + { + "epoch": 0.46156974379829196, + "grad_norm": 3.2652680727473635, + "learning_rate": 1.9998428135988454e-05, + "loss": 0.0407, + "step": 1135 + }, + { + "epoch": 0.46197641317608784, + "grad_norm": 8.425050850547715, + "learning_rate": 1.999841012097954e-05, + "loss": 0.2818, + "step": 1136 + }, + { + "epoch": 0.4623830825538837, + "grad_norm": 35.06033407753112, + "learning_rate": 1.9998392003331912e-05, + "loss": 1.6622, + "step": 1137 + }, + { + "epoch": 0.46278975193167954, + "grad_norm": 16.543281549600707, + "learning_rate": 1.999837378304576e-05, + "loss": 0.5443, + "step": 1138 + }, + { + "epoch": 0.4631964213094754, + "grad_norm": 18.62750094129083, + "learning_rate": 1.999835546012127e-05, + "loss": 0.738, + "step": 1139 + }, + { + "epoch": 0.46360309068727124, + "grad_norm": 199.29685618820312, + "learning_rate": 1.9998337034558625e-05, + "loss": 1.0574, + "step": 1140 + }, + { + "epoch": 0.4640097600650671, + "grad_norm": 21.11079669867252, + "learning_rate": 1.999831850635802e-05, + "loss": 0.4753, + "step": 1141 + }, + { + "epoch": 0.46441642944286293, + "grad_norm": 18.184293929938878, + "learning_rate": 1.9998299875519643e-05, + "loss": 0.1865, + "step": 1142 + }, + { + "epoch": 0.4648230988206588, + "grad_norm": 14.971284078932838, + "learning_rate": 1.999828114204369e-05, + "loss": 0.3308, + "step": 1143 + }, + { + "epoch": 0.46522976819845463, + "grad_norm": 12.38054790239868, + "learning_rate": 1.9998262305930344e-05, + "loss": 0.2433, + "step": 1144 + }, + { + "epoch": 0.4656364375762505, + "grad_norm": 48.04579236772463, + "learning_rate": 1.9998243367179806e-05, + "loss": 1.2022, + "step": 1145 + }, + { + "epoch": 0.4660431069540464, + "grad_norm": 28.790735156563706, + "learning_rate": 1.9998224325792267e-05, + "loss": 0.486, + "step": 1146 + }, + { + "epoch": 0.4664497763318422, + "grad_norm": 51.630812725279114, + "learning_rate": 1.9998205181767925e-05, + "loss": 1.0586, + "step": 1147 + }, + { + "epoch": 0.4668564457096381, + "grad_norm": 27.28712037238965, + "learning_rate": 1.9998185935106972e-05, + "loss": 1.4403, + "step": 1148 + }, + { + "epoch": 0.4672631150874339, + "grad_norm": 22.29854635962306, + "learning_rate": 1.999816658580961e-05, + "loss": 0.9121, + "step": 1149 + }, + { + "epoch": 0.4676697844652298, + "grad_norm": 19.152952088076383, + "learning_rate": 
1.9998147133876037e-05, + "loss": 0.5112, + "step": 1150 + }, + { + "epoch": 0.4680764538430256, + "grad_norm": 12.37360704007544, + "learning_rate": 1.999812757930645e-05, + "loss": 0.5335, + "step": 1151 + }, + { + "epoch": 0.4684831232208215, + "grad_norm": 14.409772108536151, + "learning_rate": 1.9998107922101052e-05, + "loss": 0.3799, + "step": 1152 + }, + { + "epoch": 0.46888979259861735, + "grad_norm": 26.321904629092053, + "learning_rate": 1.9998088162260045e-05, + "loss": 0.9663, + "step": 1153 + }, + { + "epoch": 0.46929646197641317, + "grad_norm": 14.383020497953114, + "learning_rate": 1.9998068299783632e-05, + "loss": 0.2688, + "step": 1154 + }, + { + "epoch": 0.46970313135420905, + "grad_norm": 14.316491263486508, + "learning_rate": 1.9998048334672013e-05, + "loss": 0.3413, + "step": 1155 + }, + { + "epoch": 0.47010980073200487, + "grad_norm": 2.1740315522275835, + "learning_rate": 1.99980282669254e-05, + "loss": 0.0303, + "step": 1156 + }, + { + "epoch": 0.47051647010980074, + "grad_norm": 3.326510619840411, + "learning_rate": 1.9998008096543988e-05, + "loss": 0.0396, + "step": 1157 + }, + { + "epoch": 0.47092313948759656, + "grad_norm": 15.062521124637994, + "learning_rate": 1.9997987823527998e-05, + "loss": 0.3751, + "step": 1158 + }, + { + "epoch": 0.47132980886539244, + "grad_norm": 9.299437933633735, + "learning_rate": 1.9997967447877626e-05, + "loss": 0.139, + "step": 1159 + }, + { + "epoch": 0.47173647824318826, + "grad_norm": 15.88542644237422, + "learning_rate": 1.9997946969593088e-05, + "loss": 0.3247, + "step": 1160 + }, + { + "epoch": 0.47214314762098414, + "grad_norm": 12.375523818294834, + "learning_rate": 1.9997926388674595e-05, + "loss": 0.1603, + "step": 1161 + }, + { + "epoch": 0.47254981699878, + "grad_norm": 11.640628490161452, + "learning_rate": 1.9997905705122352e-05, + "loss": 0.5557, + "step": 1162 + }, + { + "epoch": 0.47295648637657584, + "grad_norm": 34.65544451625838, + "learning_rate": 1.9997884918936578e-05, + "loss": 1.3082, + "step": 1163 + }, + { + "epoch": 0.4733631557543717, + "grad_norm": 23.596666473011126, + "learning_rate": 1.999786403011748e-05, + "loss": 1.0525, + "step": 1164 + }, + { + "epoch": 0.47376982513216753, + "grad_norm": 25.136090751841646, + "learning_rate": 1.999784303866528e-05, + "loss": 0.7624, + "step": 1165 + }, + { + "epoch": 0.4741764945099634, + "grad_norm": 12.671257477746865, + "learning_rate": 1.999782194458019e-05, + "loss": 0.1812, + "step": 1166 + }, + { + "epoch": 0.47458316388775923, + "grad_norm": 30.190949616080363, + "learning_rate": 1.999780074786242e-05, + "loss": 0.1795, + "step": 1167 + }, + { + "epoch": 0.4749898332655551, + "grad_norm": 19.61110320663683, + "learning_rate": 1.99977794485122e-05, + "loss": 0.1567, + "step": 1168 + }, + { + "epoch": 0.475396502643351, + "grad_norm": 2.91796950678739, + "learning_rate": 1.9997758046529738e-05, + "loss": 0.0228, + "step": 1169 + }, + { + "epoch": 0.4758031720211468, + "grad_norm": 15.042577065707993, + "learning_rate": 1.999773654191526e-05, + "loss": 0.4016, + "step": 1170 + }, + { + "epoch": 0.4762098413989427, + "grad_norm": 14.891491693078516, + "learning_rate": 1.9997714934668983e-05, + "loss": 0.2338, + "step": 1171 + }, + { + "epoch": 0.4766165107767385, + "grad_norm": 9.84735361188029, + "learning_rate": 1.9997693224791134e-05, + "loss": 0.3286, + "step": 1172 + }, + { + "epoch": 0.4770231801545344, + "grad_norm": 13.503915059967081, + "learning_rate": 1.9997671412281927e-05, + "loss": 0.4017, + "step": 1173 + }, + { + "epoch": 
0.4774298495323302, + "grad_norm": 8.73399346000633, + "learning_rate": 1.9997649497141594e-05, + "loss": 0.1473, + "step": 1174 + }, + { + "epoch": 0.4778365189101261, + "grad_norm": 7.419867520134502, + "learning_rate": 1.9997627479370355e-05, + "loss": 0.0874, + "step": 1175 + }, + { + "epoch": 0.4782431882879219, + "grad_norm": 34.14590301088591, + "learning_rate": 1.9997605358968443e-05, + "loss": 1.3674, + "step": 1176 + }, + { + "epoch": 0.47864985766571777, + "grad_norm": 6.454405702968852, + "learning_rate": 1.9997583135936076e-05, + "loss": 0.0917, + "step": 1177 + }, + { + "epoch": 0.47905652704351365, + "grad_norm": 26.471269485380716, + "learning_rate": 1.999756081027349e-05, + "loss": 0.6421, + "step": 1178 + }, + { + "epoch": 0.47946319642130947, + "grad_norm": 20.785682162695014, + "learning_rate": 1.999753838198091e-05, + "loss": 0.2648, + "step": 1179 + }, + { + "epoch": 0.47986986579910534, + "grad_norm": 30.417254003423764, + "learning_rate": 1.9997515851058563e-05, + "loss": 1.1801, + "step": 1180 + }, + { + "epoch": 0.48027653517690116, + "grad_norm": 16.197580129257982, + "learning_rate": 1.9997493217506687e-05, + "loss": 0.9243, + "step": 1181 + }, + { + "epoch": 0.48068320455469704, + "grad_norm": 5.36382087781479, + "learning_rate": 1.999747048132551e-05, + "loss": 0.0921, + "step": 1182 + }, + { + "epoch": 0.48108987393249286, + "grad_norm": 112.7986705302628, + "learning_rate": 1.9997447642515268e-05, + "loss": 0.994, + "step": 1183 + }, + { + "epoch": 0.48149654331028874, + "grad_norm": 20.495969729114645, + "learning_rate": 1.9997424701076196e-05, + "loss": 0.4971, + "step": 1184 + }, + { + "epoch": 0.4819032126880846, + "grad_norm": 21.125417183734687, + "learning_rate": 1.9997401657008528e-05, + "loss": 0.7655, + "step": 1185 + }, + { + "epoch": 0.48230988206588044, + "grad_norm": 4.793343028169757, + "learning_rate": 1.9997378510312498e-05, + "loss": 0.0352, + "step": 1186 + }, + { + "epoch": 0.4827165514436763, + "grad_norm": 15.32245022131916, + "learning_rate": 1.9997355260988347e-05, + "loss": 0.6251, + "step": 1187 + }, + { + "epoch": 0.48312322082147213, + "grad_norm": 31.8367944274435, + "learning_rate": 1.9997331909036312e-05, + "loss": 0.9893, + "step": 1188 + }, + { + "epoch": 0.483529890199268, + "grad_norm": 23.973102172854972, + "learning_rate": 1.9997308454456633e-05, + "loss": 1.2177, + "step": 1189 + }, + { + "epoch": 0.48393655957706383, + "grad_norm": 6.269985539920745, + "learning_rate": 1.999728489724955e-05, + "loss": 0.1036, + "step": 1190 + }, + { + "epoch": 0.4843432289548597, + "grad_norm": 24.616892166085222, + "learning_rate": 1.9997261237415308e-05, + "loss": 0.9238, + "step": 1191 + }, + { + "epoch": 0.4847498983326555, + "grad_norm": 22.01727998398506, + "learning_rate": 1.999723747495415e-05, + "loss": 0.4848, + "step": 1192 + }, + { + "epoch": 0.4851565677104514, + "grad_norm": 20.90949864211276, + "learning_rate": 1.9997213609866315e-05, + "loss": 0.7516, + "step": 1193 + }, + { + "epoch": 0.4855632370882473, + "grad_norm": 3.741978432797156, + "learning_rate": 1.9997189642152054e-05, + "loss": 0.0402, + "step": 1194 + }, + { + "epoch": 0.4859699064660431, + "grad_norm": 5.070141978633352, + "learning_rate": 1.9997165571811604e-05, + "loss": 0.0728, + "step": 1195 + }, + { + "epoch": 0.486376575843839, + "grad_norm": 11.493036544337757, + "learning_rate": 1.9997141398845224e-05, + "loss": 0.4681, + "step": 1196 + }, + { + "epoch": 0.4867832452216348, + "grad_norm": 8.79784231109209, + "learning_rate": 1.9997117123253152e-05, 
+ "loss": 0.2157, + "step": 1197 + }, + { + "epoch": 0.4871899145994307, + "grad_norm": 8.318032887376924, + "learning_rate": 1.9997092745035642e-05, + "loss": 0.0958, + "step": 1198 + }, + { + "epoch": 0.4875965839772265, + "grad_norm": 5.885033899351531, + "learning_rate": 1.9997068264192942e-05, + "loss": 0.0777, + "step": 1199 + }, + { + "epoch": 0.48800325335502237, + "grad_norm": 38.90375594591277, + "learning_rate": 1.999704368072531e-05, + "loss": 0.9319, + "step": 1200 + }, + { + "epoch": 0.4884099227328182, + "grad_norm": 9.840191667826083, + "learning_rate": 1.999701899463299e-05, + "loss": 0.099, + "step": 1201 + }, + { + "epoch": 0.48881659211061407, + "grad_norm": 17.16938779312932, + "learning_rate": 1.999699420591624e-05, + "loss": 1.0209, + "step": 1202 + }, + { + "epoch": 0.48922326148840994, + "grad_norm": 15.772075845120948, + "learning_rate": 1.999696931457531e-05, + "loss": 0.3924, + "step": 1203 + }, + { + "epoch": 0.48962993086620576, + "grad_norm": 14.114044895003302, + "learning_rate": 1.999694432061046e-05, + "loss": 0.4012, + "step": 1204 + }, + { + "epoch": 0.49003660024400164, + "grad_norm": 13.707210250361353, + "learning_rate": 1.9996919224021946e-05, + "loss": 0.3245, + "step": 1205 + }, + { + "epoch": 0.49044326962179746, + "grad_norm": 33.369787239505875, + "learning_rate": 1.9996894024810024e-05, + "loss": 0.9161, + "step": 1206 + }, + { + "epoch": 0.49084993899959334, + "grad_norm": 31.38298570524836, + "learning_rate": 1.9996868722974954e-05, + "loss": 0.8013, + "step": 1207 + }, + { + "epoch": 0.49125660837738916, + "grad_norm": 12.318831167395636, + "learning_rate": 1.9996843318516995e-05, + "loss": 1.1447, + "step": 1208 + }, + { + "epoch": 0.49166327775518504, + "grad_norm": 25.073786860922947, + "learning_rate": 1.999681781143641e-05, + "loss": 0.1569, + "step": 1209 + }, + { + "epoch": 0.4920699471329809, + "grad_norm": 13.981197380891265, + "learning_rate": 1.9996792201733456e-05, + "loss": 0.5256, + "step": 1210 + }, + { + "epoch": 0.49247661651077673, + "grad_norm": 12.903783988481697, + "learning_rate": 1.9996766489408398e-05, + "loss": 0.5888, + "step": 1211 + }, + { + "epoch": 0.4928832858885726, + "grad_norm": 33.831291491026555, + "learning_rate": 1.9996740674461504e-05, + "loss": 0.1815, + "step": 1212 + }, + { + "epoch": 0.49328995526636843, + "grad_norm": 26.64108866067166, + "learning_rate": 1.9996714756893035e-05, + "loss": 0.9616, + "step": 1213 + }, + { + "epoch": 0.4936966246441643, + "grad_norm": 17.503915703844797, + "learning_rate": 1.999668873670326e-05, + "loss": 0.6026, + "step": 1214 + }, + { + "epoch": 0.4941032940219601, + "grad_norm": 5.736474926621896, + "learning_rate": 1.9996662613892438e-05, + "loss": 0.0796, + "step": 1215 + }, + { + "epoch": 0.494509963399756, + "grad_norm": 28.533493133346425, + "learning_rate": 1.9996636388460846e-05, + "loss": 0.7782, + "step": 1216 + }, + { + "epoch": 0.4949166327775518, + "grad_norm": 12.606233956213172, + "learning_rate": 1.9996610060408748e-05, + "loss": 0.3807, + "step": 1217 + }, + { + "epoch": 0.4953233021553477, + "grad_norm": 3.4281516785317603, + "learning_rate": 1.999658362973642e-05, + "loss": 0.045, + "step": 1218 + }, + { + "epoch": 0.4957299715331436, + "grad_norm": 65.82871822983562, + "learning_rate": 1.999655709644413e-05, + "loss": 0.4146, + "step": 1219 + }, + { + "epoch": 0.4961366409109394, + "grad_norm": 35.55267225676166, + "learning_rate": 1.9996530460532148e-05, + "loss": 0.0798, + "step": 1220 + }, + { + "epoch": 0.4965433102887353, + "grad_norm": 
32.12061419355753, + "learning_rate": 1.9996503722000753e-05, + "loss": 0.7878, + "step": 1221 + }, + { + "epoch": 0.4969499796665311, + "grad_norm": 20.013734679106346, + "learning_rate": 1.9996476880850214e-05, + "loss": 0.656, + "step": 1222 + }, + { + "epoch": 0.49735664904432697, + "grad_norm": 14.378485551963447, + "learning_rate": 1.9996449937080808e-05, + "loss": 0.3157, + "step": 1223 + }, + { + "epoch": 0.4977633184221228, + "grad_norm": 21.952024246258905, + "learning_rate": 1.9996422890692814e-05, + "loss": 0.704, + "step": 1224 + }, + { + "epoch": 0.49816998779991867, + "grad_norm": 14.645254091472749, + "learning_rate": 1.9996395741686504e-05, + "loss": 0.3009, + "step": 1225 + }, + { + "epoch": 0.49857665717771454, + "grad_norm": 18.174234109512195, + "learning_rate": 1.9996368490062163e-05, + "loss": 0.5765, + "step": 1226 + }, + { + "epoch": 0.49898332655551036, + "grad_norm": 10.439378303138373, + "learning_rate": 1.999634113582007e-05, + "loss": 0.1754, + "step": 1227 + }, + { + "epoch": 0.49938999593330624, + "grad_norm": 39.484774044452344, + "learning_rate": 1.99963136789605e-05, + "loss": 1.423, + "step": 1228 + }, + { + "epoch": 0.49979666531110206, + "grad_norm": 41.85991624102765, + "learning_rate": 1.9996286119483744e-05, + "loss": 1.144, + "step": 1229 + }, + { + "epoch": 0.5002033346888979, + "grad_norm": 14.923393028776594, + "learning_rate": 1.999625845739008e-05, + "loss": 0.3465, + "step": 1230 + }, + { + "epoch": 0.5006100040666938, + "grad_norm": 18.699085375172906, + "learning_rate": 1.999623069267979e-05, + "loss": 0.6602, + "step": 1231 + }, + { + "epoch": 0.5010166734444896, + "grad_norm": 10.959724933748458, + "learning_rate": 1.9996202825353158e-05, + "loss": 0.1502, + "step": 1232 + }, + { + "epoch": 0.5014233428222855, + "grad_norm": 15.097078326737751, + "learning_rate": 1.9996174855410478e-05, + "loss": 0.1824, + "step": 1233 + }, + { + "epoch": 0.5018300122000814, + "grad_norm": 19.18281347862503, + "learning_rate": 1.999614678285203e-05, + "loss": 0.5802, + "step": 1234 + }, + { + "epoch": 0.5022366815778772, + "grad_norm": 28.406046417753853, + "learning_rate": 1.99961186076781e-05, + "loss": 0.5706, + "step": 1235 + }, + { + "epoch": 0.502643350955673, + "grad_norm": 48.32180950541135, + "learning_rate": 1.9996090329888988e-05, + "loss": 1.2779, + "step": 1236 + }, + { + "epoch": 0.5030500203334689, + "grad_norm": 19.403813126648284, + "learning_rate": 1.9996061949484977e-05, + "loss": 0.4848, + "step": 1237 + }, + { + "epoch": 0.5034566897112648, + "grad_norm": 29.49260441062258, + "learning_rate": 1.9996033466466357e-05, + "loss": 0.9181, + "step": 1238 + }, + { + "epoch": 0.5038633590890605, + "grad_norm": 6.569350139463069, + "learning_rate": 1.9996004880833425e-05, + "loss": 0.2364, + "step": 1239 + }, + { + "epoch": 0.5042700284668564, + "grad_norm": 16.08171263808386, + "learning_rate": 1.9995976192586467e-05, + "loss": 0.2781, + "step": 1240 + }, + { + "epoch": 0.5046766978446523, + "grad_norm": 20.77067464851792, + "learning_rate": 1.9995947401725787e-05, + "loss": 0.7633, + "step": 1241 + }, + { + "epoch": 0.5050833672224482, + "grad_norm": 76.56847374946967, + "learning_rate": 1.9995918508251676e-05, + "loss": 1.5501, + "step": 1242 + }, + { + "epoch": 0.505490036600244, + "grad_norm": 7.164797733324011, + "learning_rate": 1.999588951216443e-05, + "loss": 0.3095, + "step": 1243 + }, + { + "epoch": 0.5058967059780398, + "grad_norm": 17.545463318237708, + "learning_rate": 1.9995860413464347e-05, + "loss": 0.3153, + "step": 1244 + }, 
+ { + "epoch": 0.5063033753558357, + "grad_norm": 28.95417076908507, + "learning_rate": 1.9995831212151725e-05, + "loss": 0.6849, + "step": 1245 + }, + { + "epoch": 0.5067100447336316, + "grad_norm": 93.9063057085275, + "learning_rate": 1.9995801908226865e-05, + "loss": 0.4221, + "step": 1246 + }, + { + "epoch": 0.5071167141114274, + "grad_norm": 39.75833840386845, + "learning_rate": 1.999577250169007e-05, + "loss": 1.9233, + "step": 1247 + }, + { + "epoch": 0.5075233834892232, + "grad_norm": 30.492780766517672, + "learning_rate": 1.9995742992541638e-05, + "loss": 1.4257, + "step": 1248 + }, + { + "epoch": 0.5079300528670191, + "grad_norm": 25.602601336989725, + "learning_rate": 1.9995713380781874e-05, + "loss": 1.1465, + "step": 1249 + }, + { + "epoch": 0.508336722244815, + "grad_norm": 25.160684542935147, + "learning_rate": 1.999568366641108e-05, + "loss": 0.7067, + "step": 1250 + }, + { + "epoch": 0.5087433916226108, + "grad_norm": 19.50916913847119, + "learning_rate": 1.9995653849429563e-05, + "loss": 0.5907, + "step": 1251 + }, + { + "epoch": 0.5091500610004067, + "grad_norm": 18.567797241387083, + "learning_rate": 1.999562392983763e-05, + "loss": 0.4512, + "step": 1252 + }, + { + "epoch": 0.5095567303782025, + "grad_norm": 17.81173192420958, + "learning_rate": 1.9995593907635586e-05, + "loss": 0.5808, + "step": 1253 + }, + { + "epoch": 0.5099633997559984, + "grad_norm": 5.52103940371618, + "learning_rate": 1.9995563782823738e-05, + "loss": 0.1071, + "step": 1254 + }, + { + "epoch": 0.5103700691337942, + "grad_norm": 31.330250516928125, + "learning_rate": 1.99955335554024e-05, + "loss": 1.9117, + "step": 1255 + }, + { + "epoch": 0.5107767385115901, + "grad_norm": 22.380945617069525, + "learning_rate": 1.9995503225371876e-05, + "loss": 1.2089, + "step": 1256 + }, + { + "epoch": 0.5111834078893859, + "grad_norm": 13.267003462753012, + "learning_rate": 1.9995472792732486e-05, + "loss": 0.3287, + "step": 1257 + }, + { + "epoch": 0.5115900772671818, + "grad_norm": 16.63892505468577, + "learning_rate": 1.999544225748453e-05, + "loss": 0.4668, + "step": 1258 + }, + { + "epoch": 0.5119967466449776, + "grad_norm": 15.912507502591255, + "learning_rate": 1.9995411619628333e-05, + "loss": 0.3089, + "step": 1259 + }, + { + "epoch": 0.5124034160227735, + "grad_norm": 10.673142388638713, + "learning_rate": 1.9995380879164207e-05, + "loss": 0.0964, + "step": 1260 + }, + { + "epoch": 0.5128100854005694, + "grad_norm": 18.697363345529286, + "learning_rate": 1.9995350036092462e-05, + "loss": 0.8952, + "step": 1261 + }, + { + "epoch": 0.5132167547783651, + "grad_norm": 1.7139014544450828, + "learning_rate": 1.999531909041342e-05, + "loss": 0.0263, + "step": 1262 + }, + { + "epoch": 0.513623424156161, + "grad_norm": 27.911627866550212, + "learning_rate": 1.9995288042127396e-05, + "loss": 0.8828, + "step": 1263 + }, + { + "epoch": 0.5140300935339569, + "grad_norm": 18.148179621019974, + "learning_rate": 1.9995256891234712e-05, + "loss": 0.5876, + "step": 1264 + }, + { + "epoch": 0.5144367629117528, + "grad_norm": 16.672766466386655, + "learning_rate": 1.9995225637735683e-05, + "loss": 0.4147, + "step": 1265 + }, + { + "epoch": 0.5148434322895485, + "grad_norm": 7.432017865425055, + "learning_rate": 1.9995194281630633e-05, + "loss": 0.1924, + "step": 1266 + }, + { + "epoch": 0.5152501016673444, + "grad_norm": 14.347387167963305, + "learning_rate": 1.999516282291988e-05, + "loss": 0.5725, + "step": 1267 + }, + { + "epoch": 0.5156567710451403, + "grad_norm": 20.73207238235617, + "learning_rate": 
1.9995131261603757e-05, + "loss": 1.0261, + "step": 1268 + }, + { + "epoch": 0.5160634404229362, + "grad_norm": 21.632492812247758, + "learning_rate": 1.9995099597682577e-05, + "loss": 0.4014, + "step": 1269 + }, + { + "epoch": 0.516470109800732, + "grad_norm": 35.63616255110076, + "learning_rate": 1.999506783115667e-05, + "loss": 1.3375, + "step": 1270 + }, + { + "epoch": 0.5168767791785278, + "grad_norm": 3.8023339728636136, + "learning_rate": 1.999503596202636e-05, + "loss": 0.0621, + "step": 1271 + }, + { + "epoch": 0.5172834485563237, + "grad_norm": 22.448473165260317, + "learning_rate": 1.9995003990291978e-05, + "loss": 0.4495, + "step": 1272 + }, + { + "epoch": 0.5176901179341196, + "grad_norm": 15.991244592485613, + "learning_rate": 1.9994971915953848e-05, + "loss": 0.9242, + "step": 1273 + }, + { + "epoch": 0.5180967873119154, + "grad_norm": 39.558915767600205, + "learning_rate": 1.99949397390123e-05, + "loss": 1.5087, + "step": 1274 + }, + { + "epoch": 0.5185034566897113, + "grad_norm": 17.433926706726623, + "learning_rate": 1.999490745946767e-05, + "loss": 0.7627, + "step": 1275 + }, + { + "epoch": 0.5189101260675071, + "grad_norm": 35.15874607126169, + "learning_rate": 1.999487507732028e-05, + "loss": 1.0114, + "step": 1276 + }, + { + "epoch": 0.519316795445303, + "grad_norm": 16.410781745254145, + "learning_rate": 1.9994842592570466e-05, + "loss": 0.6442, + "step": 1277 + }, + { + "epoch": 0.5197234648230988, + "grad_norm": 13.119960663731007, + "learning_rate": 1.9994810005218564e-05, + "loss": 0.352, + "step": 1278 + }, + { + "epoch": 0.5201301342008947, + "grad_norm": 9.202674236196842, + "learning_rate": 1.9994777315264908e-05, + "loss": 0.2781, + "step": 1279 + }, + { + "epoch": 0.5205368035786905, + "grad_norm": 25.22997938431019, + "learning_rate": 1.9994744522709833e-05, + "loss": 1.0938, + "step": 1280 + }, + { + "epoch": 0.5209434729564864, + "grad_norm": 18.138609680581776, + "learning_rate": 1.9994711627553673e-05, + "loss": 0.5471, + "step": 1281 + }, + { + "epoch": 0.5213501423342822, + "grad_norm": 17.55589006196467, + "learning_rate": 1.999467862979677e-05, + "loss": 0.4372, + "step": 1282 + }, + { + "epoch": 0.5217568117120781, + "grad_norm": 29.7580059740763, + "learning_rate": 1.999464552943946e-05, + "loss": 0.7931, + "step": 1283 + }, + { + "epoch": 0.522163481089874, + "grad_norm": 12.675656107593742, + "learning_rate": 1.9994612326482083e-05, + "loss": 0.3994, + "step": 1284 + }, + { + "epoch": 0.5225701504676697, + "grad_norm": 27.533976952404604, + "learning_rate": 1.9994579020924983e-05, + "loss": 1.1078, + "step": 1285 + }, + { + "epoch": 0.5229768198454656, + "grad_norm": 6.921922556438484, + "learning_rate": 1.9994545612768496e-05, + "loss": 0.1525, + "step": 1286 + }, + { + "epoch": 0.5233834892232615, + "grad_norm": 11.841858393174535, + "learning_rate": 1.9994512102012968e-05, + "loss": 0.3626, + "step": 1287 + }, + { + "epoch": 0.5237901586010574, + "grad_norm": 5.338536505250233, + "learning_rate": 1.9994478488658745e-05, + "loss": 0.0967, + "step": 1288 + }, + { + "epoch": 0.5241968279788531, + "grad_norm": 8.761845060340283, + "learning_rate": 1.999444477270617e-05, + "loss": 0.1602, + "step": 1289 + }, + { + "epoch": 0.524603497356649, + "grad_norm": 24.57677366435777, + "learning_rate": 1.9994410954155588e-05, + "loss": 1.4596, + "step": 1290 + }, + { + "epoch": 0.5250101667344449, + "grad_norm": 22.493768026028913, + "learning_rate": 1.9994377033007346e-05, + "loss": 0.7536, + "step": 1291 + }, + { + "epoch": 0.5254168361122408, + 
"grad_norm": 32.86744094397963, + "learning_rate": 1.9994343009261797e-05, + "loss": 1.9021, + "step": 1292 + }, + { + "epoch": 0.5258235054900366, + "grad_norm": 10.840583753156832, + "learning_rate": 1.9994308882919285e-05, + "loss": 0.1619, + "step": 1293 + }, + { + "epoch": 0.5262301748678324, + "grad_norm": 18.376436322442096, + "learning_rate": 1.9994274653980164e-05, + "loss": 0.3853, + "step": 1294 + }, + { + "epoch": 0.5266368442456283, + "grad_norm": 2.5764961981286127, + "learning_rate": 1.999424032244478e-05, + "loss": 0.0317, + "step": 1295 + }, + { + "epoch": 0.5270435136234242, + "grad_norm": 20.195875777458582, + "learning_rate": 1.9994205888313494e-05, + "loss": 0.838, + "step": 1296 + }, + { + "epoch": 0.52745018300122, + "grad_norm": 13.08938188230367, + "learning_rate": 1.9994171351586652e-05, + "loss": 0.1303, + "step": 1297 + }, + { + "epoch": 0.5278568523790158, + "grad_norm": 11.571980241403315, + "learning_rate": 1.9994136712264612e-05, + "loss": 0.3111, + "step": 1298 + }, + { + "epoch": 0.5282635217568117, + "grad_norm": 4.67757128975505, + "learning_rate": 1.999410197034773e-05, + "loss": 0.0457, + "step": 1299 + }, + { + "epoch": 0.5286701911346076, + "grad_norm": 21.682896734987935, + "learning_rate": 1.999406712583636e-05, + "loss": 1.0051, + "step": 1300 + }, + { + "epoch": 0.5290768605124034, + "grad_norm": 5.400541105295034, + "learning_rate": 1.9994032178730856e-05, + "loss": 0.0488, + "step": 1301 + }, + { + "epoch": 0.5294835298901993, + "grad_norm": 7.126489422147434, + "learning_rate": 1.999399712903159e-05, + "loss": 0.1113, + "step": 1302 + }, + { + "epoch": 0.5298901992679951, + "grad_norm": 28.439953130195185, + "learning_rate": 1.9993961976738908e-05, + "loss": 0.4222, + "step": 1303 + }, + { + "epoch": 0.530296868645791, + "grad_norm": 11.536772469210282, + "learning_rate": 1.999392672185318e-05, + "loss": 0.2904, + "step": 1304 + }, + { + "epoch": 0.5307035380235868, + "grad_norm": 24.292952464383927, + "learning_rate": 1.999389136437476e-05, + "loss": 0.1373, + "step": 1305 + }, + { + "epoch": 0.5311102074013827, + "grad_norm": 3.842354147101772, + "learning_rate": 1.9993855904304018e-05, + "loss": 0.0484, + "step": 1306 + }, + { + "epoch": 0.5315168767791786, + "grad_norm": 70.17214908176834, + "learning_rate": 1.9993820341641316e-05, + "loss": 0.5313, + "step": 1307 + }, + { + "epoch": 0.5319235461569743, + "grad_norm": 17.850465499750864, + "learning_rate": 1.9993784676387018e-05, + "loss": 0.7324, + "step": 1308 + }, + { + "epoch": 0.5323302155347702, + "grad_norm": 17.328152621598065, + "learning_rate": 1.9993748908541494e-05, + "loss": 0.1194, + "step": 1309 + }, + { + "epoch": 0.5327368849125661, + "grad_norm": 16.277292154223836, + "learning_rate": 1.9993713038105102e-05, + "loss": 0.4302, + "step": 1310 + }, + { + "epoch": 0.533143554290362, + "grad_norm": 24.04432983496898, + "learning_rate": 1.9993677065078217e-05, + "loss": 0.5073, + "step": 1311 + }, + { + "epoch": 0.5335502236681577, + "grad_norm": 12.240821837672758, + "learning_rate": 1.9993640989461206e-05, + "loss": 0.2652, + "step": 1312 + }, + { + "epoch": 0.5339568930459536, + "grad_norm": 4.081311188477319, + "learning_rate": 1.9993604811254447e-05, + "loss": 0.0856, + "step": 1313 + }, + { + "epoch": 0.5343635624237495, + "grad_norm": 15.924809982037859, + "learning_rate": 1.9993568530458297e-05, + "loss": 0.4176, + "step": 1314 + }, + { + "epoch": 0.5347702318015454, + "grad_norm": 47.34890124132878, + "learning_rate": 1.999353214707314e-05, + "loss": 0.4579, + "step": 
1315 + }, + { + "epoch": 0.5351769011793412, + "grad_norm": 21.848142399043425, + "learning_rate": 1.9993495661099346e-05, + "loss": 0.6629, + "step": 1316 + }, + { + "epoch": 0.535583570557137, + "grad_norm": 26.44376474271664, + "learning_rate": 1.999345907253729e-05, + "loss": 1.4165, + "step": 1317 + }, + { + "epoch": 0.5359902399349329, + "grad_norm": 12.336254650010824, + "learning_rate": 1.9993422381387346e-05, + "loss": 0.2737, + "step": 1318 + }, + { + "epoch": 0.5363969093127288, + "grad_norm": 9.421120053884485, + "learning_rate": 1.9993385587649893e-05, + "loss": 0.1395, + "step": 1319 + }, + { + "epoch": 0.5368035786905246, + "grad_norm": 6.694422128396052, + "learning_rate": 1.9993348691325308e-05, + "loss": 0.0718, + "step": 1320 + }, + { + "epoch": 0.5372102480683204, + "grad_norm": 10.376953625007848, + "learning_rate": 1.9993311692413968e-05, + "loss": 0.2884, + "step": 1321 + }, + { + "epoch": 0.5376169174461163, + "grad_norm": 21.81830081329122, + "learning_rate": 1.9993274590916255e-05, + "loss": 0.8281, + "step": 1322 + }, + { + "epoch": 0.5380235868239122, + "grad_norm": 2.4334339020451567, + "learning_rate": 1.9993237386832545e-05, + "loss": 0.0426, + "step": 1323 + }, + { + "epoch": 0.538430256201708, + "grad_norm": 13.898540038641364, + "learning_rate": 1.9993200080163226e-05, + "loss": 0.3579, + "step": 1324 + }, + { + "epoch": 0.5388369255795039, + "grad_norm": 18.16770720151357, + "learning_rate": 1.9993162670908678e-05, + "loss": 0.4012, + "step": 1325 + }, + { + "epoch": 0.5392435949572997, + "grad_norm": 19.700389966604387, + "learning_rate": 1.9993125159069286e-05, + "loss": 0.4722, + "step": 1326 + }, + { + "epoch": 0.5396502643350956, + "grad_norm": 8.286413567467392, + "learning_rate": 1.9993087544645435e-05, + "loss": 0.2514, + "step": 1327 + }, + { + "epoch": 0.5400569337128914, + "grad_norm": 43.7514030980866, + "learning_rate": 1.9993049827637513e-05, + "loss": 0.6184, + "step": 1328 + }, + { + "epoch": 0.5404636030906873, + "grad_norm": 14.733902672867249, + "learning_rate": 1.9993012008045904e-05, + "loss": 0.3975, + "step": 1329 + }, + { + "epoch": 0.5408702724684831, + "grad_norm": 15.597255244976857, + "learning_rate": 1.9992974085870996e-05, + "loss": 0.9012, + "step": 1330 + }, + { + "epoch": 0.541276941846279, + "grad_norm": 18.536348986662667, + "learning_rate": 1.9992936061113178e-05, + "loss": 0.735, + "step": 1331 + }, + { + "epoch": 0.5416836112240748, + "grad_norm": 19.26012714366998, + "learning_rate": 1.9992897933772846e-05, + "loss": 0.5573, + "step": 1332 + }, + { + "epoch": 0.5420902806018707, + "grad_norm": 12.786208365855284, + "learning_rate": 1.9992859703850387e-05, + "loss": 0.2303, + "step": 1333 + }, + { + "epoch": 0.5424969499796666, + "grad_norm": 15.641677210119306, + "learning_rate": 1.999282137134619e-05, + "loss": 0.956, + "step": 1334 + }, + { + "epoch": 0.5429036193574623, + "grad_norm": 14.332883186008715, + "learning_rate": 1.999278293626066e-05, + "loss": 0.2923, + "step": 1335 + }, + { + "epoch": 0.5433102887352582, + "grad_norm": 18.545754326260514, + "learning_rate": 1.999274439859418e-05, + "loss": 0.7599, + "step": 1336 + }, + { + "epoch": 0.5437169581130541, + "grad_norm": 20.338275418368454, + "learning_rate": 1.9992705758347144e-05, + "loss": 0.8375, + "step": 1337 + }, + { + "epoch": 0.54412362749085, + "grad_norm": 26.278201011834845, + "learning_rate": 1.9992667015519957e-05, + "loss": 1.1057, + "step": 1338 + }, + { + "epoch": 0.5445302968686457, + "grad_norm": 22.29853275848885, + "learning_rate": 
1.9992628170113017e-05, + "loss": 0.3538, + "step": 1339 + }, + { + "epoch": 0.5449369662464416, + "grad_norm": 20.691378378097664, + "learning_rate": 1.999258922212672e-05, + "loss": 0.7433, + "step": 1340 + }, + { + "epoch": 0.5453436356242375, + "grad_norm": 14.805581401835498, + "learning_rate": 1.9992550171561464e-05, + "loss": 0.3406, + "step": 1341 + }, + { + "epoch": 0.5457503050020334, + "grad_norm": 12.430915574643, + "learning_rate": 1.999251101841765e-05, + "loss": 0.3759, + "step": 1342 + }, + { + "epoch": 0.5461569743798292, + "grad_norm": 23.722337679339045, + "learning_rate": 1.999247176269568e-05, + "loss": 0.9147, + "step": 1343 + }, + { + "epoch": 0.546563643757625, + "grad_norm": 31.313927743988838, + "learning_rate": 1.999243240439596e-05, + "loss": 1.5416, + "step": 1344 + }, + { + "epoch": 0.5469703131354209, + "grad_norm": 11.494246395624286, + "learning_rate": 1.9992392943518894e-05, + "loss": 0.3364, + "step": 1345 + }, + { + "epoch": 0.5473769825132168, + "grad_norm": 36.24096527061573, + "learning_rate": 1.9992353380064883e-05, + "loss": 0.3268, + "step": 1346 + }, + { + "epoch": 0.5477836518910126, + "grad_norm": 21.59528194547048, + "learning_rate": 1.9992313714034338e-05, + "loss": 0.3304, + "step": 1347 + }, + { + "epoch": 0.5481903212688085, + "grad_norm": 6.815833576189118, + "learning_rate": 1.999227394542766e-05, + "loss": 0.1305, + "step": 1348 + }, + { + "epoch": 0.5485969906466043, + "grad_norm": 17.64180303728325, + "learning_rate": 1.9992234074245265e-05, + "loss": 0.3942, + "step": 1349 + }, + { + "epoch": 0.5490036600244002, + "grad_norm": 31.41229248064664, + "learning_rate": 1.9992194100487556e-05, + "loss": 1.508, + "step": 1350 + }, + { + "epoch": 0.549410329402196, + "grad_norm": 24.44285199244174, + "learning_rate": 1.9992154024154946e-05, + "loss": 1.1072, + "step": 1351 + }, + { + "epoch": 0.5498169987799919, + "grad_norm": 15.531989983750925, + "learning_rate": 1.9992113845247846e-05, + "loss": 0.7308, + "step": 1352 + }, + { + "epoch": 0.5502236681577877, + "grad_norm": 19.094336167930727, + "learning_rate": 1.999207356376667e-05, + "loss": 0.6406, + "step": 1353 + }, + { + "epoch": 0.5506303375355835, + "grad_norm": 13.601069289155037, + "learning_rate": 1.9992033179711827e-05, + "loss": 0.3284, + "step": 1354 + }, + { + "epoch": 0.5510370069133794, + "grad_norm": 10.646701164672342, + "learning_rate": 1.9991992693083736e-05, + "loss": 0.2068, + "step": 1355 + }, + { + "epoch": 0.5514436762911753, + "grad_norm": 37.37160778798616, + "learning_rate": 1.999195210388281e-05, + "loss": 1.881, + "step": 1356 + }, + { + "epoch": 0.5518503456689712, + "grad_norm": 23.835122916932175, + "learning_rate": 1.9991911412109468e-05, + "loss": 0.5016, + "step": 1357 + }, + { + "epoch": 0.5522570150467669, + "grad_norm": 13.324654268691992, + "learning_rate": 1.9991870617764126e-05, + "loss": 0.4341, + "step": 1358 + }, + { + "epoch": 0.5526636844245628, + "grad_norm": 33.83744819208751, + "learning_rate": 1.9991829720847203e-05, + "loss": 0.9118, + "step": 1359 + }, + { + "epoch": 0.5530703538023587, + "grad_norm": 22.436070291137614, + "learning_rate": 1.999178872135912e-05, + "loss": 0.3516, + "step": 1360 + }, + { + "epoch": 0.5534770231801546, + "grad_norm": 15.318340682765282, + "learning_rate": 1.9991747619300298e-05, + "loss": 0.5753, + "step": 1361 + }, + { + "epoch": 0.5538836925579503, + "grad_norm": 28.562923390695797, + "learning_rate": 1.999170641467116e-05, + "loss": 1.4309, + "step": 1362 + }, + { + "epoch": 0.5542903619357462, + 
"grad_norm": 29.80335324776897, + "learning_rate": 1.999166510747212e-05, + "loss": 0.7953, + "step": 1363 + }, + { + "epoch": 0.5546970313135421, + "grad_norm": 9.476924337068041, + "learning_rate": 1.9991623697703613e-05, + "loss": 0.5076, + "step": 1364 + }, + { + "epoch": 0.555103700691338, + "grad_norm": 28.555904129765757, + "learning_rate": 1.999158218536606e-05, + "loss": 0.6057, + "step": 1365 + }, + { + "epoch": 0.5555103700691338, + "grad_norm": 13.276858057710188, + "learning_rate": 1.9991540570459888e-05, + "loss": 0.2315, + "step": 1366 + }, + { + "epoch": 0.5559170394469296, + "grad_norm": 10.534330273317817, + "learning_rate": 1.9991498852985523e-05, + "loss": 0.4708, + "step": 1367 + }, + { + "epoch": 0.5563237088247255, + "grad_norm": 14.368386480341478, + "learning_rate": 1.9991457032943394e-05, + "loss": 0.3604, + "step": 1368 + }, + { + "epoch": 0.5567303782025214, + "grad_norm": 15.655240222637966, + "learning_rate": 1.9991415110333926e-05, + "loss": 0.5719, + "step": 1369 + }, + { + "epoch": 0.5571370475803172, + "grad_norm": 13.270802355463431, + "learning_rate": 1.9991373085157558e-05, + "loss": 0.4329, + "step": 1370 + }, + { + "epoch": 0.557543716958113, + "grad_norm": 16.608963416137083, + "learning_rate": 1.9991330957414715e-05, + "loss": 0.348, + "step": 1371 + }, + { + "epoch": 0.5579503863359089, + "grad_norm": 15.648896338458252, + "learning_rate": 1.999128872710583e-05, + "loss": 0.4571, + "step": 1372 + }, + { + "epoch": 0.5583570557137048, + "grad_norm": 16.01936769762431, + "learning_rate": 1.999124639423134e-05, + "loss": 1.2097, + "step": 1373 + }, + { + "epoch": 0.5587637250915006, + "grad_norm": 10.260409138694342, + "learning_rate": 1.9991203958791675e-05, + "loss": 0.3421, + "step": 1374 + }, + { + "epoch": 0.5591703944692965, + "grad_norm": 17.209651935264297, + "learning_rate": 1.9991161420787275e-05, + "loss": 0.2042, + "step": 1375 + }, + { + "epoch": 0.5595770638470923, + "grad_norm": 9.458104304684035, + "learning_rate": 1.9991118780218573e-05, + "loss": 0.1386, + "step": 1376 + }, + { + "epoch": 0.5599837332248881, + "grad_norm": 25.855738813263816, + "learning_rate": 1.999107603708601e-05, + "loss": 0.9532, + "step": 1377 + }, + { + "epoch": 0.560390402602684, + "grad_norm": 18.817792206785473, + "learning_rate": 1.9991033191390024e-05, + "loss": 0.6782, + "step": 1378 + }, + { + "epoch": 0.5607970719804799, + "grad_norm": 7.269108630662966, + "learning_rate": 1.999099024313105e-05, + "loss": 0.1797, + "step": 1379 + }, + { + "epoch": 0.5612037413582758, + "grad_norm": 23.458095276273582, + "learning_rate": 1.9990947192309534e-05, + "loss": 0.8168, + "step": 1380 + }, + { + "epoch": 0.5616104107360715, + "grad_norm": 11.320681941577043, + "learning_rate": 1.9990904038925917e-05, + "loss": 0.1942, + "step": 1381 + }, + { + "epoch": 0.5620170801138674, + "grad_norm": 32.188075582441115, + "learning_rate": 1.9990860782980644e-05, + "loss": 0.9452, + "step": 1382 + }, + { + "epoch": 0.5624237494916633, + "grad_norm": 14.248556957827569, + "learning_rate": 1.9990817424474154e-05, + "loss": 0.8706, + "step": 1383 + }, + { + "epoch": 0.5628304188694592, + "grad_norm": 20.19196908638299, + "learning_rate": 1.9990773963406897e-05, + "loss": 0.5137, + "step": 1384 + }, + { + "epoch": 0.5632370882472549, + "grad_norm": 21.1622254779942, + "learning_rate": 1.9990730399779314e-05, + "loss": 0.5807, + "step": 1385 + }, + { + "epoch": 0.5636437576250508, + "grad_norm": 9.507767464115087, + "learning_rate": 1.9990686733591857e-05, + "loss": 0.1843, + 
"step": 1386 + }, + { + "epoch": 0.5640504270028467, + "grad_norm": 17.321609066662635, + "learning_rate": 1.9990642964844974e-05, + "loss": 0.5228, + "step": 1387 + }, + { + "epoch": 0.5644570963806426, + "grad_norm": 19.034234310535812, + "learning_rate": 1.999059909353911e-05, + "loss": 0.6857, + "step": 1388 + }, + { + "epoch": 0.5648637657584384, + "grad_norm": 22.208883733658066, + "learning_rate": 1.999055511967472e-05, + "loss": 0.8181, + "step": 1389 + }, + { + "epoch": 0.5652704351362342, + "grad_norm": 11.594361712918676, + "learning_rate": 1.9990511043252255e-05, + "loss": 0.3292, + "step": 1390 + }, + { + "epoch": 0.5656771045140301, + "grad_norm": 28.06229858692505, + "learning_rate": 1.9990466864272163e-05, + "loss": 1.733, + "step": 1391 + }, + { + "epoch": 0.566083773891826, + "grad_norm": 14.225830911585494, + "learning_rate": 1.99904225827349e-05, + "loss": 0.4406, + "step": 1392 + }, + { + "epoch": 0.5664904432696218, + "grad_norm": 9.057759818646781, + "learning_rate": 1.9990378198640923e-05, + "loss": 0.1302, + "step": 1393 + }, + { + "epoch": 0.5668971126474176, + "grad_norm": 29.93412956456162, + "learning_rate": 1.999033371199069e-05, + "loss": 0.3848, + "step": 1394 + }, + { + "epoch": 0.5673037820252135, + "grad_norm": 8.54502774925036, + "learning_rate": 1.9990289122784646e-05, + "loss": 0.134, + "step": 1395 + }, + { + "epoch": 0.5677104514030094, + "grad_norm": 15.796310457990804, + "learning_rate": 1.999024443102326e-05, + "loss": 0.6644, + "step": 1396 + }, + { + "epoch": 0.5681171207808052, + "grad_norm": 4.287198278777775, + "learning_rate": 1.9990199636706987e-05, + "loss": 0.0391, + "step": 1397 + }, + { + "epoch": 0.5685237901586011, + "grad_norm": 19.267172945850188, + "learning_rate": 1.9990154739836284e-05, + "loss": 0.8852, + "step": 1398 + }, + { + "epoch": 0.5689304595363969, + "grad_norm": 16.80078322578091, + "learning_rate": 1.9990109740411616e-05, + "loss": 0.5451, + "step": 1399 + }, + { + "epoch": 0.5693371289141927, + "grad_norm": 10.642866311010492, + "learning_rate": 1.9990064638433444e-05, + "loss": 0.1319, + "step": 1400 + }, + { + "epoch": 0.5697437982919886, + "grad_norm": 11.022216665833763, + "learning_rate": 1.999001943390223e-05, + "loss": 0.2165, + "step": 1401 + }, + { + "epoch": 0.5701504676697845, + "grad_norm": 10.983702956318192, + "learning_rate": 1.9989974126818443e-05, + "loss": 0.3383, + "step": 1402 + }, + { + "epoch": 0.5705571370475803, + "grad_norm": 14.006040017917082, + "learning_rate": 1.9989928717182535e-05, + "loss": 0.5515, + "step": 1403 + }, + { + "epoch": 0.5709638064253761, + "grad_norm": 19.731023542091986, + "learning_rate": 1.9989883204994987e-05, + "loss": 0.6536, + "step": 1404 + }, + { + "epoch": 0.571370475803172, + "grad_norm": 2.0363308903832293, + "learning_rate": 1.998983759025626e-05, + "loss": 0.0196, + "step": 1405 + }, + { + "epoch": 0.5717771451809679, + "grad_norm": 11.286611844404963, + "learning_rate": 1.998979187296682e-05, + "loss": 0.4129, + "step": 1406 + }, + { + "epoch": 0.5721838145587638, + "grad_norm": 19.38904832551555, + "learning_rate": 1.9989746053127138e-05, + "loss": 0.4108, + "step": 1407 + }, + { + "epoch": 0.5725904839365595, + "grad_norm": 19.837876059983234, + "learning_rate": 1.9989700130737688e-05, + "loss": 0.5511, + "step": 1408 + }, + { + "epoch": 0.5729971533143554, + "grad_norm": 19.66633351904717, + "learning_rate": 1.998965410579894e-05, + "loss": 0.6015, + "step": 1409 + }, + { + "epoch": 0.5734038226921513, + "grad_norm": 6.09069405951329, + "learning_rate": 
1.9989607978311363e-05, + "loss": 0.1073, + "step": 1410 + }, + { + "epoch": 0.5738104920699472, + "grad_norm": 15.532295490125144, + "learning_rate": 1.998956174827543e-05, + "loss": 0.5306, + "step": 1411 + }, + { + "epoch": 0.5742171614477429, + "grad_norm": 9.348940853892413, + "learning_rate": 1.998951541569162e-05, + "loss": 0.3349, + "step": 1412 + }, + { + "epoch": 0.5746238308255388, + "grad_norm": 12.835248831662287, + "learning_rate": 1.998946898056041e-05, + "loss": 0.6256, + "step": 1413 + }, + { + "epoch": 0.5750305002033347, + "grad_norm": 14.012648860328815, + "learning_rate": 1.9989422442882268e-05, + "loss": 0.2891, + "step": 1414 + }, + { + "epoch": 0.5754371695811306, + "grad_norm": 1.3053279822350334, + "learning_rate": 1.9989375802657678e-05, + "loss": 0.0167, + "step": 1415 + }, + { + "epoch": 0.5758438389589264, + "grad_norm": 12.565809714835087, + "learning_rate": 1.9989329059887118e-05, + "loss": 0.2987, + "step": 1416 + }, + { + "epoch": 0.5762505083367222, + "grad_norm": 13.713837520509145, + "learning_rate": 1.998928221457107e-05, + "loss": 0.4595, + "step": 1417 + }, + { + "epoch": 0.5766571777145181, + "grad_norm": 27.224960323244407, + "learning_rate": 1.9989235266710008e-05, + "loss": 1.0542, + "step": 1418 + }, + { + "epoch": 0.577063847092314, + "grad_norm": 23.313040074626386, + "learning_rate": 1.9989188216304424e-05, + "loss": 1.2372, + "step": 1419 + }, + { + "epoch": 0.5774705164701098, + "grad_norm": 5.87198585225963, + "learning_rate": 1.9989141063354792e-05, + "loss": 0.0979, + "step": 1420 + }, + { + "epoch": 0.5778771858479057, + "grad_norm": 31.807896037356755, + "learning_rate": 1.99890938078616e-05, + "loss": 0.4017, + "step": 1421 + }, + { + "epoch": 0.5782838552257015, + "grad_norm": 17.14537299304651, + "learning_rate": 1.9989046449825334e-05, + "loss": 0.83, + "step": 1422 + }, + { + "epoch": 0.5786905246034973, + "grad_norm": 19.92787477764801, + "learning_rate": 1.9988998989246475e-05, + "loss": 1.0631, + "step": 1423 + }, + { + "epoch": 0.5790971939812932, + "grad_norm": 2.1827470215028457, + "learning_rate": 1.998895142612552e-05, + "loss": 0.0306, + "step": 1424 + }, + { + "epoch": 0.5795038633590891, + "grad_norm": 15.87465044072287, + "learning_rate": 1.9988903760462947e-05, + "loss": 0.2043, + "step": 1425 + }, + { + "epoch": 0.5799105327368849, + "grad_norm": 21.642850081497095, + "learning_rate": 1.998885599225925e-05, + "loss": 0.7309, + "step": 1426 + }, + { + "epoch": 0.5803172021146807, + "grad_norm": 44.56543486080932, + "learning_rate": 1.998880812151492e-05, + "loss": 0.8714, + "step": 1427 + }, + { + "epoch": 0.5807238714924766, + "grad_norm": 26.21532277628755, + "learning_rate": 1.998876014823045e-05, + "loss": 2.3689, + "step": 1428 + }, + { + "epoch": 0.5811305408702725, + "grad_norm": 19.711969384131617, + "learning_rate": 1.9988712072406325e-05, + "loss": 0.479, + "step": 1429 + }, + { + "epoch": 0.5815372102480684, + "grad_norm": 1.252675892494965, + "learning_rate": 1.9988663894043045e-05, + "loss": 0.0144, + "step": 1430 + }, + { + "epoch": 0.5819438796258641, + "grad_norm": 2.743878891110626, + "learning_rate": 1.9988615613141104e-05, + "loss": 0.0312, + "step": 1431 + }, + { + "epoch": 0.58235054900366, + "grad_norm": 2.751435753864024, + "learning_rate": 1.9988567229700993e-05, + "loss": 0.033, + "step": 1432 + }, + { + "epoch": 0.5827572183814559, + "grad_norm": 12.848984904784288, + "learning_rate": 1.9988518743723218e-05, + "loss": 0.2085, + "step": 1433 + }, + { + "epoch": 0.5831638877592518, + 
"grad_norm": 11.435394300221553, + "learning_rate": 1.9988470155208265e-05, + "loss": 0.1737, + "step": 1434 + }, + { + "epoch": 0.5835705571370475, + "grad_norm": 17.468566586908345, + "learning_rate": 1.9988421464156644e-05, + "loss": 0.3454, + "step": 1435 + }, + { + "epoch": 0.5839772265148434, + "grad_norm": 36.21002726475622, + "learning_rate": 1.998837267056885e-05, + "loss": 1.5349, + "step": 1436 + }, + { + "epoch": 0.5843838958926393, + "grad_norm": 5.7384695499529625, + "learning_rate": 1.9988323774445377e-05, + "loss": 0.0718, + "step": 1437 + }, + { + "epoch": 0.5847905652704352, + "grad_norm": 12.402269771310234, + "learning_rate": 1.998827477578674e-05, + "loss": 0.3012, + "step": 1438 + }, + { + "epoch": 0.585197234648231, + "grad_norm": 6.585388004782576, + "learning_rate": 1.9988225674593432e-05, + "loss": 0.2204, + "step": 1439 + }, + { + "epoch": 0.5856039040260268, + "grad_norm": 13.200282279945364, + "learning_rate": 1.998817647086596e-05, + "loss": 0.2437, + "step": 1440 + }, + { + "epoch": 0.5860105734038227, + "grad_norm": 9.505310242892604, + "learning_rate": 1.998812716460483e-05, + "loss": 0.0971, + "step": 1441 + }, + { + "epoch": 0.5864172427816186, + "grad_norm": 12.810729836089262, + "learning_rate": 1.9988077755810548e-05, + "loss": 0.2155, + "step": 1442 + }, + { + "epoch": 0.5868239121594144, + "grad_norm": 4.264862092429055, + "learning_rate": 1.9988028244483625e-05, + "loss": 0.1884, + "step": 1443 + }, + { + "epoch": 0.5872305815372102, + "grad_norm": 42.709105607780636, + "learning_rate": 1.998797863062456e-05, + "loss": 1.3222, + "step": 1444 + }, + { + "epoch": 0.5876372509150061, + "grad_norm": 36.36735359588291, + "learning_rate": 1.998792891423387e-05, + "loss": 0.5203, + "step": 1445 + }, + { + "epoch": 0.588043920292802, + "grad_norm": 17.543490316732136, + "learning_rate": 1.9987879095312062e-05, + "loss": 0.3627, + "step": 1446 + }, + { + "epoch": 0.5884505896705978, + "grad_norm": 1.4072254576898866, + "learning_rate": 1.998782917385965e-05, + "loss": 0.0163, + "step": 1447 + }, + { + "epoch": 0.5888572590483937, + "grad_norm": 23.298668199607974, + "learning_rate": 1.998777914987714e-05, + "loss": 0.6143, + "step": 1448 + }, + { + "epoch": 0.5892639284261895, + "grad_norm": 17.59909179497094, + "learning_rate": 1.9987729023365056e-05, + "loss": 0.4295, + "step": 1449 + }, + { + "epoch": 0.5896705978039853, + "grad_norm": 7.376889185872021, + "learning_rate": 1.9987678794323905e-05, + "loss": 0.0877, + "step": 1450 + }, + { + "epoch": 0.5900772671817812, + "grad_norm": 6.629773081704932, + "learning_rate": 1.9987628462754202e-05, + "loss": 0.1134, + "step": 1451 + }, + { + "epoch": 0.5904839365595771, + "grad_norm": 8.834435175657687, + "learning_rate": 1.9987578028656468e-05, + "loss": 0.2251, + "step": 1452 + }, + { + "epoch": 0.590890605937373, + "grad_norm": 9.960265110920632, + "learning_rate": 1.998752749203122e-05, + "loss": 0.4861, + "step": 1453 + }, + { + "epoch": 0.5912972753151687, + "grad_norm": 8.425547273540255, + "learning_rate": 1.9987476852878975e-05, + "loss": 0.178, + "step": 1454 + }, + { + "epoch": 0.5917039446929646, + "grad_norm": 6.478966049050422, + "learning_rate": 1.998742611120025e-05, + "loss": 0.2133, + "step": 1455 + }, + { + "epoch": 0.5921106140707605, + "grad_norm": 15.898593877215434, + "learning_rate": 1.9987375266995572e-05, + "loss": 0.5613, + "step": 1456 + }, + { + "epoch": 0.5925172834485564, + "grad_norm": 30.947040619638585, + "learning_rate": 1.998732432026546e-05, + "loss": 1.9628, + "step": 1457 
+ }, + { + "epoch": 0.5929239528263521, + "grad_norm": 27.582188063811756, + "learning_rate": 1.998727327101044e-05, + "loss": 1.2638, + "step": 1458 + }, + { + "epoch": 0.593330622204148, + "grad_norm": 39.778118638094995, + "learning_rate": 1.9987222119231032e-05, + "loss": 0.7468, + "step": 1459 + }, + { + "epoch": 0.5937372915819439, + "grad_norm": 26.424168642269102, + "learning_rate": 1.9987170864927764e-05, + "loss": 0.2367, + "step": 1460 + }, + { + "epoch": 0.5941439609597398, + "grad_norm": 30.010021770400815, + "learning_rate": 1.998711950810116e-05, + "loss": 0.9454, + "step": 1461 + }, + { + "epoch": 0.5945506303375356, + "grad_norm": 5.910898491391715, + "learning_rate": 1.9987068048751743e-05, + "loss": 0.0941, + "step": 1462 + }, + { + "epoch": 0.5949572997153314, + "grad_norm": 5.760488233701412, + "learning_rate": 1.9987016486880048e-05, + "loss": 0.0749, + "step": 1463 + }, + { + "epoch": 0.5953639690931273, + "grad_norm": 5.707178354386884, + "learning_rate": 1.9986964822486605e-05, + "loss": 0.1234, + "step": 1464 + }, + { + "epoch": 0.5957706384709232, + "grad_norm": 19.256435589236876, + "learning_rate": 1.998691305557194e-05, + "loss": 1.3935, + "step": 1465 + }, + { + "epoch": 0.596177307848719, + "grad_norm": 24.052053357487424, + "learning_rate": 1.9986861186136588e-05, + "loss": 0.5927, + "step": 1466 + }, + { + "epoch": 0.5965839772265148, + "grad_norm": 9.737969166814022, + "learning_rate": 1.998680921418108e-05, + "loss": 0.4036, + "step": 1467 + }, + { + "epoch": 0.5969906466043107, + "grad_norm": 13.741385565943936, + "learning_rate": 1.9986757139705947e-05, + "loss": 0.2119, + "step": 1468 + }, + { + "epoch": 0.5973973159821065, + "grad_norm": 16.134072000717083, + "learning_rate": 1.9986704962711726e-05, + "loss": 0.7305, + "step": 1469 + }, + { + "epoch": 0.5978039853599024, + "grad_norm": 14.403960321600291, + "learning_rate": 1.9986652683198952e-05, + "loss": 0.5489, + "step": 1470 + }, + { + "epoch": 0.5982106547376983, + "grad_norm": 13.343065034518697, + "learning_rate": 1.9986600301168164e-05, + "loss": 0.4176, + "step": 1471 + }, + { + "epoch": 0.5986173241154941, + "grad_norm": 14.41017070621149, + "learning_rate": 1.99865478166199e-05, + "loss": 0.501, + "step": 1472 + }, + { + "epoch": 0.5990239934932899, + "grad_norm": 7.445106360040941, + "learning_rate": 1.998649522955469e-05, + "loss": 0.2521, + "step": 1473 + }, + { + "epoch": 0.5994306628710858, + "grad_norm": 11.427797241703027, + "learning_rate": 1.9986442539973082e-05, + "loss": 0.4021, + "step": 1474 + }, + { + "epoch": 0.5998373322488817, + "grad_norm": 16.332140250835618, + "learning_rate": 1.9986389747875617e-05, + "loss": 0.446, + "step": 1475 + }, + { + "epoch": 0.6002440016266775, + "grad_norm": 14.885795838278042, + "learning_rate": 1.9986336853262834e-05, + "loss": 1.1481, + "step": 1476 + }, + { + "epoch": 0.6006506710044733, + "grad_norm": 5.373769530851831, + "learning_rate": 1.998628385613528e-05, + "loss": 0.1277, + "step": 1477 + }, + { + "epoch": 0.6010573403822692, + "grad_norm": 23.379893737149242, + "learning_rate": 1.9986230756493494e-05, + "loss": 0.6496, + "step": 1478 + }, + { + "epoch": 0.6014640097600651, + "grad_norm": 11.794110842177492, + "learning_rate": 1.998617755433802e-05, + "loss": 0.2789, + "step": 1479 + }, + { + "epoch": 0.601870679137861, + "grad_norm": 25.5325207145994, + "learning_rate": 1.9986124249669413e-05, + "loss": 1.6757, + "step": 1480 + }, + { + "epoch": 0.6022773485156567, + "grad_norm": 17.55838463730879, + "learning_rate": 
1.9986070842488215e-05, + "loss": 0.5555, + "step": 1481 + }, + { + "epoch": 0.6026840178934526, + "grad_norm": 10.670581114502951, + "learning_rate": 1.998601733279497e-05, + "loss": 0.2257, + "step": 1482 + }, + { + "epoch": 0.6030906872712485, + "grad_norm": 18.354064391464085, + "learning_rate": 1.9985963720590234e-05, + "loss": 0.7012, + "step": 1483 + }, + { + "epoch": 0.6034973566490444, + "grad_norm": 21.695595874470804, + "learning_rate": 1.9985910005874555e-05, + "loss": 0.5774, + "step": 1484 + }, + { + "epoch": 0.6039040260268401, + "grad_norm": 10.187341528140637, + "learning_rate": 1.9985856188648482e-05, + "loss": 0.185, + "step": 1485 + }, + { + "epoch": 0.604310695404636, + "grad_norm": 23.767670743531237, + "learning_rate": 1.998580226891257e-05, + "loss": 0.5974, + "step": 1486 + }, + { + "epoch": 0.6047173647824319, + "grad_norm": 11.224668034689945, + "learning_rate": 1.998574824666737e-05, + "loss": 0.2738, + "step": 1487 + }, + { + "epoch": 0.6051240341602278, + "grad_norm": 23.716651430385365, + "learning_rate": 1.9985694121913443e-05, + "loss": 0.6314, + "step": 1488 + }, + { + "epoch": 0.6055307035380236, + "grad_norm": 7.801998810776895, + "learning_rate": 1.9985639894651338e-05, + "loss": 0.1192, + "step": 1489 + }, + { + "epoch": 0.6059373729158194, + "grad_norm": 10.44314602118836, + "learning_rate": 1.998558556488161e-05, + "loss": 0.2, + "step": 1490 + }, + { + "epoch": 0.6063440422936153, + "grad_norm": 2.869322899606938, + "learning_rate": 1.9985531132604826e-05, + "loss": 0.0886, + "step": 1491 + }, + { + "epoch": 0.6067507116714111, + "grad_norm": 21.832742881302174, + "learning_rate": 1.9985476597821537e-05, + "loss": 1.067, + "step": 1492 + }, + { + "epoch": 0.607157381049207, + "grad_norm": 18.665094225339974, + "learning_rate": 1.9985421960532304e-05, + "loss": 0.2122, + "step": 1493 + }, + { + "epoch": 0.6075640504270029, + "grad_norm": 14.1902449259966, + "learning_rate": 1.998536722073769e-05, + "loss": 0.6064, + "step": 1494 + }, + { + "epoch": 0.6079707198047987, + "grad_norm": 28.211954681093644, + "learning_rate": 1.9985312378438257e-05, + "loss": 0.708, + "step": 1495 + }, + { + "epoch": 0.6083773891825945, + "grad_norm": 6.759793680388254, + "learning_rate": 1.9985257433634566e-05, + "loss": 0.0607, + "step": 1496 + }, + { + "epoch": 0.6087840585603904, + "grad_norm": 15.915365230462033, + "learning_rate": 1.9985202386327182e-05, + "loss": 0.8917, + "step": 1497 + }, + { + "epoch": 0.6091907279381863, + "grad_norm": 19.577875652227625, + "learning_rate": 1.9985147236516672e-05, + "loss": 1.1845, + "step": 1498 + }, + { + "epoch": 0.6095973973159821, + "grad_norm": 11.18330364083354, + "learning_rate": 1.99850919842036e-05, + "loss": 0.1909, + "step": 1499 + }, + { + "epoch": 0.6100040666937779, + "grad_norm": 2.042065378347536, + "learning_rate": 1.998503662938853e-05, + "loss": 0.0181, + "step": 1500 + }, + { + "epoch": 0.6104107360715738, + "grad_norm": 6.863694939004835, + "learning_rate": 1.9984981172072037e-05, + "loss": 0.1504, + "step": 1501 + }, + { + "epoch": 0.6108174054493697, + "grad_norm": 18.713306970302526, + "learning_rate": 1.9984925612254683e-05, + "loss": 0.7848, + "step": 1502 + }, + { + "epoch": 0.6112240748271656, + "grad_norm": 25.193331241037452, + "learning_rate": 1.9984869949937046e-05, + "loss": 0.8546, + "step": 1503 + }, + { + "epoch": 0.6116307442049613, + "grad_norm": 15.50261122080585, + "learning_rate": 1.998481418511969e-05, + "loss": 0.6359, + "step": 1504 + }, + { + "epoch": 0.6120374135827572, + 
"grad_norm": 15.70903912349382, + "learning_rate": 1.9984758317803195e-05, + "loss": 0.6206, + "step": 1505 + }, + { + "epoch": 0.6124440829605531, + "grad_norm": 15.751562494210408, + "learning_rate": 1.998470234798813e-05, + "loss": 0.5415, + "step": 1506 + }, + { + "epoch": 0.612850752338349, + "grad_norm": 13.12487650834291, + "learning_rate": 1.998464627567507e-05, + "loss": 0.471, + "step": 1507 + }, + { + "epoch": 0.6132574217161447, + "grad_norm": 17.595903554463565, + "learning_rate": 1.998459010086459e-05, + "loss": 0.6159, + "step": 1508 + }, + { + "epoch": 0.6136640910939406, + "grad_norm": 1.2806614875158524, + "learning_rate": 1.9984533823557268e-05, + "loss": 0.017, + "step": 1509 + }, + { + "epoch": 0.6140707604717365, + "grad_norm": 13.42528631082173, + "learning_rate": 1.998447744375368e-05, + "loss": 0.6853, + "step": 1510 + }, + { + "epoch": 0.6144774298495324, + "grad_norm": 15.98764357403685, + "learning_rate": 1.9984420961454406e-05, + "loss": 0.5073, + "step": 1511 + }, + { + "epoch": 0.6148840992273282, + "grad_norm": 14.539510493618703, + "learning_rate": 1.9984364376660027e-05, + "loss": 0.3642, + "step": 1512 + }, + { + "epoch": 0.615290768605124, + "grad_norm": 29.810856720381906, + "learning_rate": 1.9984307689371124e-05, + "loss": 1.3151, + "step": 1513 + }, + { + "epoch": 0.6156974379829199, + "grad_norm": 17.053941001535136, + "learning_rate": 1.9984250899588273e-05, + "loss": 0.5842, + "step": 1514 + }, + { + "epoch": 0.6161041073607157, + "grad_norm": 11.21326028188861, + "learning_rate": 1.9984194007312064e-05, + "loss": 0.4985, + "step": 1515 + }, + { + "epoch": 0.6165107767385116, + "grad_norm": 13.039777514517201, + "learning_rate": 1.998413701254308e-05, + "loss": 0.2365, + "step": 1516 + }, + { + "epoch": 0.6169174461163074, + "grad_norm": 35.88369251196187, + "learning_rate": 1.9984079915281905e-05, + "loss": 0.8099, + "step": 1517 + }, + { + "epoch": 0.6173241154941033, + "grad_norm": 13.238985661498424, + "learning_rate": 1.9984022715529124e-05, + "loss": 0.6924, + "step": 1518 + }, + { + "epoch": 0.6177307848718991, + "grad_norm": 3.3751724617712515, + "learning_rate": 1.9983965413285323e-05, + "loss": 0.0469, + "step": 1519 + }, + { + "epoch": 0.618137454249695, + "grad_norm": 10.091735136429394, + "learning_rate": 1.9983908008551094e-05, + "loss": 0.2711, + "step": 1520 + }, + { + "epoch": 0.6185441236274909, + "grad_norm": 9.785567027309886, + "learning_rate": 1.9983850501327024e-05, + "loss": 0.2444, + "step": 1521 + }, + { + "epoch": 0.6189507930052867, + "grad_norm": 4.96710873489923, + "learning_rate": 1.9983792891613706e-05, + "loss": 0.1218, + "step": 1522 + }, + { + "epoch": 0.6193574623830825, + "grad_norm": 12.854493869525486, + "learning_rate": 1.9983735179411725e-05, + "loss": 0.3293, + "step": 1523 + }, + { + "epoch": 0.6197641317608784, + "grad_norm": 17.11684849518505, + "learning_rate": 1.9983677364721677e-05, + "loss": 0.6152, + "step": 1524 + }, + { + "epoch": 0.6201708011386743, + "grad_norm": 19.38036176537081, + "learning_rate": 1.9983619447544164e-05, + "loss": 1.3703, + "step": 1525 + }, + { + "epoch": 0.6205774705164702, + "grad_norm": 11.547511477553604, + "learning_rate": 1.9983561427879763e-05, + "loss": 0.1867, + "step": 1526 + }, + { + "epoch": 0.6209841398942659, + "grad_norm": 10.383864487254163, + "learning_rate": 1.9983503305729086e-05, + "loss": 0.2357, + "step": 1527 + }, + { + "epoch": 0.6213908092720618, + "grad_norm": 10.55028969861805, + "learning_rate": 1.998344508109272e-05, + "loss": 0.2782, + "step": 
1528 + }, + { + "epoch": 0.6217974786498577, + "grad_norm": 9.913090547214466, + "learning_rate": 1.9983386753971268e-05, + "loss": 0.1736, + "step": 1529 + }, + { + "epoch": 0.6222041480276536, + "grad_norm": 9.190566479262081, + "learning_rate": 1.9983328324365327e-05, + "loss": 0.2527, + "step": 1530 + }, + { + "epoch": 0.6226108174054493, + "grad_norm": 19.880608421875703, + "learning_rate": 1.9983269792275494e-05, + "loss": 0.3568, + "step": 1531 + }, + { + "epoch": 0.6230174867832452, + "grad_norm": 1.1437289664407166, + "learning_rate": 1.9983211157702368e-05, + "loss": 0.0192, + "step": 1532 + }, + { + "epoch": 0.6234241561610411, + "grad_norm": 29.388124252828074, + "learning_rate": 1.9983152420646564e-05, + "loss": 0.3114, + "step": 1533 + }, + { + "epoch": 0.623830825538837, + "grad_norm": 11.061350011166965, + "learning_rate": 1.9983093581108667e-05, + "loss": 0.3042, + "step": 1534 + }, + { + "epoch": 0.6242374949166328, + "grad_norm": 12.268430011124575, + "learning_rate": 1.9983034639089298e-05, + "loss": 0.5374, + "step": 1535 + }, + { + "epoch": 0.6246441642944286, + "grad_norm": 24.21938598090029, + "learning_rate": 1.9982975594589048e-05, + "loss": 1.0598, + "step": 1536 + }, + { + "epoch": 0.6250508336722245, + "grad_norm": 17.036281403986624, + "learning_rate": 1.9982916447608532e-05, + "loss": 0.561, + "step": 1537 + }, + { + "epoch": 0.6254575030500203, + "grad_norm": 22.10338184154069, + "learning_rate": 1.9982857198148356e-05, + "loss": 0.3229, + "step": 1538 + }, + { + "epoch": 0.6258641724278162, + "grad_norm": 25.089424819929135, + "learning_rate": 1.9982797846209126e-05, + "loss": 0.8857, + "step": 1539 + }, + { + "epoch": 0.626270841805612, + "grad_norm": 11.55372003681978, + "learning_rate": 1.998273839179145e-05, + "loss": 0.3736, + "step": 1540 + }, + { + "epoch": 0.6266775111834079, + "grad_norm": 19.06173624478556, + "learning_rate": 1.9982678834895938e-05, + "loss": 0.5208, + "step": 1541 + }, + { + "epoch": 0.6270841805612037, + "grad_norm": 12.098597505099027, + "learning_rate": 1.9982619175523207e-05, + "loss": 0.1988, + "step": 1542 + }, + { + "epoch": 0.6274908499389996, + "grad_norm": 18.668286621870077, + "learning_rate": 1.9982559413673868e-05, + "loss": 0.3877, + "step": 1543 + }, + { + "epoch": 0.6278975193167955, + "grad_norm": 3.3853163107492086, + "learning_rate": 1.9982499549348528e-05, + "loss": 0.0511, + "step": 1544 + }, + { + "epoch": 0.6283041886945913, + "grad_norm": 10.610208220200459, + "learning_rate": 1.9982439582547808e-05, + "loss": 0.3285, + "step": 1545 + }, + { + "epoch": 0.6287108580723871, + "grad_norm": 10.101860008446346, + "learning_rate": 1.9982379513272323e-05, + "loss": 0.2314, + "step": 1546 + }, + { + "epoch": 0.629117527450183, + "grad_norm": 15.088739750803988, + "learning_rate": 1.9982319341522687e-05, + "loss": 0.4943, + "step": 1547 + }, + { + "epoch": 0.6295241968279789, + "grad_norm": 70.06473598434293, + "learning_rate": 1.998225906729952e-05, + "loss": 0.9918, + "step": 1548 + }, + { + "epoch": 0.6299308662057747, + "grad_norm": 5.024013513518786, + "learning_rate": 1.9982198690603436e-05, + "loss": 0.0942, + "step": 1549 + }, + { + "epoch": 0.6303375355835705, + "grad_norm": 24.05207238349208, + "learning_rate": 1.998213821143506e-05, + "loss": 0.3678, + "step": 1550 + }, + { + "epoch": 0.6307442049613664, + "grad_norm": 12.001297677551015, + "learning_rate": 1.9982077629795014e-05, + "loss": 0.395, + "step": 1551 + }, + { + "epoch": 0.6311508743391623, + "grad_norm": 30.59110798498798, + "learning_rate": 
1.9982016945683913e-05, + "loss": 1.2926, + "step": 1552 + }, + { + "epoch": 0.6315575437169582, + "grad_norm": 7.69088402167479, + "learning_rate": 1.998195615910239e-05, + "loss": 0.1907, + "step": 1553 + }, + { + "epoch": 0.6319642130947539, + "grad_norm": 13.326714727637814, + "learning_rate": 1.998189527005106e-05, + "loss": 0.7374, + "step": 1554 + }, + { + "epoch": 0.6323708824725498, + "grad_norm": 29.785492773687654, + "learning_rate": 1.998183427853055e-05, + "loss": 0.8624, + "step": 1555 + }, + { + "epoch": 0.6327775518503457, + "grad_norm": 18.10344877394292, + "learning_rate": 1.9981773184541488e-05, + "loss": 0.869, + "step": 1556 + }, + { + "epoch": 0.6331842212281416, + "grad_norm": 3.6013084521448944, + "learning_rate": 1.99817119880845e-05, + "loss": 0.0539, + "step": 1557 + }, + { + "epoch": 0.6335908906059373, + "grad_norm": 12.329687397361571, + "learning_rate": 1.9981650689160218e-05, + "loss": 0.1519, + "step": 1558 + }, + { + "epoch": 0.6339975599837332, + "grad_norm": 11.261058963051623, + "learning_rate": 1.9981589287769264e-05, + "loss": 0.2779, + "step": 1559 + }, + { + "epoch": 0.6344042293615291, + "grad_norm": 16.004616186006736, + "learning_rate": 1.9981527783912275e-05, + "loss": 0.3675, + "step": 1560 + }, + { + "epoch": 0.634810898739325, + "grad_norm": 29.051913528258446, + "learning_rate": 1.998146617758988e-05, + "loss": 1.4077, + "step": 1561 + }, + { + "epoch": 0.6352175681171208, + "grad_norm": 11.106197302419035, + "learning_rate": 1.998140446880271e-05, + "loss": 0.1456, + "step": 1562 + }, + { + "epoch": 0.6356242374949166, + "grad_norm": 13.575344916083894, + "learning_rate": 1.99813426575514e-05, + "loss": 0.5517, + "step": 1563 + }, + { + "epoch": 0.6360309068727125, + "grad_norm": 16.307060130115612, + "learning_rate": 1.9981280743836585e-05, + "loss": 0.6022, + "step": 1564 + }, + { + "epoch": 0.6364375762505083, + "grad_norm": 14.053918333037858, + "learning_rate": 1.9981218727658897e-05, + "loss": 0.4881, + "step": 1565 + }, + { + "epoch": 0.6368442456283042, + "grad_norm": 8.92953616137724, + "learning_rate": 1.9981156609018977e-05, + "loss": 0.2303, + "step": 1566 + }, + { + "epoch": 0.6372509150061001, + "grad_norm": 19.778649080641724, + "learning_rate": 1.9981094387917462e-05, + "loss": 0.7682, + "step": 1567 + }, + { + "epoch": 0.6376575843838959, + "grad_norm": 16.337222260520978, + "learning_rate": 1.9981032064354986e-05, + "loss": 0.8562, + "step": 1568 + }, + { + "epoch": 0.6380642537616917, + "grad_norm": 8.908130120589547, + "learning_rate": 1.9980969638332194e-05, + "loss": 0.1237, + "step": 1569 + }, + { + "epoch": 0.6384709231394876, + "grad_norm": 10.569615344851186, + "learning_rate": 1.9980907109849727e-05, + "loss": 0.4696, + "step": 1570 + }, + { + "epoch": 0.6388775925172835, + "grad_norm": 9.696306978634752, + "learning_rate": 1.9980844478908224e-05, + "loss": 0.4173, + "step": 1571 + }, + { + "epoch": 0.6392842618950793, + "grad_norm": 24.375215421008978, + "learning_rate": 1.9980781745508332e-05, + "loss": 0.4058, + "step": 1572 + }, + { + "epoch": 0.6396909312728751, + "grad_norm": 20.790791380332774, + "learning_rate": 1.998071890965069e-05, + "loss": 0.3467, + "step": 1573 + }, + { + "epoch": 0.640097600650671, + "grad_norm": 8.458838171882867, + "learning_rate": 1.9980655971335944e-05, + "loss": 0.2627, + "step": 1574 + }, + { + "epoch": 0.6405042700284669, + "grad_norm": 16.02429354537084, + "learning_rate": 1.9980592930564742e-05, + "loss": 0.5444, + "step": 1575 + }, + { + "epoch": 0.6409109394062628, + 
"grad_norm": 9.101713615143261, + "learning_rate": 1.9980529787337733e-05, + "loss": 0.4237, + "step": 1576 + }, + { + "epoch": 0.6413176087840585, + "grad_norm": 18.71342498990112, + "learning_rate": 1.998046654165556e-05, + "loss": 1.1529, + "step": 1577 + }, + { + "epoch": 0.6417242781618544, + "grad_norm": 6.575165626186062, + "learning_rate": 1.9980403193518877e-05, + "loss": 0.0933, + "step": 1578 + }, + { + "epoch": 0.6421309475396503, + "grad_norm": 19.73980691243297, + "learning_rate": 1.9980339742928328e-05, + "loss": 1.0626, + "step": 1579 + }, + { + "epoch": 0.6425376169174462, + "grad_norm": 12.603260190802876, + "learning_rate": 1.998027618988457e-05, + "loss": 0.3874, + "step": 1580 + }, + { + "epoch": 0.6429442862952419, + "grad_norm": 17.905955164185514, + "learning_rate": 1.9980212534388257e-05, + "loss": 0.6601, + "step": 1581 + }, + { + "epoch": 0.6433509556730378, + "grad_norm": 22.617122177003335, + "learning_rate": 1.998014877644004e-05, + "loss": 0.5505, + "step": 1582 + }, + { + "epoch": 0.6437576250508337, + "grad_norm": 6.931943403231193, + "learning_rate": 1.9980084916040567e-05, + "loss": 0.1557, + "step": 1583 + }, + { + "epoch": 0.6441642944286295, + "grad_norm": 14.81510172438125, + "learning_rate": 1.9980020953190506e-05, + "loss": 0.4615, + "step": 1584 + }, + { + "epoch": 0.6445709638064254, + "grad_norm": 12.234759722616705, + "learning_rate": 1.9979956887890504e-05, + "loss": 0.3519, + "step": 1585 + }, + { + "epoch": 0.6449776331842212, + "grad_norm": 10.500683375348672, + "learning_rate": 1.997989272014122e-05, + "loss": 0.2926, + "step": 1586 + }, + { + "epoch": 0.6453843025620171, + "grad_norm": 17.820765451967514, + "learning_rate": 1.9979828449943313e-05, + "loss": 0.4593, + "step": 1587 + }, + { + "epoch": 0.6457909719398129, + "grad_norm": 19.078198945252083, + "learning_rate": 1.997976407729745e-05, + "loss": 0.6646, + "step": 1588 + }, + { + "epoch": 0.6461976413176088, + "grad_norm": 16.447342400883535, + "learning_rate": 1.9979699602204283e-05, + "loss": 0.9383, + "step": 1589 + }, + { + "epoch": 0.6466043106954046, + "grad_norm": 26.218114659218163, + "learning_rate": 1.9979635024664478e-05, + "loss": 1.2009, + "step": 1590 + }, + { + "epoch": 0.6470109800732005, + "grad_norm": 17.3104079700515, + "learning_rate": 1.9979570344678695e-05, + "loss": 0.2495, + "step": 1591 + }, + { + "epoch": 0.6474176494509963, + "grad_norm": 9.381000708920727, + "learning_rate": 1.99795055622476e-05, + "loss": 0.1678, + "step": 1592 + }, + { + "epoch": 0.6478243188287922, + "grad_norm": 13.856206204814466, + "learning_rate": 1.997944067737186e-05, + "loss": 0.3729, + "step": 1593 + }, + { + "epoch": 0.6482309882065881, + "grad_norm": 12.209042916442092, + "learning_rate": 1.997937569005214e-05, + "loss": 0.3501, + "step": 1594 + }, + { + "epoch": 0.6486376575843839, + "grad_norm": 19.64505776825982, + "learning_rate": 1.9979310600289103e-05, + "loss": 1.0994, + "step": 1595 + }, + { + "epoch": 0.6490443269621797, + "grad_norm": 17.139227732232854, + "learning_rate": 1.997924540808342e-05, + "loss": 0.7721, + "step": 1596 + }, + { + "epoch": 0.6494509963399756, + "grad_norm": 32.05955321352078, + "learning_rate": 1.9979180113435762e-05, + "loss": 0.5765, + "step": 1597 + }, + { + "epoch": 0.6498576657177715, + "grad_norm": 6.284475605098825, + "learning_rate": 1.9979114716346797e-05, + "loss": 0.0847, + "step": 1598 + }, + { + "epoch": 0.6502643350955674, + "grad_norm": 7.403374969419418, + "learning_rate": 1.9979049216817196e-05, + "loss": 0.1721, + "step": 
1599 + }, + { + "epoch": 0.6506710044733631, + "grad_norm": 18.51142954555167, + "learning_rate": 1.9978983614847633e-05, + "loss": 0.5593, + "step": 1600 + }, + { + "epoch": 0.651077673851159, + "grad_norm": 19.233741624176464, + "learning_rate": 1.9978917910438784e-05, + "loss": 1.0, + "step": 1601 + }, + { + "epoch": 0.6514843432289549, + "grad_norm": 13.409497969105104, + "learning_rate": 1.9978852103591318e-05, + "loss": 0.8612, + "step": 1602 + }, + { + "epoch": 0.6518910126067508, + "grad_norm": 4.663761958556169, + "learning_rate": 1.997878619430591e-05, + "loss": 0.073, + "step": 1603 + }, + { + "epoch": 0.6522976819845465, + "grad_norm": 21.780900062851828, + "learning_rate": 1.9978720182583244e-05, + "loss": 0.4797, + "step": 1604 + }, + { + "epoch": 0.6527043513623424, + "grad_norm": 9.518024672278436, + "learning_rate": 1.9978654068423992e-05, + "loss": 0.2892, + "step": 1605 + }, + { + "epoch": 0.6531110207401383, + "grad_norm": 10.111011284195278, + "learning_rate": 1.997858785182883e-05, + "loss": 0.349, + "step": 1606 + }, + { + "epoch": 0.6535176901179341, + "grad_norm": 6.4062955483992186, + "learning_rate": 1.9978521532798445e-05, + "loss": 0.2135, + "step": 1607 + }, + { + "epoch": 0.65392435949573, + "grad_norm": 20.170553231592425, + "learning_rate": 1.9978455111333518e-05, + "loss": 0.6107, + "step": 1608 + }, + { + "epoch": 0.6543310288735258, + "grad_norm": 7.998980062658156, + "learning_rate": 1.9978388587434722e-05, + "loss": 0.2664, + "step": 1609 + }, + { + "epoch": 0.6547376982513217, + "grad_norm": 16.819895680357387, + "learning_rate": 1.9978321961102744e-05, + "loss": 0.1965, + "step": 1610 + }, + { + "epoch": 0.6551443676291175, + "grad_norm": 25.388324855202722, + "learning_rate": 1.9978255232338273e-05, + "loss": 0.602, + "step": 1611 + }, + { + "epoch": 0.6555510370069134, + "grad_norm": 5.442834379474959, + "learning_rate": 1.9978188401141988e-05, + "loss": 0.09, + "step": 1612 + }, + { + "epoch": 0.6559577063847092, + "grad_norm": 25.285366127387185, + "learning_rate": 1.997812146751458e-05, + "loss": 0.6532, + "step": 1613 + }, + { + "epoch": 0.6563643757625051, + "grad_norm": 14.4658859454305, + "learning_rate": 1.997805443145673e-05, + "loss": 0.1959, + "step": 1614 + }, + { + "epoch": 0.6567710451403009, + "grad_norm": 1.5621752139463025, + "learning_rate": 1.997798729296913e-05, + "loss": 0.0168, + "step": 1615 + }, + { + "epoch": 0.6571777145180968, + "grad_norm": 11.120768464837901, + "learning_rate": 1.9977920052052467e-05, + "loss": 0.1866, + "step": 1616 + }, + { + "epoch": 0.6575843838958927, + "grad_norm": 18.848256876966364, + "learning_rate": 1.9977852708707438e-05, + "loss": 0.9479, + "step": 1617 + }, + { + "epoch": 0.6579910532736885, + "grad_norm": 12.485723713642106, + "learning_rate": 1.9977785262934723e-05, + "loss": 0.572, + "step": 1618 + }, + { + "epoch": 0.6583977226514843, + "grad_norm": 9.238446792929278, + "learning_rate": 1.9977717714735027e-05, + "loss": 0.346, + "step": 1619 + }, + { + "epoch": 0.6588043920292802, + "grad_norm": 13.606530901471919, + "learning_rate": 1.9977650064109034e-05, + "loss": 0.4526, + "step": 1620 + }, + { + "epoch": 0.6592110614070761, + "grad_norm": 20.231855610636995, + "learning_rate": 1.9977582311057438e-05, + "loss": 1.2389, + "step": 1621 + }, + { + "epoch": 0.6596177307848718, + "grad_norm": 10.205539579641625, + "learning_rate": 1.997751445558094e-05, + "loss": 0.2509, + "step": 1622 + }, + { + "epoch": 0.6600244001626677, + "grad_norm": 5.602042016882248, + "learning_rate": 
1.997744649768024e-05, + "loss": 0.1654, + "step": 1623 + }, + { + "epoch": 0.6604310695404636, + "grad_norm": 6.522625454204162, + "learning_rate": 1.9977378437356022e-05, + "loss": 0.0824, + "step": 1624 + }, + { + "epoch": 0.6608377389182595, + "grad_norm": 27.358039488336335, + "learning_rate": 1.9977310274609e-05, + "loss": 1.2749, + "step": 1625 + }, + { + "epoch": 0.6612444082960554, + "grad_norm": 8.01930940622455, + "learning_rate": 1.9977242009439864e-05, + "loss": 0.2967, + "step": 1626 + }, + { + "epoch": 0.6616510776738511, + "grad_norm": 7.653839225814217, + "learning_rate": 1.9977173641849314e-05, + "loss": 0.3729, + "step": 1627 + }, + { + "epoch": 0.662057747051647, + "grad_norm": 19.658871942807636, + "learning_rate": 1.9977105171838058e-05, + "loss": 0.2589, + "step": 1628 + }, + { + "epoch": 0.6624644164294429, + "grad_norm": 8.959605030818055, + "learning_rate": 1.9977036599406796e-05, + "loss": 0.1258, + "step": 1629 + }, + { + "epoch": 0.6628710858072387, + "grad_norm": 14.153249730192316, + "learning_rate": 1.997696792455623e-05, + "loss": 0.3592, + "step": 1630 + }, + { + "epoch": 0.6632777551850345, + "grad_norm": 8.166324767926309, + "learning_rate": 1.997689914728707e-05, + "loss": 0.0537, + "step": 1631 + }, + { + "epoch": 0.6636844245628304, + "grad_norm": 5.580972778373391, + "learning_rate": 1.997683026760002e-05, + "loss": 0.2051, + "step": 1632 + }, + { + "epoch": 0.6640910939406263, + "grad_norm": 17.489721087065096, + "learning_rate": 1.997676128549578e-05, + "loss": 0.7623, + "step": 1633 + }, + { + "epoch": 0.6644977633184221, + "grad_norm": 19.337593916256225, + "learning_rate": 1.997669220097507e-05, + "loss": 0.4812, + "step": 1634 + }, + { + "epoch": 0.664904432696218, + "grad_norm": 10.495441654397254, + "learning_rate": 1.997662301403859e-05, + "loss": 0.3534, + "step": 1635 + }, + { + "epoch": 0.6653111020740138, + "grad_norm": 16.143974827173018, + "learning_rate": 1.9976553724687057e-05, + "loss": 0.1994, + "step": 1636 + }, + { + "epoch": 0.6657177714518097, + "grad_norm": 17.509727312781166, + "learning_rate": 1.9976484332921174e-05, + "loss": 0.5398, + "step": 1637 + }, + { + "epoch": 0.6661244408296055, + "grad_norm": 0.798975217804306, + "learning_rate": 1.9976414838741663e-05, + "loss": 0.0074, + "step": 1638 + }, + { + "epoch": 0.6665311102074014, + "grad_norm": 4.636880461636465, + "learning_rate": 1.997634524214923e-05, + "loss": 0.0849, + "step": 1639 + }, + { + "epoch": 0.6669377795851973, + "grad_norm": 10.411217014239536, + "learning_rate": 1.9976275543144596e-05, + "loss": 0.3059, + "step": 1640 + }, + { + "epoch": 0.667344448962993, + "grad_norm": 14.22054553922055, + "learning_rate": 1.9976205741728467e-05, + "loss": 0.5357, + "step": 1641 + }, + { + "epoch": 0.6677511183407889, + "grad_norm": 17.955112005770495, + "learning_rate": 1.9976135837901565e-05, + "loss": 0.4754, + "step": 1642 + }, + { + "epoch": 0.6681577877185848, + "grad_norm": 12.277772943161473, + "learning_rate": 1.9976065831664613e-05, + "loss": 0.2874, + "step": 1643 + }, + { + "epoch": 0.6685644570963807, + "grad_norm": 14.594391766784462, + "learning_rate": 1.997599572301832e-05, + "loss": 0.7227, + "step": 1644 + }, + { + "epoch": 0.6689711264741764, + "grad_norm": 23.530418994625137, + "learning_rate": 1.997592551196341e-05, + "loss": 0.7618, + "step": 1645 + }, + { + "epoch": 0.6693777958519723, + "grad_norm": 16.77732523682875, + "learning_rate": 1.997585519850061e-05, + "loss": 0.6596, + "step": 1646 + }, + { + "epoch": 0.6697844652297682, + 
"grad_norm": 10.906210163690215, + "learning_rate": 1.997578478263063e-05, + "loss": 0.2546, + "step": 1647 + }, + { + "epoch": 0.6701911346075641, + "grad_norm": 16.575076770802465, + "learning_rate": 1.9975714264354196e-05, + "loss": 0.6133, + "step": 1648 + }, + { + "epoch": 0.67059780398536, + "grad_norm": 9.288484488018586, + "learning_rate": 1.9975643643672043e-05, + "loss": 0.1468, + "step": 1649 + }, + { + "epoch": 0.6710044733631557, + "grad_norm": 8.047997098353685, + "learning_rate": 1.9975572920584883e-05, + "loss": 0.1214, + "step": 1650 + }, + { + "epoch": 0.6714111427409516, + "grad_norm": 15.78562250194495, + "learning_rate": 1.9975502095093446e-05, + "loss": 0.5889, + "step": 1651 + }, + { + "epoch": 0.6718178121187475, + "grad_norm": 2.194736341234762, + "learning_rate": 1.997543116719846e-05, + "loss": 0.0269, + "step": 1652 + }, + { + "epoch": 0.6722244814965433, + "grad_norm": 14.70759299371899, + "learning_rate": 1.997536013690065e-05, + "loss": 0.4358, + "step": 1653 + }, + { + "epoch": 0.6726311508743391, + "grad_norm": 24.940300285893592, + "learning_rate": 1.997528900420075e-05, + "loss": 0.4116, + "step": 1654 + }, + { + "epoch": 0.673037820252135, + "grad_norm": 28.472944544979747, + "learning_rate": 1.9975217769099487e-05, + "loss": 0.6684, + "step": 1655 + }, + { + "epoch": 0.6734444896299309, + "grad_norm": 7.167187410543654, + "learning_rate": 1.9975146431597592e-05, + "loss": 0.2015, + "step": 1656 + }, + { + "epoch": 0.6738511590077267, + "grad_norm": 21.356342422877326, + "learning_rate": 1.9975074991695804e-05, + "loss": 0.7041, + "step": 1657 + }, + { + "epoch": 0.6742578283855226, + "grad_norm": 10.882107425155786, + "learning_rate": 1.9975003449394844e-05, + "loss": 0.1296, + "step": 1658 + }, + { + "epoch": 0.6746644977633184, + "grad_norm": 61.94204086399131, + "learning_rate": 1.9974931804695458e-05, + "loss": 1.882, + "step": 1659 + }, + { + "epoch": 0.6750711671411143, + "grad_norm": 13.038707257834826, + "learning_rate": 1.9974860057598374e-05, + "loss": 0.2527, + "step": 1660 + }, + { + "epoch": 0.6754778365189101, + "grad_norm": 11.092583037336796, + "learning_rate": 1.997478820810433e-05, + "loss": 0.301, + "step": 1661 + }, + { + "epoch": 0.675884505896706, + "grad_norm": 1.80093177019115, + "learning_rate": 1.9974716256214067e-05, + "loss": 0.028, + "step": 1662 + }, + { + "epoch": 0.6762911752745018, + "grad_norm": 4.8213841183791715, + "learning_rate": 1.9974644201928325e-05, + "loss": 0.0431, + "step": 1663 + }, + { + "epoch": 0.6766978446522977, + "grad_norm": 6.7498824611351615, + "learning_rate": 1.9974572045247836e-05, + "loss": 0.0595, + "step": 1664 + }, + { + "epoch": 0.6771045140300935, + "grad_norm": 12.876802922227268, + "learning_rate": 1.9974499786173346e-05, + "loss": 0.2889, + "step": 1665 + }, + { + "epoch": 0.6775111834078894, + "grad_norm": 19.878019161861648, + "learning_rate": 1.9974427424705593e-05, + "loss": 0.4367, + "step": 1666 + }, + { + "epoch": 0.6779178527856853, + "grad_norm": 10.79988836291307, + "learning_rate": 1.9974354960845326e-05, + "loss": 0.4668, + "step": 1667 + }, + { + "epoch": 0.678324522163481, + "grad_norm": 18.805209885779433, + "learning_rate": 1.9974282394593284e-05, + "loss": 0.417, + "step": 1668 + }, + { + "epoch": 0.6787311915412769, + "grad_norm": 18.20877894857796, + "learning_rate": 1.9974209725950215e-05, + "loss": 1.0806, + "step": 1669 + }, + { + "epoch": 0.6791378609190728, + "grad_norm": 16.607878567233755, + "learning_rate": 1.997413695491686e-05, + "loss": 0.2833, + "step": 
1670 + }, + { + "epoch": 0.6795445302968687, + "grad_norm": 22.105857665801064, + "learning_rate": 1.9974064081493972e-05, + "loss": 1.0638, + "step": 1671 + }, + { + "epoch": 0.6799511996746646, + "grad_norm": 26.87683528854633, + "learning_rate": 1.9973991105682296e-05, + "loss": 1.2997, + "step": 1672 + }, + { + "epoch": 0.6803578690524603, + "grad_norm": 17.096094199941383, + "learning_rate": 1.9973918027482578e-05, + "loss": 0.8062, + "step": 1673 + }, + { + "epoch": 0.6807645384302562, + "grad_norm": 16.001340714339545, + "learning_rate": 1.9973844846895574e-05, + "loss": 0.3816, + "step": 1674 + }, + { + "epoch": 0.6811712078080521, + "grad_norm": 31.356377768679174, + "learning_rate": 1.9973771563922036e-05, + "loss": 0.6004, + "step": 1675 + }, + { + "epoch": 0.681577877185848, + "grad_norm": 13.592502615824953, + "learning_rate": 1.9973698178562707e-05, + "loss": 0.3236, + "step": 1676 + }, + { + "epoch": 0.6819845465636437, + "grad_norm": 12.281111315057636, + "learning_rate": 1.997362469081835e-05, + "loss": 0.3379, + "step": 1677 + }, + { + "epoch": 0.6823912159414396, + "grad_norm": 10.204678038187653, + "learning_rate": 1.9973551100689717e-05, + "loss": 0.5601, + "step": 1678 + }, + { + "epoch": 0.6827978853192355, + "grad_norm": 5.3806648936343375, + "learning_rate": 1.9973477408177562e-05, + "loss": 0.0918, + "step": 1679 + }, + { + "epoch": 0.6832045546970313, + "grad_norm": 17.2035180753653, + "learning_rate": 1.9973403613282644e-05, + "loss": 1.1568, + "step": 1680 + }, + { + "epoch": 0.6836112240748272, + "grad_norm": 14.69252608113086, + "learning_rate": 1.9973329716005715e-05, + "loss": 0.5282, + "step": 1681 + }, + { + "epoch": 0.684017893452623, + "grad_norm": 11.025077338599399, + "learning_rate": 1.9973255716347535e-05, + "loss": 0.5016, + "step": 1682 + }, + { + "epoch": 0.6844245628304189, + "grad_norm": 27.069855148062985, + "learning_rate": 1.9973181614308868e-05, + "loss": 0.6156, + "step": 1683 + }, + { + "epoch": 0.6848312322082147, + "grad_norm": 10.80696815067302, + "learning_rate": 1.9973107409890474e-05, + "loss": 0.3395, + "step": 1684 + }, + { + "epoch": 0.6852379015860106, + "grad_norm": 8.082958825845418, + "learning_rate": 1.9973033103093112e-05, + "loss": 0.0885, + "step": 1685 + }, + { + "epoch": 0.6856445709638064, + "grad_norm": 6.0568370794919035, + "learning_rate": 1.9972958693917544e-05, + "loss": 0.0774, + "step": 1686 + }, + { + "epoch": 0.6860512403416023, + "grad_norm": 28.95580172118687, + "learning_rate": 1.997288418236454e-05, + "loss": 1.5578, + "step": 1687 + }, + { + "epoch": 0.6864579097193981, + "grad_norm": 15.38495835005815, + "learning_rate": 1.9972809568434854e-05, + "loss": 0.3573, + "step": 1688 + }, + { + "epoch": 0.686864579097194, + "grad_norm": 7.635880192190002, + "learning_rate": 1.9972734852129262e-05, + "loss": 0.1742, + "step": 1689 + }, + { + "epoch": 0.6872712484749899, + "grad_norm": 5.771122370903011, + "learning_rate": 1.997266003344853e-05, + "loss": 0.1061, + "step": 1690 + }, + { + "epoch": 0.6876779178527856, + "grad_norm": 6.09018469966942, + "learning_rate": 1.997258511239342e-05, + "loss": 0.0536, + "step": 1691 + }, + { + "epoch": 0.6880845872305815, + "grad_norm": 19.240727032284724, + "learning_rate": 1.9972510088964707e-05, + "loss": 1.0426, + "step": 1692 + }, + { + "epoch": 0.6884912566083774, + "grad_norm": 13.685867159919967, + "learning_rate": 1.997243496316316e-05, + "loss": 0.302, + "step": 1693 + }, + { + "epoch": 0.6888979259861733, + "grad_norm": 11.87350677876753, + "learning_rate": 
1.9972359734989546e-05, + "loss": 0.4949, + "step": 1694 + }, + { + "epoch": 0.689304595363969, + "grad_norm": 15.656741264482168, + "learning_rate": 1.9972284404444644e-05, + "loss": 0.3957, + "step": 1695 + }, + { + "epoch": 0.6897112647417649, + "grad_norm": 7.6818255322953695, + "learning_rate": 1.997220897152922e-05, + "loss": 0.0964, + "step": 1696 + }, + { + "epoch": 0.6901179341195608, + "grad_norm": 14.521715927682939, + "learning_rate": 1.9972133436244053e-05, + "loss": 0.4417, + "step": 1697 + }, + { + "epoch": 0.6905246034973567, + "grad_norm": 8.750182391930904, + "learning_rate": 1.997205779858992e-05, + "loss": 0.2053, + "step": 1698 + }, + { + "epoch": 0.6909312728751525, + "grad_norm": 12.471450044105165, + "learning_rate": 1.997198205856759e-05, + "loss": 0.3171, + "step": 1699 + }, + { + "epoch": 0.6913379422529483, + "grad_norm": 15.20027193169012, + "learning_rate": 1.997190621617785e-05, + "loss": 0.3777, + "step": 1700 + }, + { + "epoch": 0.6917446116307442, + "grad_norm": 9.904344961350647, + "learning_rate": 1.9971830271421472e-05, + "loss": 0.2924, + "step": 1701 + }, + { + "epoch": 0.6921512810085401, + "grad_norm": 12.210728181032158, + "learning_rate": 1.9971754224299235e-05, + "loss": 0.5203, + "step": 1702 + }, + { + "epoch": 0.6925579503863359, + "grad_norm": 1.3960741892680402, + "learning_rate": 1.9971678074811926e-05, + "loss": 0.0219, + "step": 1703 + }, + { + "epoch": 0.6929646197641317, + "grad_norm": 16.70096534652163, + "learning_rate": 1.997160182296032e-05, + "loss": 0.2906, + "step": 1704 + }, + { + "epoch": 0.6933712891419276, + "grad_norm": 19.087822309282185, + "learning_rate": 1.9971525468745206e-05, + "loss": 0.4395, + "step": 1705 + }, + { + "epoch": 0.6937779585197235, + "grad_norm": 7.827666787058599, + "learning_rate": 1.997144901216736e-05, + "loss": 0.1151, + "step": 1706 + }, + { + "epoch": 0.6941846278975193, + "grad_norm": 17.34457779167553, + "learning_rate": 1.9971372453227574e-05, + "loss": 0.3131, + "step": 1707 + }, + { + "epoch": 0.6945912972753152, + "grad_norm": 15.05596020956218, + "learning_rate": 1.9971295791926633e-05, + "loss": 0.5395, + "step": 1708 + }, + { + "epoch": 0.694997966653111, + "grad_norm": 8.116861890017464, + "learning_rate": 1.997121902826532e-05, + "loss": 0.4664, + "step": 1709 + }, + { + "epoch": 0.6954046360309069, + "grad_norm": 9.467929789809789, + "learning_rate": 1.997114216224443e-05, + "loss": 0.1873, + "step": 1710 + }, + { + "epoch": 0.6958113054087027, + "grad_norm": 11.901083921806842, + "learning_rate": 1.997106519386474e-05, + "loss": 0.314, + "step": 1711 + }, + { + "epoch": 0.6962179747864986, + "grad_norm": 11.229865694274308, + "learning_rate": 1.997098812312705e-05, + "loss": 0.0967, + "step": 1712 + }, + { + "epoch": 0.6966246441642945, + "grad_norm": 21.3698646332805, + "learning_rate": 1.997091095003215e-05, + "loss": 0.8555, + "step": 1713 + }, + { + "epoch": 0.6970313135420902, + "grad_norm": 16.303356090847224, + "learning_rate": 1.997083367458083e-05, + "loss": 0.6793, + "step": 1714 + }, + { + "epoch": 0.6974379829198861, + "grad_norm": 9.594340149375883, + "learning_rate": 1.997075629677388e-05, + "loss": 0.117, + "step": 1715 + }, + { + "epoch": 0.697844652297682, + "grad_norm": 11.687309909580106, + "learning_rate": 1.997067881661211e-05, + "loss": 0.1692, + "step": 1716 + }, + { + "epoch": 0.6982513216754779, + "grad_norm": 15.457996195490495, + "learning_rate": 1.9970601234096292e-05, + "loss": 0.7281, + "step": 1717 + }, + { + "epoch": 0.6986579910532736, + "grad_norm": 
18.855838083631475, + "learning_rate": 1.997052354922724e-05, + "loss": 1.2057, + "step": 1718 + }, + { + "epoch": 0.6990646604310695, + "grad_norm": 24.24377542176257, + "learning_rate": 1.9970445762005745e-05, + "loss": 0.0934, + "step": 1719 + }, + { + "epoch": 0.6994713298088654, + "grad_norm": 12.42207445668287, + "learning_rate": 1.9970367872432607e-05, + "loss": 0.2768, + "step": 1720 + }, + { + "epoch": 0.6998779991866613, + "grad_norm": 13.529874229031254, + "learning_rate": 1.997028988050862e-05, + "loss": 0.8074, + "step": 1721 + }, + { + "epoch": 0.7002846685644571, + "grad_norm": 4.268355786325398, + "learning_rate": 1.9970211786234593e-05, + "loss": 0.0461, + "step": 1722 + }, + { + "epoch": 0.7006913379422529, + "grad_norm": 31.599699544044103, + "learning_rate": 1.9970133589611324e-05, + "loss": 1.4873, + "step": 1723 + }, + { + "epoch": 0.7010980073200488, + "grad_norm": 27.266279104524436, + "learning_rate": 1.9970055290639617e-05, + "loss": 0.8785, + "step": 1724 + }, + { + "epoch": 0.7015046766978447, + "grad_norm": 12.886043548529214, + "learning_rate": 1.9969976889320276e-05, + "loss": 0.5709, + "step": 1725 + }, + { + "epoch": 0.7019113460756405, + "grad_norm": 6.3491774336869256, + "learning_rate": 1.99698983856541e-05, + "loss": 0.1142, + "step": 1726 + }, + { + "epoch": 0.7023180154534363, + "grad_norm": 18.205093423122594, + "learning_rate": 1.99698197796419e-05, + "loss": 0.9034, + "step": 1727 + }, + { + "epoch": 0.7027246848312322, + "grad_norm": 8.757648613794306, + "learning_rate": 1.9969741071284482e-05, + "loss": 0.2662, + "step": 1728 + }, + { + "epoch": 0.7031313542090281, + "grad_norm": 6.155111446748205, + "learning_rate": 1.9969662260582658e-05, + "loss": 0.0975, + "step": 1729 + }, + { + "epoch": 0.7035380235868239, + "grad_norm": 15.746500376123803, + "learning_rate": 1.996958334753723e-05, + "loss": 0.5379, + "step": 1730 + }, + { + "epoch": 0.7039446929646198, + "grad_norm": 25.134635866858673, + "learning_rate": 1.9969504332149015e-05, + "loss": 0.9369, + "step": 1731 + }, + { + "epoch": 0.7043513623424156, + "grad_norm": 22.038214791657758, + "learning_rate": 1.9969425214418815e-05, + "loss": 1.3829, + "step": 1732 + }, + { + "epoch": 0.7047580317202115, + "grad_norm": 20.821380919280745, + "learning_rate": 1.9969345994347452e-05, + "loss": 0.9668, + "step": 1733 + }, + { + "epoch": 0.7051647010980073, + "grad_norm": 18.56533987896325, + "learning_rate": 1.9969266671935732e-05, + "loss": 0.7161, + "step": 1734 + }, + { + "epoch": 0.7055713704758032, + "grad_norm": 13.045462264879164, + "learning_rate": 1.9969187247184475e-05, + "loss": 0.3803, + "step": 1735 + }, + { + "epoch": 0.705978039853599, + "grad_norm": 11.051412588234436, + "learning_rate": 1.9969107720094493e-05, + "loss": 0.643, + "step": 1736 + }, + { + "epoch": 0.7063847092313948, + "grad_norm": 7.248480897888599, + "learning_rate": 1.99690280906666e-05, + "loss": 0.1326, + "step": 1737 + }, + { + "epoch": 0.7067913786091907, + "grad_norm": 12.572612207722312, + "learning_rate": 1.9968948358901618e-05, + "loss": 0.3044, + "step": 1738 + }, + { + "epoch": 0.7071980479869866, + "grad_norm": 11.655609459062507, + "learning_rate": 1.9968868524800364e-05, + "loss": 0.1726, + "step": 1739 + }, + { + "epoch": 0.7076047173647825, + "grad_norm": 9.044041009368362, + "learning_rate": 1.996878858836366e-05, + "loss": 0.2708, + "step": 1740 + }, + { + "epoch": 0.7080113867425782, + "grad_norm": 12.312096625615288, + "learning_rate": 1.996870854959232e-05, + "loss": 0.896, + "step": 1741 + }, + 
{ + "epoch": 0.7084180561203741, + "grad_norm": 8.14745183006522, + "learning_rate": 1.9968628408487172e-05, + "loss": 0.3749, + "step": 1742 + }, + { + "epoch": 0.70882472549817, + "grad_norm": 5.992963472283775, + "learning_rate": 1.9968548165049037e-05, + "loss": 0.0817, + "step": 1743 + }, + { + "epoch": 0.7092313948759659, + "grad_norm": 15.27389145732006, + "learning_rate": 1.9968467819278734e-05, + "loss": 0.5302, + "step": 1744 + }, + { + "epoch": 0.7096380642537617, + "grad_norm": 16.43036555044561, + "learning_rate": 1.9968387371177096e-05, + "loss": 1.0064, + "step": 1745 + }, + { + "epoch": 0.7100447336315575, + "grad_norm": 1.8552447118404534, + "learning_rate": 1.996830682074494e-05, + "loss": 0.0292, + "step": 1746 + }, + { + "epoch": 0.7104514030093534, + "grad_norm": 6.490702626601267, + "learning_rate": 1.9968226167983104e-05, + "loss": 0.2122, + "step": 1747 + }, + { + "epoch": 0.7108580723871493, + "grad_norm": 13.30109269593951, + "learning_rate": 1.9968145412892403e-05, + "loss": 0.5092, + "step": 1748 + }, + { + "epoch": 0.7112647417649451, + "grad_norm": 17.818527273208506, + "learning_rate": 1.9968064555473677e-05, + "loss": 0.3919, + "step": 1749 + }, + { + "epoch": 0.7116714111427409, + "grad_norm": 2.303136267190158, + "learning_rate": 1.9967983595727748e-05, + "loss": 0.0375, + "step": 1750 + }, + { + "epoch": 0.7120780805205368, + "grad_norm": 18.829888395739555, + "learning_rate": 1.9967902533655454e-05, + "loss": 0.8373, + "step": 1751 + }, + { + "epoch": 0.7124847498983327, + "grad_norm": 5.490695471885016, + "learning_rate": 1.996782136925762e-05, + "loss": 0.1475, + "step": 1752 + }, + { + "epoch": 0.7128914192761285, + "grad_norm": 8.570855350094652, + "learning_rate": 1.9967740102535088e-05, + "loss": 0.3294, + "step": 1753 + }, + { + "epoch": 0.7132980886539244, + "grad_norm": 1.70629171734173, + "learning_rate": 1.9967658733488682e-05, + "loss": 0.0333, + "step": 1754 + }, + { + "epoch": 0.7137047580317202, + "grad_norm": 9.253994715802358, + "learning_rate": 1.9967577262119247e-05, + "loss": 0.1686, + "step": 1755 + }, + { + "epoch": 0.714111427409516, + "grad_norm": 6.11713902387558, + "learning_rate": 1.996749568842761e-05, + "loss": 0.1402, + "step": 1756 + }, + { + "epoch": 0.7145180967873119, + "grad_norm": 15.242331087279549, + "learning_rate": 1.9967414012414616e-05, + "loss": 0.8229, + "step": 1757 + }, + { + "epoch": 0.7149247661651078, + "grad_norm": 16.956197813730963, + "learning_rate": 1.9967332234081104e-05, + "loss": 0.7968, + "step": 1758 + }, + { + "epoch": 0.7153314355429036, + "grad_norm": 12.292264036453338, + "learning_rate": 1.9967250353427903e-05, + "loss": 0.2781, + "step": 1759 + }, + { + "epoch": 0.7157381049206994, + "grad_norm": 16.921491349116536, + "learning_rate": 1.9967168370455867e-05, + "loss": 0.5722, + "step": 1760 + }, + { + "epoch": 0.7161447742984953, + "grad_norm": 8.019022982154983, + "learning_rate": 1.9967086285165828e-05, + "loss": 0.2228, + "step": 1761 + }, + { + "epoch": 0.7165514436762912, + "grad_norm": 24.392285444651172, + "learning_rate": 1.9967004097558634e-05, + "loss": 1.3409, + "step": 1762 + }, + { + "epoch": 0.7169581130540871, + "grad_norm": 32.7352448072474, + "learning_rate": 1.9966921807635125e-05, + "loss": 1.013, + "step": 1763 + }, + { + "epoch": 0.7173647824318828, + "grad_norm": 17.47285733860401, + "learning_rate": 1.996683941539615e-05, + "loss": 0.5528, + "step": 1764 + }, + { + "epoch": 0.7177714518096787, + "grad_norm": 12.198832793513393, + "learning_rate": 
1.9966756920842547e-05, + "loss": 0.3608, + "step": 1765 + }, + { + "epoch": 0.7181781211874746, + "grad_norm": 9.141175157137788, + "learning_rate": 1.996667432397517e-05, + "loss": 0.1662, + "step": 1766 + }, + { + "epoch": 0.7185847905652705, + "grad_norm": 22.77903129163116, + "learning_rate": 1.9966591624794868e-05, + "loss": 0.8748, + "step": 1767 + }, + { + "epoch": 0.7189914599430662, + "grad_norm": 11.246271445983716, + "learning_rate": 1.9966508823302484e-05, + "loss": 0.2127, + "step": 1768 + }, + { + "epoch": 0.7193981293208621, + "grad_norm": 13.754553658552387, + "learning_rate": 1.996642591949887e-05, + "loss": 0.2717, + "step": 1769 + }, + { + "epoch": 0.719804798698658, + "grad_norm": 22.364111233972107, + "learning_rate": 1.996634291338488e-05, + "loss": 0.6066, + "step": 1770 + }, + { + "epoch": 0.7202114680764539, + "grad_norm": 2.569144780683314, + "learning_rate": 1.9966259804961363e-05, + "loss": 0.041, + "step": 1771 + }, + { + "epoch": 0.7206181374542497, + "grad_norm": 24.30509712311808, + "learning_rate": 1.9966176594229174e-05, + "loss": 0.9029, + "step": 1772 + }, + { + "epoch": 0.7210248068320455, + "grad_norm": 17.546112594278448, + "learning_rate": 1.996609328118917e-05, + "loss": 0.5323, + "step": 1773 + }, + { + "epoch": 0.7214314762098414, + "grad_norm": 9.244229200377257, + "learning_rate": 1.9966009865842198e-05, + "loss": 0.247, + "step": 1774 + }, + { + "epoch": 0.7218381455876373, + "grad_norm": 11.218394250595377, + "learning_rate": 1.996592634818912e-05, + "loss": 0.3042, + "step": 1775 + }, + { + "epoch": 0.7222448149654331, + "grad_norm": 20.731816036160872, + "learning_rate": 1.9965842728230792e-05, + "loss": 0.5453, + "step": 1776 + }, + { + "epoch": 0.7226514843432289, + "grad_norm": 19.37744483827568, + "learning_rate": 1.996575900596807e-05, + "loss": 1.3755, + "step": 1777 + }, + { + "epoch": 0.7230581537210248, + "grad_norm": 6.421539519827197, + "learning_rate": 1.996567518140182e-05, + "loss": 0.1928, + "step": 1778 + }, + { + "epoch": 0.7234648230988207, + "grad_norm": 19.444592010707044, + "learning_rate": 1.9965591254532895e-05, + "loss": 0.7258, + "step": 1779 + }, + { + "epoch": 0.7238714924766165, + "grad_norm": 54.37940371702133, + "learning_rate": 1.9965507225362165e-05, + "loss": 3.0264, + "step": 1780 + }, + { + "epoch": 0.7242781618544124, + "grad_norm": 13.26745429167931, + "learning_rate": 1.9965423093890485e-05, + "loss": 0.4165, + "step": 1781 + }, + { + "epoch": 0.7246848312322082, + "grad_norm": 13.63013800090853, + "learning_rate": 1.9965338860118723e-05, + "loss": 0.7418, + "step": 1782 + }, + { + "epoch": 0.725091500610004, + "grad_norm": 1.0495119803557487, + "learning_rate": 1.9965254524047742e-05, + "loss": 0.0159, + "step": 1783 + }, + { + "epoch": 0.7254981699877999, + "grad_norm": 6.893673956992069, + "learning_rate": 1.9965170085678407e-05, + "loss": 0.0993, + "step": 1784 + }, + { + "epoch": 0.7259048393655958, + "grad_norm": 4.012689745402008, + "learning_rate": 1.9965085545011585e-05, + "loss": 0.1108, + "step": 1785 + }, + { + "epoch": 0.7263115087433917, + "grad_norm": 61.96957014101903, + "learning_rate": 1.9965000902048147e-05, + "loss": 2.0721, + "step": 1786 + }, + { + "epoch": 0.7267181781211874, + "grad_norm": 35.13832959420863, + "learning_rate": 1.996491615678896e-05, + "loss": 1.5698, + "step": 1787 + }, + { + "epoch": 0.7271248474989833, + "grad_norm": 14.286303573746677, + "learning_rate": 1.9964831309234887e-05, + "loss": 0.2277, + "step": 1788 + }, + { + "epoch": 0.7275315168767792, + 
"grad_norm": 9.964529493656052, + "learning_rate": 1.996474635938681e-05, + "loss": 0.1704, + "step": 1789 + }, + { + "epoch": 0.7279381862545751, + "grad_norm": 13.82275756773364, + "learning_rate": 1.9964661307245598e-05, + "loss": 0.7979, + "step": 1790 + }, + { + "epoch": 0.7283448556323708, + "grad_norm": 12.661156555106443, + "learning_rate": 1.996457615281212e-05, + "loss": 0.4145, + "step": 1791 + }, + { + "epoch": 0.7287515250101667, + "grad_norm": 26.465563406361927, + "learning_rate": 1.9964490896087252e-05, + "loss": 0.9735, + "step": 1792 + }, + { + "epoch": 0.7291581943879626, + "grad_norm": 21.77006177736628, + "learning_rate": 1.996440553707187e-05, + "loss": 1.299, + "step": 1793 + }, + { + "epoch": 0.7295648637657585, + "grad_norm": 53.58055665190415, + "learning_rate": 1.9964320075766856e-05, + "loss": 0.6429, + "step": 1794 + }, + { + "epoch": 0.7299715331435543, + "grad_norm": 10.461634319635024, + "learning_rate": 1.9964234512173075e-05, + "loss": 0.3554, + "step": 1795 + }, + { + "epoch": 0.7303782025213501, + "grad_norm": 20.522146038450373, + "learning_rate": 1.9964148846291413e-05, + "loss": 1.5255, + "step": 1796 + }, + { + "epoch": 0.730784871899146, + "grad_norm": 8.84282510531448, + "learning_rate": 1.996406307812275e-05, + "loss": 0.2478, + "step": 1797 + }, + { + "epoch": 0.7311915412769419, + "grad_norm": 11.680123788880481, + "learning_rate": 1.9963977207667963e-05, + "loss": 0.4616, + "step": 1798 + }, + { + "epoch": 0.7315982106547377, + "grad_norm": 6.985358365975662, + "learning_rate": 1.9963891234927938e-05, + "loss": 0.1447, + "step": 1799 + }, + { + "epoch": 0.7320048800325335, + "grad_norm": 12.386284742857884, + "learning_rate": 1.996380515990355e-05, + "loss": 0.6171, + "step": 1800 + }, + { + "epoch": 0.7324115494103294, + "grad_norm": 22.84228731556866, + "learning_rate": 1.996371898259569e-05, + "loss": 1.0037, + "step": 1801 + }, + { + "epoch": 0.7328182187881253, + "grad_norm": 23.08617296889481, + "learning_rate": 1.996363270300524e-05, + "loss": 1.0818, + "step": 1802 + }, + { + "epoch": 0.7332248881659211, + "grad_norm": 13.55327478187076, + "learning_rate": 1.9963546321133086e-05, + "loss": 0.6191, + "step": 1803 + }, + { + "epoch": 0.733631557543717, + "grad_norm": 11.095669941079493, + "learning_rate": 1.9963459836980114e-05, + "loss": 0.6203, + "step": 1804 + }, + { + "epoch": 0.7340382269215128, + "grad_norm": 5.455208079503888, + "learning_rate": 1.9963373250547215e-05, + "loss": 0.0982, + "step": 1805 + }, + { + "epoch": 0.7344448962993086, + "grad_norm": 33.781367113826136, + "learning_rate": 1.9963286561835274e-05, + "loss": 0.452, + "step": 1806 + }, + { + "epoch": 0.7348515656771045, + "grad_norm": 10.679386246682002, + "learning_rate": 1.996319977084518e-05, + "loss": 0.7562, + "step": 1807 + }, + { + "epoch": 0.7352582350549004, + "grad_norm": 8.905667823661704, + "learning_rate": 1.9963112877577828e-05, + "loss": 0.1279, + "step": 1808 + }, + { + "epoch": 0.7356649044326962, + "grad_norm": 11.60911289297611, + "learning_rate": 1.9963025882034104e-05, + "loss": 0.1946, + "step": 1809 + }, + { + "epoch": 0.736071573810492, + "grad_norm": 3.4587100693370183, + "learning_rate": 1.9962938784214906e-05, + "loss": 0.0578, + "step": 1810 + }, + { + "epoch": 0.7364782431882879, + "grad_norm": 10.940285227573588, + "learning_rate": 1.996285158412113e-05, + "loss": 0.2126, + "step": 1811 + }, + { + "epoch": 0.7368849125660838, + "grad_norm": 11.09809829762788, + "learning_rate": 1.9962764281753668e-05, + "loss": 0.4381, + "step": 1812 
+ }, + { + "epoch": 0.7372915819438797, + "grad_norm": 20.385238756188205, + "learning_rate": 1.9962676877113414e-05, + "loss": 0.6514, + "step": 1813 + }, + { + "epoch": 0.7376982513216754, + "grad_norm": 9.450447575317986, + "learning_rate": 1.996258937020127e-05, + "loss": 0.3405, + "step": 1814 + }, + { + "epoch": 0.7381049206994713, + "grad_norm": 15.906546993222552, + "learning_rate": 1.996250176101813e-05, + "loss": 0.5401, + "step": 1815 + }, + { + "epoch": 0.7385115900772672, + "grad_norm": 13.037717844661714, + "learning_rate": 1.9962414049564897e-05, + "loss": 0.6996, + "step": 1816 + }, + { + "epoch": 0.7389182594550631, + "grad_norm": 16.13565228536507, + "learning_rate": 1.996232623584247e-05, + "loss": 0.7289, + "step": 1817 + }, + { + "epoch": 0.7393249288328589, + "grad_norm": 6.637843285980142, + "learning_rate": 1.996223831985175e-05, + "loss": 0.1208, + "step": 1818 + }, + { + "epoch": 0.7397315982106547, + "grad_norm": 17.827400865233034, + "learning_rate": 1.996215030159364e-05, + "loss": 0.4101, + "step": 1819 + }, + { + "epoch": 0.7401382675884506, + "grad_norm": 9.281132417754437, + "learning_rate": 1.9962062181069043e-05, + "loss": 0.5139, + "step": 1820 + }, + { + "epoch": 0.7405449369662465, + "grad_norm": 8.51611348681955, + "learning_rate": 1.9961973958278866e-05, + "loss": 0.2128, + "step": 1821 + }, + { + "epoch": 0.7409516063440423, + "grad_norm": 9.903406229710082, + "learning_rate": 1.9961885633224008e-05, + "loss": 0.1137, + "step": 1822 + }, + { + "epoch": 0.7413582757218381, + "grad_norm": 22.204279806563616, + "learning_rate": 1.9961797205905382e-05, + "loss": 0.4453, + "step": 1823 + }, + { + "epoch": 0.741764945099634, + "grad_norm": 3.9690323774602256, + "learning_rate": 1.9961708676323896e-05, + "loss": 0.046, + "step": 1824 + }, + { + "epoch": 0.7421716144774299, + "grad_norm": 13.93959251462128, + "learning_rate": 1.9961620044480455e-05, + "loss": 0.7533, + "step": 1825 + }, + { + "epoch": 0.7425782838552257, + "grad_norm": 13.913831382325421, + "learning_rate": 1.9961531310375972e-05, + "loss": 0.3538, + "step": 1826 + }, + { + "epoch": 0.7429849532330216, + "grad_norm": 21.45696978140943, + "learning_rate": 1.9961442474011353e-05, + "loss": 0.2276, + "step": 1827 + }, + { + "epoch": 0.7433916226108174, + "grad_norm": 15.890552482491087, + "learning_rate": 1.996135353538752e-05, + "loss": 0.7619, + "step": 1828 + }, + { + "epoch": 0.7437982919886132, + "grad_norm": 15.186536709334009, + "learning_rate": 1.9961264494505376e-05, + "loss": 0.4906, + "step": 1829 + }, + { + "epoch": 0.7442049613664091, + "grad_norm": 15.102989044386318, + "learning_rate": 1.9961175351365837e-05, + "loss": 0.4793, + "step": 1830 + }, + { + "epoch": 0.744611630744205, + "grad_norm": 5.818124350959679, + "learning_rate": 1.996108610596982e-05, + "loss": 0.1255, + "step": 1831 + }, + { + "epoch": 0.7450183001220008, + "grad_norm": 7.646801148414305, + "learning_rate": 1.9960996758318243e-05, + "loss": 0.4035, + "step": 1832 + }, + { + "epoch": 0.7454249694997966, + "grad_norm": 14.137902487932923, + "learning_rate": 1.996090730841202e-05, + "loss": 0.4517, + "step": 1833 + }, + { + "epoch": 0.7458316388775925, + "grad_norm": 9.64033702720875, + "learning_rate": 1.996081775625207e-05, + "loss": 0.2585, + "step": 1834 + }, + { + "epoch": 0.7462383082553884, + "grad_norm": 25.443284653112986, + "learning_rate": 1.996072810183931e-05, + "loss": 0.9862, + "step": 1835 + }, + { + "epoch": 0.7466449776331843, + "grad_norm": 7.615416080030093, + "learning_rate": 
1.9960638345174668e-05, + "loss": 0.1035, + "step": 1836 + }, + { + "epoch": 0.74705164701098, + "grad_norm": 0.486931825836346, + "learning_rate": 1.9960548486259058e-05, + "loss": 0.0065, + "step": 1837 + }, + { + "epoch": 0.7474583163887759, + "grad_norm": 8.824047518218052, + "learning_rate": 1.99604585250934e-05, + "loss": 0.1256, + "step": 1838 + }, + { + "epoch": 0.7478649857665718, + "grad_norm": 16.500243098913746, + "learning_rate": 1.996036846167863e-05, + "loss": 0.7216, + "step": 1839 + }, + { + "epoch": 0.7482716551443677, + "grad_norm": 15.788317273645937, + "learning_rate": 1.9960278296015663e-05, + "loss": 0.5785, + "step": 1840 + }, + { + "epoch": 0.7486783245221634, + "grad_norm": 6.557807341423488, + "learning_rate": 1.9960188028105424e-05, + "loss": 0.1542, + "step": 1841 + }, + { + "epoch": 0.7490849938999593, + "grad_norm": 4.971756265473293, + "learning_rate": 1.9960097657948845e-05, + "loss": 0.1036, + "step": 1842 + }, + { + "epoch": 0.7494916632777552, + "grad_norm": 7.207375266133658, + "learning_rate": 1.9960007185546847e-05, + "loss": 0.3062, + "step": 1843 + }, + { + "epoch": 0.7498983326555511, + "grad_norm": 2.4189759893747573, + "learning_rate": 1.9959916610900364e-05, + "loss": 0.0225, + "step": 1844 + }, + { + "epoch": 0.7503050020333469, + "grad_norm": 6.248712058125559, + "learning_rate": 1.9959825934010325e-05, + "loss": 0.1382, + "step": 1845 + }, + { + "epoch": 0.7507116714111427, + "grad_norm": 12.831374186118364, + "learning_rate": 1.995973515487766e-05, + "loss": 0.9752, + "step": 1846 + }, + { + "epoch": 0.7511183407889386, + "grad_norm": 1.7694337797042117, + "learning_rate": 1.99596442735033e-05, + "loss": 0.0249, + "step": 1847 + }, + { + "epoch": 0.7515250101667345, + "grad_norm": 8.92013547633679, + "learning_rate": 1.995955328988818e-05, + "loss": 0.4674, + "step": 1848 + }, + { + "epoch": 0.7519316795445303, + "grad_norm": 10.832705573091475, + "learning_rate": 1.9959462204033232e-05, + "loss": 0.359, + "step": 1849 + }, + { + "epoch": 0.7523383489223261, + "grad_norm": 6.947357531503463, + "learning_rate": 1.9959371015939393e-05, + "loss": 0.1492, + "step": 1850 + }, + { + "epoch": 0.752745018300122, + "grad_norm": 15.671310810993404, + "learning_rate": 1.9959279725607596e-05, + "loss": 0.6842, + "step": 1851 + }, + { + "epoch": 0.7531516876779178, + "grad_norm": 3.1023938192972698, + "learning_rate": 1.995918833303878e-05, + "loss": 0.0794, + "step": 1852 + }, + { + "epoch": 0.7535583570557137, + "grad_norm": 10.979852858225223, + "learning_rate": 1.995909683823389e-05, + "loss": 0.4227, + "step": 1853 + }, + { + "epoch": 0.7539650264335096, + "grad_norm": 14.06798588434253, + "learning_rate": 1.995900524119385e-05, + "loss": 0.3912, + "step": 1854 + }, + { + "epoch": 0.7543716958113054, + "grad_norm": 3.8550790086902453, + "learning_rate": 1.9958913541919616e-05, + "loss": 0.0687, + "step": 1855 + }, + { + "epoch": 0.7547783651891012, + "grad_norm": 16.518966538980663, + "learning_rate": 1.9958821740412116e-05, + "loss": 0.7326, + "step": 1856 + }, + { + "epoch": 0.7551850345668971, + "grad_norm": 11.454802816729865, + "learning_rate": 1.9958729836672303e-05, + "loss": 0.5961, + "step": 1857 + }, + { + "epoch": 0.755591703944693, + "grad_norm": 14.395645471666056, + "learning_rate": 1.9958637830701115e-05, + "loss": 0.3307, + "step": 1858 + }, + { + "epoch": 0.7559983733224889, + "grad_norm": 7.578282302767417, + "learning_rate": 1.99585457224995e-05, + "loss": 0.0503, + "step": 1859 + }, + { + "epoch": 0.7564050427002846, + 
"grad_norm": 10.531760306716992, + "learning_rate": 1.9958453512068395e-05, + "loss": 0.2625, + "step": 1860 + }, + { + "epoch": 0.7568117120780805, + "grad_norm": 18.916252904180695, + "learning_rate": 1.995836119940876e-05, + "loss": 0.3451, + "step": 1861 + }, + { + "epoch": 0.7572183814558764, + "grad_norm": 10.934913323274877, + "learning_rate": 1.9958268784521527e-05, + "loss": 0.2434, + "step": 1862 + }, + { + "epoch": 0.7576250508336723, + "grad_norm": 14.874092467431316, + "learning_rate": 1.9958176267407657e-05, + "loss": 0.3318, + "step": 1863 + }, + { + "epoch": 0.758031720211468, + "grad_norm": 6.154418395422772, + "learning_rate": 1.9958083648068098e-05, + "loss": 0.0916, + "step": 1864 + }, + { + "epoch": 0.7584383895892639, + "grad_norm": 31.981906910252107, + "learning_rate": 1.9957990926503798e-05, + "loss": 2.0461, + "step": 1865 + }, + { + "epoch": 0.7588450589670598, + "grad_norm": 10.4415519068034, + "learning_rate": 1.9957898102715706e-05, + "loss": 0.2975, + "step": 1866 + }, + { + "epoch": 0.7592517283448557, + "grad_norm": 6.74139889451343, + "learning_rate": 1.9957805176704782e-05, + "loss": 0.1941, + "step": 1867 + }, + { + "epoch": 0.7596583977226515, + "grad_norm": 12.32710013961487, + "learning_rate": 1.9957712148471973e-05, + "loss": 0.5247, + "step": 1868 + }, + { + "epoch": 0.7600650671004473, + "grad_norm": 8.601750443085264, + "learning_rate": 1.9957619018018243e-05, + "loss": 0.2456, + "step": 1869 + }, + { + "epoch": 0.7604717364782432, + "grad_norm": 12.682516916202042, + "learning_rate": 1.995752578534454e-05, + "loss": 0.3793, + "step": 1870 + }, + { + "epoch": 0.760878405856039, + "grad_norm": 18.321405881156814, + "learning_rate": 1.9957432450451817e-05, + "loss": 0.4998, + "step": 1871 + }, + { + "epoch": 0.7612850752338349, + "grad_norm": 31.274136274118046, + "learning_rate": 1.9957339013341043e-05, + "loss": 0.9781, + "step": 1872 + }, + { + "epoch": 0.7616917446116307, + "grad_norm": 22.161779222880934, + "learning_rate": 1.9957245474013175e-05, + "loss": 0.6006, + "step": 1873 + }, + { + "epoch": 0.7620984139894266, + "grad_norm": 22.774331331787753, + "learning_rate": 1.9957151832469166e-05, + "loss": 1.145, + "step": 1874 + }, + { + "epoch": 0.7625050833672224, + "grad_norm": 13.754119434931601, + "learning_rate": 1.9957058088709985e-05, + "loss": 0.2093, + "step": 1875 + }, + { + "epoch": 0.7629117527450183, + "grad_norm": 12.055369766519565, + "learning_rate": 1.995696424273659e-05, + "loss": 0.2455, + "step": 1876 + }, + { + "epoch": 0.7633184221228142, + "grad_norm": 3.2656942680885366, + "learning_rate": 1.9956870294549947e-05, + "loss": 0.0633, + "step": 1877 + }, + { + "epoch": 0.76372509150061, + "grad_norm": 13.159651923685589, + "learning_rate": 1.9956776244151022e-05, + "loss": 0.6453, + "step": 1878 + }, + { + "epoch": 0.7641317608784058, + "grad_norm": 15.910679360328613, + "learning_rate": 1.9956682091540774e-05, + "loss": 0.7771, + "step": 1879 + }, + { + "epoch": 0.7645384302562017, + "grad_norm": 0.17324288476006905, + "learning_rate": 1.9956587836720175e-05, + "loss": 0.0028, + "step": 1880 + }, + { + "epoch": 0.7649450996339976, + "grad_norm": 6.520410983807981, + "learning_rate": 1.995649347969019e-05, + "loss": 0.144, + "step": 1881 + }, + { + "epoch": 0.7653517690117934, + "grad_norm": 11.963624383027076, + "learning_rate": 1.9956399020451785e-05, + "loss": 0.6005, + "step": 1882 + }, + { + "epoch": 0.7657584383895892, + "grad_norm": 12.807507118085894, + "learning_rate": 1.9956304459005936e-05, + "loss": 0.3774, + 
"step": 1883 + }, + { + "epoch": 0.7661651077673851, + "grad_norm": 5.119218690206734, + "learning_rate": 1.9956209795353612e-05, + "loss": 0.0743, + "step": 1884 + }, + { + "epoch": 0.766571777145181, + "grad_norm": 32.67305867766017, + "learning_rate": 1.9956115029495782e-05, + "loss": 1.2433, + "step": 1885 + }, + { + "epoch": 0.7669784465229769, + "grad_norm": 2.5087048819223456, + "learning_rate": 1.9956020161433422e-05, + "loss": 0.0437, + "step": 1886 + }, + { + "epoch": 0.7673851159007726, + "grad_norm": 9.476066198634062, + "learning_rate": 1.9955925191167504e-05, + "loss": 0.0908, + "step": 1887 + }, + { + "epoch": 0.7677917852785685, + "grad_norm": 4.649944618078596, + "learning_rate": 1.9955830118699e-05, + "loss": 0.0195, + "step": 1888 + }, + { + "epoch": 0.7681984546563644, + "grad_norm": 9.509665892008975, + "learning_rate": 1.995573494402889e-05, + "loss": 0.2772, + "step": 1889 + }, + { + "epoch": 0.7686051240341603, + "grad_norm": 19.763280207755376, + "learning_rate": 1.9955639667158155e-05, + "loss": 0.8945, + "step": 1890 + }, + { + "epoch": 0.7690117934119561, + "grad_norm": 0.7188382335476188, + "learning_rate": 1.9955544288087763e-05, + "loss": 0.0102, + "step": 1891 + }, + { + "epoch": 0.7694184627897519, + "grad_norm": 27.878910447930515, + "learning_rate": 1.9955448806818703e-05, + "loss": 0.4767, + "step": 1892 + }, + { + "epoch": 0.7698251321675478, + "grad_norm": 29.145215228799913, + "learning_rate": 1.9955353223351946e-05, + "loss": 1.4692, + "step": 1893 + }, + { + "epoch": 0.7702318015453437, + "grad_norm": 13.51396919292953, + "learning_rate": 1.9955257537688483e-05, + "loss": 0.4258, + "step": 1894 + }, + { + "epoch": 0.7706384709231395, + "grad_norm": 17.167335518430352, + "learning_rate": 1.9955161749829288e-05, + "loss": 0.9319, + "step": 1895 + }, + { + "epoch": 0.7710451403009353, + "grad_norm": 14.817617416535898, + "learning_rate": 1.9955065859775344e-05, + "loss": 0.4212, + "step": 1896 + }, + { + "epoch": 0.7714518096787312, + "grad_norm": 34.56194307261629, + "learning_rate": 1.995496986752764e-05, + "loss": 1.8936, + "step": 1897 + }, + { + "epoch": 0.771858479056527, + "grad_norm": 18.806271883430973, + "learning_rate": 1.9954873773087168e-05, + "loss": 1.0499, + "step": 1898 + }, + { + "epoch": 0.7722651484343229, + "grad_norm": 3.79047132805724, + "learning_rate": 1.9954777576454897e-05, + "loss": 0.0829, + "step": 1899 + }, + { + "epoch": 0.7726718178121188, + "grad_norm": 20.00838321622542, + "learning_rate": 1.9954681277631825e-05, + "loss": 0.9243, + "step": 1900 + }, + { + "epoch": 0.7730784871899146, + "grad_norm": 13.04427684448437, + "learning_rate": 1.9954584876618944e-05, + "loss": 0.6395, + "step": 1901 + }, + { + "epoch": 0.7734851565677104, + "grad_norm": 4.771315572928821, + "learning_rate": 1.9954488373417237e-05, + "loss": 0.0627, + "step": 1902 + }, + { + "epoch": 0.7738918259455063, + "grad_norm": 17.341780167848285, + "learning_rate": 1.99543917680277e-05, + "loss": 0.6046, + "step": 1903 + }, + { + "epoch": 0.7742984953233022, + "grad_norm": 11.59238723337806, + "learning_rate": 1.9954295060451318e-05, + "loss": 0.2391, + "step": 1904 + }, + { + "epoch": 0.774705164701098, + "grad_norm": 1.7948849879719904, + "learning_rate": 1.9954198250689088e-05, + "loss": 0.0245, + "step": 1905 + }, + { + "epoch": 0.7751118340788938, + "grad_norm": 25.50865228905883, + "learning_rate": 1.9954101338742003e-05, + "loss": 0.5996, + "step": 1906 + }, + { + "epoch": 0.7755185034566897, + "grad_norm": 28.018790527363794, + 
"learning_rate": 1.995400432461106e-05, + "loss": 0.5072, + "step": 1907 + }, + { + "epoch": 0.7759251728344856, + "grad_norm": 12.031566802392348, + "learning_rate": 1.9953907208297248e-05, + "loss": 0.4337, + "step": 1908 + }, + { + "epoch": 0.7763318422122815, + "grad_norm": 7.758463569299106, + "learning_rate": 1.995380998980157e-05, + "loss": 0.1499, + "step": 1909 + }, + { + "epoch": 0.7767385115900772, + "grad_norm": 16.90480664661948, + "learning_rate": 1.9953712669125025e-05, + "loss": 0.8939, + "step": 1910 + }, + { + "epoch": 0.7771451809678731, + "grad_norm": 5.360390053736156, + "learning_rate": 1.995361524626861e-05, + "loss": 0.0769, + "step": 1911 + }, + { + "epoch": 0.777551850345669, + "grad_norm": 14.708777518661007, + "learning_rate": 1.9953517721233323e-05, + "loss": 0.243, + "step": 1912 + }, + { + "epoch": 0.7779585197234649, + "grad_norm": 17.662183017106837, + "learning_rate": 1.9953420094020172e-05, + "loss": 0.2707, + "step": 1913 + }, + { + "epoch": 0.7783651891012606, + "grad_norm": 17.94749983307694, + "learning_rate": 1.995332236463015e-05, + "loss": 0.6688, + "step": 1914 + }, + { + "epoch": 0.7787718584790565, + "grad_norm": 1.5073263157074512, + "learning_rate": 1.9953224533064264e-05, + "loss": 0.0252, + "step": 1915 + }, + { + "epoch": 0.7791785278568524, + "grad_norm": 17.412108037448988, + "learning_rate": 1.995312659932352e-05, + "loss": 1.44, + "step": 1916 + }, + { + "epoch": 0.7795851972346483, + "grad_norm": 11.669239532162443, + "learning_rate": 1.9953028563408922e-05, + "loss": 0.3809, + "step": 1917 + }, + { + "epoch": 0.7799918666124441, + "grad_norm": 6.053850812187564, + "learning_rate": 1.9952930425321476e-05, + "loss": 0.0939, + "step": 1918 + }, + { + "epoch": 0.7803985359902399, + "grad_norm": 14.303326525893945, + "learning_rate": 1.995283218506219e-05, + "loss": 0.5556, + "step": 1919 + }, + { + "epoch": 0.7808052053680358, + "grad_norm": 18.32286514223047, + "learning_rate": 1.9952733842632076e-05, + "loss": 1.6104, + "step": 1920 + }, + { + "epoch": 0.7812118747458316, + "grad_norm": 5.149110926328627, + "learning_rate": 1.9952635398032137e-05, + "loss": 0.0722, + "step": 1921 + }, + { + "epoch": 0.7816185441236275, + "grad_norm": 10.978219732909567, + "learning_rate": 1.9952536851263383e-05, + "loss": 0.3745, + "step": 1922 + }, + { + "epoch": 0.7820252135014233, + "grad_norm": 13.646521298402371, + "learning_rate": 1.9952438202326836e-05, + "loss": 0.423, + "step": 1923 + }, + { + "epoch": 0.7824318828792192, + "grad_norm": 18.1950221450065, + "learning_rate": 1.99523394512235e-05, + "loss": 0.9919, + "step": 1924 + }, + { + "epoch": 0.782838552257015, + "grad_norm": 6.286075327247257, + "learning_rate": 1.995224059795439e-05, + "loss": 0.141, + "step": 1925 + }, + { + "epoch": 0.7832452216348109, + "grad_norm": 1.1938111525032276, + "learning_rate": 1.995214164252052e-05, + "loss": 0.013, + "step": 1926 + }, + { + "epoch": 0.7836518910126068, + "grad_norm": 15.13291834830899, + "learning_rate": 1.9952042584922908e-05, + "loss": 0.6149, + "step": 1927 + }, + { + "epoch": 0.7840585603904026, + "grad_norm": 19.459638806076157, + "learning_rate": 1.995194342516257e-05, + "loss": 0.6614, + "step": 1928 + }, + { + "epoch": 0.7844652297681984, + "grad_norm": 19.07605556762059, + "learning_rate": 1.9951844163240523e-05, + "loss": 0.9826, + "step": 1929 + }, + { + "epoch": 0.7848718991459943, + "grad_norm": 16.61776220918579, + "learning_rate": 1.995174479915779e-05, + "loss": 0.5094, + "step": 1930 + }, + { + "epoch": 0.7852785685237902, 
+ "grad_norm": 12.71567825684146, + "learning_rate": 1.9951645332915387e-05, + "loss": 0.2836, + "step": 1931 + }, + { + "epoch": 0.7856852379015861, + "grad_norm": 9.889990024051382, + "learning_rate": 1.9951545764514334e-05, + "loss": 0.2612, + "step": 1932 + }, + { + "epoch": 0.7860919072793818, + "grad_norm": 10.053760117146064, + "learning_rate": 1.9951446093955655e-05, + "loss": 0.163, + "step": 1933 + }, + { + "epoch": 0.7864985766571777, + "grad_norm": 24.097348009173444, + "learning_rate": 1.9951346321240375e-05, + "loss": 1.4134, + "step": 1934 + }, + { + "epoch": 0.7869052460349736, + "grad_norm": 18.69049461955798, + "learning_rate": 1.9951246446369516e-05, + "loss": 0.3707, + "step": 1935 + }, + { + "epoch": 0.7873119154127695, + "grad_norm": 14.008294485183095, + "learning_rate": 1.9951146469344106e-05, + "loss": 0.6415, + "step": 1936 + }, + { + "epoch": 0.7877185847905652, + "grad_norm": 9.74072195828792, + "learning_rate": 1.9951046390165166e-05, + "loss": 0.155, + "step": 1937 + }, + { + "epoch": 0.7881252541683611, + "grad_norm": 5.969423738291952, + "learning_rate": 1.9950946208833732e-05, + "loss": 0.133, + "step": 1938 + }, + { + "epoch": 0.788531923546157, + "grad_norm": 8.380856592208554, + "learning_rate": 1.995084592535082e-05, + "loss": 0.1324, + "step": 1939 + }, + { + "epoch": 0.7889385929239529, + "grad_norm": 7.30524203315166, + "learning_rate": 1.995074553971747e-05, + "loss": 0.4016, + "step": 1940 + }, + { + "epoch": 0.7893452623017487, + "grad_norm": 16.55112728273054, + "learning_rate": 1.995064505193471e-05, + "loss": 0.1695, + "step": 1941 + }, + { + "epoch": 0.7897519316795445, + "grad_norm": 7.806618371215512, + "learning_rate": 1.995054446200357e-05, + "loss": 0.0905, + "step": 1942 + }, + { + "epoch": 0.7901586010573404, + "grad_norm": 13.674778390278314, + "learning_rate": 1.995044376992508e-05, + "loss": 0.2743, + "step": 1943 + }, + { + "epoch": 0.7905652704351362, + "grad_norm": 24.498962467196957, + "learning_rate": 1.995034297570028e-05, + "loss": 0.5947, + "step": 1944 + }, + { + "epoch": 0.7909719398129321, + "grad_norm": 2.3068884468317963, + "learning_rate": 1.99502420793302e-05, + "loss": 0.0388, + "step": 1945 + }, + { + "epoch": 0.7913786091907279, + "grad_norm": 11.20271040604938, + "learning_rate": 1.9950141080815877e-05, + "loss": 0.3613, + "step": 1946 + }, + { + "epoch": 0.7917852785685238, + "grad_norm": 6.180100451482982, + "learning_rate": 1.9950039980158348e-05, + "loss": 0.1175, + "step": 1947 + }, + { + "epoch": 0.7921919479463196, + "grad_norm": 7.707933523534667, + "learning_rate": 1.994993877735865e-05, + "loss": 0.1297, + "step": 1948 + }, + { + "epoch": 0.7925986173241155, + "grad_norm": 30.63221530039091, + "learning_rate": 1.9949837472417825e-05, + "loss": 1.5508, + "step": 1949 + }, + { + "epoch": 0.7930052867019114, + "grad_norm": 8.207265978883605, + "learning_rate": 1.994973606533691e-05, + "loss": 0.131, + "step": 1950 + }, + { + "epoch": 0.7934119560797072, + "grad_norm": 10.118281915516041, + "learning_rate": 1.9949634556116946e-05, + "loss": 0.2121, + "step": 1951 + }, + { + "epoch": 0.793818625457503, + "grad_norm": 21.477640302597518, + "learning_rate": 1.9949532944758977e-05, + "loss": 0.6721, + "step": 1952 + }, + { + "epoch": 0.7942252948352989, + "grad_norm": 6.682894919757705, + "learning_rate": 1.9949431231264044e-05, + "loss": 0.0999, + "step": 1953 + }, + { + "epoch": 0.7946319642130948, + "grad_norm": 1.0557295865549663, + "learning_rate": 1.9949329415633194e-05, + "loss": 0.0166, + "step": 1954 + 
}, + { + "epoch": 0.7950386335908906, + "grad_norm": 9.45758264098755, + "learning_rate": 1.9949227497867466e-05, + "loss": 0.279, + "step": 1955 + }, + { + "epoch": 0.7954453029686864, + "grad_norm": 17.326878170040978, + "learning_rate": 1.9949125477967916e-05, + "loss": 0.557, + "step": 1956 + }, + { + "epoch": 0.7958519723464823, + "grad_norm": 11.532689372635009, + "learning_rate": 1.994902335593558e-05, + "loss": 0.3137, + "step": 1957 + }, + { + "epoch": 0.7962586417242782, + "grad_norm": 19.501010355955223, + "learning_rate": 1.9948921131771515e-05, + "loss": 0.8888, + "step": 1958 + }, + { + "epoch": 0.7966653111020741, + "grad_norm": 11.472841441197025, + "learning_rate": 1.9948818805476766e-05, + "loss": 0.6135, + "step": 1959 + }, + { + "epoch": 0.7970719804798698, + "grad_norm": 10.773895466422786, + "learning_rate": 1.9948716377052384e-05, + "loss": 0.368, + "step": 1960 + }, + { + "epoch": 0.7974786498576657, + "grad_norm": 7.535406944116635, + "learning_rate": 1.9948613846499425e-05, + "loss": 0.2833, + "step": 1961 + }, + { + "epoch": 0.7978853192354616, + "grad_norm": 12.49387818193982, + "learning_rate": 1.9948511213818933e-05, + "loss": 0.493, + "step": 1962 + }, + { + "epoch": 0.7982919886132575, + "grad_norm": 14.493827555181833, + "learning_rate": 1.994840847901197e-05, + "loss": 0.3774, + "step": 1963 + }, + { + "epoch": 0.7986986579910533, + "grad_norm": 22.85868593531314, + "learning_rate": 1.9948305642079587e-05, + "loss": 0.7207, + "step": 1964 + }, + { + "epoch": 0.7991053273688491, + "grad_norm": 1.3497545770451347, + "learning_rate": 1.9948202703022837e-05, + "loss": 0.0161, + "step": 1965 + }, + { + "epoch": 0.799511996746645, + "grad_norm": 19.270343097578618, + "learning_rate": 1.994809966184278e-05, + "loss": 0.6418, + "step": 1966 + }, + { + "epoch": 0.7999186661244408, + "grad_norm": 7.322467879622389, + "learning_rate": 1.9947996518540477e-05, + "loss": 0.2047, + "step": 1967 + }, + { + "epoch": 0.8003253355022367, + "grad_norm": 12.513981395124723, + "learning_rate": 1.9947893273116982e-05, + "loss": 0.2612, + "step": 1968 + }, + { + "epoch": 0.8007320048800325, + "grad_norm": 22.64577515858154, + "learning_rate": 1.9947789925573355e-05, + "loss": 1.5421, + "step": 1969 + }, + { + "epoch": 0.8011386742578284, + "grad_norm": 10.136175753542565, + "learning_rate": 1.9947686475910656e-05, + "loss": 0.4605, + "step": 1970 + }, + { + "epoch": 0.8015453436356242, + "grad_norm": 18.6543996866193, + "learning_rate": 1.994758292412995e-05, + "loss": 0.3384, + "step": 1971 + }, + { + "epoch": 0.8019520130134201, + "grad_norm": 16.839648765721737, + "learning_rate": 1.9947479270232297e-05, + "loss": 0.4669, + "step": 1972 + }, + { + "epoch": 0.802358682391216, + "grad_norm": 13.441381080845552, + "learning_rate": 1.9947375514218764e-05, + "loss": 0.5354, + "step": 1973 + }, + { + "epoch": 0.8027653517690118, + "grad_norm": 3.666865090757835, + "learning_rate": 1.9947271656090415e-05, + "loss": 0.0396, + "step": 1974 + }, + { + "epoch": 0.8031720211468076, + "grad_norm": 12.694753711951895, + "learning_rate": 1.9947167695848317e-05, + "loss": 0.2272, + "step": 1975 + }, + { + "epoch": 0.8035786905246035, + "grad_norm": 12.999354910935532, + "learning_rate": 1.994706363349354e-05, + "loss": 0.2377, + "step": 1976 + }, + { + "epoch": 0.8039853599023994, + "grad_norm": 35.0534358522722, + "learning_rate": 1.994695946902714e-05, + "loss": 0.3519, + "step": 1977 + }, + { + "epoch": 0.8043920292801952, + "grad_norm": 15.759511649874245, + "learning_rate": 
1.99468552024502e-05, + "loss": 0.4617, + "step": 1978 + }, + { + "epoch": 0.804798698657991, + "grad_norm": 17.492645462699326, + "learning_rate": 1.9946750833763783e-05, + "loss": 0.7984, + "step": 1979 + }, + { + "epoch": 0.8052053680357869, + "grad_norm": 0.9002837326661532, + "learning_rate": 1.9946646362968965e-05, + "loss": 0.013, + "step": 1980 + }, + { + "epoch": 0.8056120374135828, + "grad_norm": 24.799348407955975, + "learning_rate": 1.9946541790066812e-05, + "loss": 1.3735, + "step": 1981 + }, + { + "epoch": 0.8060187067913787, + "grad_norm": 1.2052676853051856, + "learning_rate": 1.9946437115058405e-05, + "loss": 0.0189, + "step": 1982 + }, + { + "epoch": 0.8064253761691744, + "grad_norm": 12.877716991963215, + "learning_rate": 1.9946332337944814e-05, + "loss": 0.6168, + "step": 1983 + }, + { + "epoch": 0.8068320455469703, + "grad_norm": 26.395408136215654, + "learning_rate": 1.9946227458727118e-05, + "loss": 1.3609, + "step": 1984 + }, + { + "epoch": 0.8072387149247662, + "grad_norm": 6.8929873908366535, + "learning_rate": 1.9946122477406392e-05, + "loss": 0.2109, + "step": 1985 + }, + { + "epoch": 0.807645384302562, + "grad_norm": 8.412661221040606, + "learning_rate": 1.994601739398371e-05, + "loss": 0.2058, + "step": 1986 + }, + { + "epoch": 0.8080520536803578, + "grad_norm": 24.356488659694982, + "learning_rate": 1.9945912208460152e-05, + "loss": 0.9394, + "step": 1987 + }, + { + "epoch": 0.8084587230581537, + "grad_norm": 13.356151707173582, + "learning_rate": 1.9945806920836803e-05, + "loss": 0.522, + "step": 1988 + }, + { + "epoch": 0.8088653924359496, + "grad_norm": 12.819275970482913, + "learning_rate": 1.9945701531114737e-05, + "loss": 0.5116, + "step": 1989 + }, + { + "epoch": 0.8092720618137454, + "grad_norm": 12.730327842706002, + "learning_rate": 1.9945596039295044e-05, + "loss": 0.326, + "step": 1990 + }, + { + "epoch": 0.8096787311915413, + "grad_norm": 11.05869001239297, + "learning_rate": 1.99454904453788e-05, + "loss": 0.5271, + "step": 1991 + }, + { + "epoch": 0.8100854005693371, + "grad_norm": 10.842951576180281, + "learning_rate": 1.994538474936709e-05, + "loss": 0.1567, + "step": 1992 + }, + { + "epoch": 0.810492069947133, + "grad_norm": 3.0859720033091054, + "learning_rate": 1.9945278951261e-05, + "loss": 0.0408, + "step": 1993 + }, + { + "epoch": 0.8108987393249288, + "grad_norm": 3.3156198143869338, + "learning_rate": 1.994517305106162e-05, + "loss": 0.0552, + "step": 1994 + }, + { + "epoch": 0.8113054087027247, + "grad_norm": 23.656963520930045, + "learning_rate": 1.994506704877003e-05, + "loss": 0.8139, + "step": 1995 + }, + { + "epoch": 0.8117120780805205, + "grad_norm": 5.027770248803762, + "learning_rate": 1.994496094438732e-05, + "loss": 0.0917, + "step": 1996 + }, + { + "epoch": 0.8121187474583164, + "grad_norm": 23.142114270758068, + "learning_rate": 1.9944854737914582e-05, + "loss": 1.2504, + "step": 1997 + }, + { + "epoch": 0.8125254168361122, + "grad_norm": 8.547659643113446, + "learning_rate": 1.9944748429352904e-05, + "loss": 0.2729, + "step": 1998 + }, + { + "epoch": 0.8129320862139081, + "grad_norm": 9.216745602743648, + "learning_rate": 1.9944642018703382e-05, + "loss": 0.204, + "step": 1999 + }, + { + "epoch": 0.813338755591704, + "grad_norm": 22.053206279186465, + "learning_rate": 1.9944535505967102e-05, + "loss": 1.9055, + "step": 2000 + }, + { + "epoch": 0.8137454249694998, + "grad_norm": 14.462137683005288, + "learning_rate": 1.994442889114516e-05, + "loss": 0.3185, + "step": 2001 + }, + { + "epoch": 0.8141520943472956, + 
"grad_norm": 26.218851291042537, + "learning_rate": 1.994432217423865e-05, + "loss": 0.5951, + "step": 2002 + }, + { + "epoch": 0.8145587637250915, + "grad_norm": 12.520479318648686, + "learning_rate": 1.994421535524867e-05, + "loss": 0.2164, + "step": 2003 + }, + { + "epoch": 0.8149654331028874, + "grad_norm": 13.17907340437112, + "learning_rate": 1.994410843417631e-05, + "loss": 0.2981, + "step": 2004 + }, + { + "epoch": 0.8153721024806833, + "grad_norm": 14.2366347011046, + "learning_rate": 1.994400141102268e-05, + "loss": 0.1622, + "step": 2005 + }, + { + "epoch": 0.815778771858479, + "grad_norm": 7.098115530576877, + "learning_rate": 1.9943894285788866e-05, + "loss": 0.1183, + "step": 2006 + }, + { + "epoch": 0.8161854412362749, + "grad_norm": 7.401970001629291, + "learning_rate": 1.994378705847597e-05, + "loss": 0.128, + "step": 2007 + }, + { + "epoch": 0.8165921106140708, + "grad_norm": 4.264634554890935, + "learning_rate": 1.9943679729085103e-05, + "loss": 0.0579, + "step": 2008 + }, + { + "epoch": 0.8169987799918667, + "grad_norm": 25.3160723858588, + "learning_rate": 1.9943572297617352e-05, + "loss": 0.9199, + "step": 2009 + }, + { + "epoch": 0.8174054493696624, + "grad_norm": 21.27638982240311, + "learning_rate": 1.994346476407383e-05, + "loss": 0.8307, + "step": 2010 + }, + { + "epoch": 0.8178121187474583, + "grad_norm": 16.436503412621956, + "learning_rate": 1.9943357128455635e-05, + "loss": 0.3723, + "step": 2011 + }, + { + "epoch": 0.8182187881252542, + "grad_norm": 3.3769799038404136, + "learning_rate": 1.9943249390763878e-05, + "loss": 0.0484, + "step": 2012 + }, + { + "epoch": 0.81862545750305, + "grad_norm": 8.597060350341515, + "learning_rate": 1.994314155099966e-05, + "loss": 0.2151, + "step": 2013 + }, + { + "epoch": 0.8190321268808459, + "grad_norm": 11.165871715502403, + "learning_rate": 1.994303360916409e-05, + "loss": 0.4446, + "step": 2014 + }, + { + "epoch": 0.8194387962586417, + "grad_norm": 14.730431121512824, + "learning_rate": 1.9942925565258276e-05, + "loss": 0.7322, + "step": 2015 + }, + { + "epoch": 0.8198454656364376, + "grad_norm": 16.357241245525046, + "learning_rate": 1.9942817419283326e-05, + "loss": 0.6612, + "step": 2016 + }, + { + "epoch": 0.8202521350142334, + "grad_norm": 19.48904605207007, + "learning_rate": 1.9942709171240354e-05, + "loss": 0.8339, + "step": 2017 + }, + { + "epoch": 0.8206588043920293, + "grad_norm": 0.6327803948881865, + "learning_rate": 1.9942600821130464e-05, + "loss": 0.0099, + "step": 2018 + }, + { + "epoch": 0.8210654737698251, + "grad_norm": 9.045421915847841, + "learning_rate": 1.9942492368954774e-05, + "loss": 0.3214, + "step": 2019 + }, + { + "epoch": 0.821472143147621, + "grad_norm": 10.01646176027994, + "learning_rate": 1.9942383814714396e-05, + "loss": 0.305, + "step": 2020 + }, + { + "epoch": 0.8218788125254168, + "grad_norm": 9.874314812030734, + "learning_rate": 1.9942275158410447e-05, + "loss": 0.2074, + "step": 2021 + }, + { + "epoch": 0.8222854819032127, + "grad_norm": 7.031219838230026, + "learning_rate": 1.9942166400044036e-05, + "loss": 0.1654, + "step": 2022 + }, + { + "epoch": 0.8226921512810086, + "grad_norm": 2.9922557537569534, + "learning_rate": 1.9942057539616284e-05, + "loss": 0.0425, + "step": 2023 + }, + { + "epoch": 0.8230988206588044, + "grad_norm": 2.808671623798086, + "learning_rate": 1.994194857712831e-05, + "loss": 0.0317, + "step": 2024 + }, + { + "epoch": 0.8235054900366002, + "grad_norm": 19.171197300504634, + "learning_rate": 1.994183951258123e-05, + "loss": 0.3536, + "step": 2025 + }, 
+ { + "epoch": 0.8239121594143961, + "grad_norm": 13.659596888857514, + "learning_rate": 1.9941730345976162e-05, + "loss": 0.5851, + "step": 2026 + }, + { + "epoch": 0.824318828792192, + "grad_norm": 5.142983167545676, + "learning_rate": 1.9941621077314226e-05, + "loss": 0.0289, + "step": 2027 + }, + { + "epoch": 0.8247254981699877, + "grad_norm": 5.9419212111479816, + "learning_rate": 1.9941511706596547e-05, + "loss": 0.1533, + "step": 2028 + }, + { + "epoch": 0.8251321675477836, + "grad_norm": 17.12833253580407, + "learning_rate": 1.994140223382425e-05, + "loss": 1.6309, + "step": 2029 + }, + { + "epoch": 0.8255388369255795, + "grad_norm": 15.162648786134834, + "learning_rate": 1.9941292658998453e-05, + "loss": 0.4744, + "step": 2030 + }, + { + "epoch": 0.8259455063033754, + "grad_norm": 7.377923971479412, + "learning_rate": 1.9941182982120285e-05, + "loss": 0.1988, + "step": 2031 + }, + { + "epoch": 0.8263521756811713, + "grad_norm": 7.336581009850666, + "learning_rate": 1.9941073203190872e-05, + "loss": 0.1705, + "step": 2032 + }, + { + "epoch": 0.826758845058967, + "grad_norm": 6.884158296378955, + "learning_rate": 1.9940963322211336e-05, + "loss": 0.1647, + "step": 2033 + }, + { + "epoch": 0.8271655144367629, + "grad_norm": 1.800750909485248, + "learning_rate": 1.994085333918281e-05, + "loss": 0.0276, + "step": 2034 + }, + { + "epoch": 0.8275721838145588, + "grad_norm": 8.066421702222364, + "learning_rate": 1.994074325410642e-05, + "loss": 0.2997, + "step": 2035 + }, + { + "epoch": 0.8279788531923546, + "grad_norm": 2.1208781606173073, + "learning_rate": 1.9940633066983298e-05, + "loss": 0.0142, + "step": 2036 + }, + { + "epoch": 0.8283855225701505, + "grad_norm": 10.65265169186425, + "learning_rate": 1.9940522777814574e-05, + "loss": 0.2072, + "step": 2037 + }, + { + "epoch": 0.8287921919479463, + "grad_norm": 11.104524421158672, + "learning_rate": 1.9940412386601384e-05, + "loss": 0.4646, + "step": 2038 + }, + { + "epoch": 0.8291988613257422, + "grad_norm": 9.410633141037644, + "learning_rate": 1.9940301893344855e-05, + "loss": 0.2164, + "step": 2039 + }, + { + "epoch": 0.829605530703538, + "grad_norm": 1.6562043734041585, + "learning_rate": 1.9940191298046125e-05, + "loss": 0.0171, + "step": 2040 + }, + { + "epoch": 0.8300122000813339, + "grad_norm": 8.01441948778531, + "learning_rate": 1.9940080600706328e-05, + "loss": 0.1302, + "step": 2041 + }, + { + "epoch": 0.8304188694591297, + "grad_norm": 12.89838821051091, + "learning_rate": 1.9939969801326603e-05, + "loss": 0.5989, + "step": 2042 + }, + { + "epoch": 0.8308255388369256, + "grad_norm": 8.358352450791816, + "learning_rate": 1.993985889990808e-05, + "loss": 0.2742, + "step": 2043 + }, + { + "epoch": 0.8312322082147214, + "grad_norm": 1.9442017708836379, + "learning_rate": 1.993974789645191e-05, + "loss": 0.0268, + "step": 2044 + }, + { + "epoch": 0.8316388775925173, + "grad_norm": 5.0947712772124385, + "learning_rate": 1.9939636790959224e-05, + "loss": 0.0662, + "step": 2045 + }, + { + "epoch": 0.8320455469703132, + "grad_norm": 11.715613828187514, + "learning_rate": 1.9939525583431157e-05, + "loss": 0.3859, + "step": 2046 + }, + { + "epoch": 0.832452216348109, + "grad_norm": 91.21979071509273, + "learning_rate": 1.9939414273868865e-05, + "loss": 0.5977, + "step": 2047 + }, + { + "epoch": 0.8328588857259048, + "grad_norm": 17.61570771749604, + "learning_rate": 1.993930286227348e-05, + "loss": 0.943, + "step": 2048 + }, + { + "epoch": 0.8332655551037007, + "grad_norm": 4.6932192966986745, + "learning_rate": 
1.9939191348646146e-05, + "loss": 0.0664, + "step": 2049 + }, + { + "epoch": 0.8336722244814966, + "grad_norm": 20.36007421918401, + "learning_rate": 1.9939079732988014e-05, + "loss": 0.5743, + "step": 2050 + }, + { + "epoch": 0.8340788938592923, + "grad_norm": 26.0575406178084, + "learning_rate": 1.9938968015300226e-05, + "loss": 1.2461, + "step": 2051 + }, + { + "epoch": 0.8344855632370882, + "grad_norm": 15.585193109781867, + "learning_rate": 1.993885619558393e-05, + "loss": 0.3912, + "step": 2052 + }, + { + "epoch": 0.8348922326148841, + "grad_norm": 0.47922279774459564, + "learning_rate": 1.993874427384027e-05, + "loss": 0.005, + "step": 2053 + }, + { + "epoch": 0.83529890199268, + "grad_norm": 18.776849580892062, + "learning_rate": 1.99386322500704e-05, + "loss": 1.0721, + "step": 2054 + }, + { + "epoch": 0.8357055713704759, + "grad_norm": 28.635371579873173, + "learning_rate": 1.9938520124275468e-05, + "loss": 0.6066, + "step": 2055 + }, + { + "epoch": 0.8361122407482716, + "grad_norm": 18.406454742131633, + "learning_rate": 1.9938407896456623e-05, + "loss": 0.9289, + "step": 2056 + }, + { + "epoch": 0.8365189101260675, + "grad_norm": 7.703411980863079, + "learning_rate": 1.993829556661502e-05, + "loss": 0.1712, + "step": 2057 + }, + { + "epoch": 0.8369255795038634, + "grad_norm": 18.820318779641475, + "learning_rate": 1.9938183134751814e-05, + "loss": 0.5291, + "step": 2058 + }, + { + "epoch": 0.8373322488816592, + "grad_norm": 12.399764123941655, + "learning_rate": 1.9938070600868157e-05, + "loss": 0.8009, + "step": 2059 + }, + { + "epoch": 0.837738918259455, + "grad_norm": 12.820427287843593, + "learning_rate": 1.99379579649652e-05, + "loss": 0.4289, + "step": 2060 + }, + { + "epoch": 0.8381455876372509, + "grad_norm": 22.25704633611564, + "learning_rate": 1.9937845227044102e-05, + "loss": 0.6867, + "step": 2061 + }, + { + "epoch": 0.8385522570150468, + "grad_norm": 5.596863762752662, + "learning_rate": 1.9937732387106022e-05, + "loss": 0.1249, + "step": 2062 + }, + { + "epoch": 0.8389589263928426, + "grad_norm": 2.4062242849214646, + "learning_rate": 1.993761944515212e-05, + "loss": 0.0338, + "step": 2063 + }, + { + "epoch": 0.8393655957706385, + "grad_norm": 17.22609978117106, + "learning_rate": 1.993750640118355e-05, + "loss": 1.0308, + "step": 2064 + }, + { + "epoch": 0.8397722651484343, + "grad_norm": 23.436820303905915, + "learning_rate": 1.9937393255201477e-05, + "loss": 1.0494, + "step": 2065 + }, + { + "epoch": 0.8401789345262302, + "grad_norm": 15.999542552783126, + "learning_rate": 1.993728000720706e-05, + "loss": 0.5737, + "step": 2066 + }, + { + "epoch": 0.840585603904026, + "grad_norm": 27.389800743152612, + "learning_rate": 1.9937166657201463e-05, + "loss": 0.5536, + "step": 2067 + }, + { + "epoch": 0.8409922732818219, + "grad_norm": 10.097714873822554, + "learning_rate": 1.9937053205185848e-05, + "loss": 0.2647, + "step": 2068 + }, + { + "epoch": 0.8413989426596177, + "grad_norm": 3.4058241297188814, + "learning_rate": 1.993693965116138e-05, + "loss": 0.1209, + "step": 2069 + }, + { + "epoch": 0.8418056120374136, + "grad_norm": 9.107380227121117, + "learning_rate": 1.9936825995129228e-05, + "loss": 0.2203, + "step": 2070 + }, + { + "epoch": 0.8422122814152094, + "grad_norm": 2.2603008600841434, + "learning_rate": 1.9936712237090554e-05, + "loss": 0.0455, + "step": 2071 + }, + { + "epoch": 0.8426189507930053, + "grad_norm": 8.411405025756896, + "learning_rate": 1.993659837704653e-05, + "loss": 0.1731, + "step": 2072 + }, + { + "epoch": 0.8430256201708012, + 
"grad_norm": 16.825660472078958, + "learning_rate": 1.993648441499832e-05, + "loss": 0.3734, + "step": 2073 + }, + { + "epoch": 0.843432289548597, + "grad_norm": 8.295058259958234, + "learning_rate": 1.9936370350947098e-05, + "loss": 0.1864, + "step": 2074 + }, + { + "epoch": 0.8438389589263928, + "grad_norm": 14.07023025263079, + "learning_rate": 1.9936256184894032e-05, + "loss": 0.7899, + "step": 2075 + }, + { + "epoch": 0.8442456283041887, + "grad_norm": 7.528112973979489, + "learning_rate": 1.9936141916840296e-05, + "loss": 0.2313, + "step": 2076 + }, + { + "epoch": 0.8446522976819846, + "grad_norm": 5.547662217995996, + "learning_rate": 1.9936027546787063e-05, + "loss": 0.0838, + "step": 2077 + }, + { + "epoch": 0.8450589670597805, + "grad_norm": 16.457787708903748, + "learning_rate": 1.9935913074735508e-05, + "loss": 0.16, + "step": 2078 + }, + { + "epoch": 0.8454656364375762, + "grad_norm": 14.032827167893695, + "learning_rate": 1.9935798500686804e-05, + "loss": 0.3807, + "step": 2079 + }, + { + "epoch": 0.8458723058153721, + "grad_norm": 7.079027693829188, + "learning_rate": 1.9935683824642126e-05, + "loss": 0.1217, + "step": 2080 + }, + { + "epoch": 0.846278975193168, + "grad_norm": 72.0096872952101, + "learning_rate": 1.9935569046602653e-05, + "loss": 0.8899, + "step": 2081 + }, + { + "epoch": 0.8466856445709638, + "grad_norm": 3.967450112332029, + "learning_rate": 1.9935454166569568e-05, + "loss": 0.0942, + "step": 2082 + }, + { + "epoch": 0.8470923139487596, + "grad_norm": 2.5028952990494258, + "learning_rate": 1.993533918454404e-05, + "loss": 0.0394, + "step": 2083 + }, + { + "epoch": 0.8474989833265555, + "grad_norm": 7.538695231337433, + "learning_rate": 1.9935224100527257e-05, + "loss": 0.2199, + "step": 2084 + }, + { + "epoch": 0.8479056527043514, + "grad_norm": 13.620363764131966, + "learning_rate": 1.99351089145204e-05, + "loss": 0.1565, + "step": 2085 + }, + { + "epoch": 0.8483123220821472, + "grad_norm": 14.638588982526175, + "learning_rate": 1.9934993626524646e-05, + "loss": 0.5775, + "step": 2086 + }, + { + "epoch": 0.8487189914599431, + "grad_norm": 8.690356459332133, + "learning_rate": 1.9934878236541185e-05, + "loss": 0.3439, + "step": 2087 + }, + { + "epoch": 0.8491256608377389, + "grad_norm": 16.737511765991137, + "learning_rate": 1.9934762744571197e-05, + "loss": 0.4413, + "step": 2088 + }, + { + "epoch": 0.8495323302155348, + "grad_norm": 2.26536963360341, + "learning_rate": 1.993464715061587e-05, + "loss": 0.0411, + "step": 2089 + }, + { + "epoch": 0.8499389995933306, + "grad_norm": 65.25612654917717, + "learning_rate": 1.993453145467639e-05, + "loss": 1.2167, + "step": 2090 + }, + { + "epoch": 0.8503456689711265, + "grad_norm": 6.0239946756676215, + "learning_rate": 1.9934415656753946e-05, + "loss": 0.0861, + "step": 2091 + }, + { + "epoch": 0.8507523383489223, + "grad_norm": 6.997628565451673, + "learning_rate": 1.9934299756849726e-05, + "loss": 0.3247, + "step": 2092 + }, + { + "epoch": 0.8511590077267182, + "grad_norm": 8.881684681158626, + "learning_rate": 1.9934183754964918e-05, + "loss": 0.1806, + "step": 2093 + }, + { + "epoch": 0.851565677104514, + "grad_norm": 5.956648358288979, + "learning_rate": 1.9934067651100714e-05, + "loss": 0.1219, + "step": 2094 + }, + { + "epoch": 0.8519723464823099, + "grad_norm": 8.758481379673903, + "learning_rate": 1.9933951445258304e-05, + "loss": 0.1668, + "step": 2095 + }, + { + "epoch": 0.8523790158601058, + "grad_norm": 23.993744992762167, + "learning_rate": 1.9933835137438886e-05, + "loss": 0.4982, + "step": 2096 
+ }, + { + "epoch": 0.8527856852379015, + "grad_norm": 5.834133647026547, + "learning_rate": 1.9933718727643648e-05, + "loss": 0.1229, + "step": 2097 + }, + { + "epoch": 0.8531923546156974, + "grad_norm": 2.947290219231841, + "learning_rate": 1.9933602215873787e-05, + "loss": 0.0296, + "step": 2098 + }, + { + "epoch": 0.8535990239934933, + "grad_norm": 16.606249061990663, + "learning_rate": 1.99334856021305e-05, + "loss": 0.4, + "step": 2099 + }, + { + "epoch": 0.8540056933712892, + "grad_norm": 17.091544659738922, + "learning_rate": 1.993336888641499e-05, + "loss": 0.5502, + "step": 2100 + }, + { + "epoch": 0.8544123627490849, + "grad_norm": 28.43191846709287, + "learning_rate": 1.9933252068728443e-05, + "loss": 1.6317, + "step": 2101 + }, + { + "epoch": 0.8548190321268808, + "grad_norm": 13.11632919327444, + "learning_rate": 1.9933135149072066e-05, + "loss": 0.1382, + "step": 2102 + }, + { + "epoch": 0.8552257015046767, + "grad_norm": 17.425364678540205, + "learning_rate": 1.9933018127447057e-05, + "loss": 0.8314, + "step": 2103 + }, + { + "epoch": 0.8556323708824726, + "grad_norm": 4.01308686061664, + "learning_rate": 1.9932901003854616e-05, + "loss": 0.0331, + "step": 2104 + }, + { + "epoch": 0.8560390402602684, + "grad_norm": 24.57821621304979, + "learning_rate": 1.993278377829595e-05, + "loss": 0.071, + "step": 2105 + }, + { + "epoch": 0.8564457096380642, + "grad_norm": 3.174150929660143, + "learning_rate": 1.993266645077226e-05, + "loss": 0.05, + "step": 2106 + }, + { + "epoch": 0.8568523790158601, + "grad_norm": 30.474205325243222, + "learning_rate": 1.9932549021284745e-05, + "loss": 1.291, + "step": 2107 + }, + { + "epoch": 0.857259048393656, + "grad_norm": 2.018930820397115, + "learning_rate": 1.9932431489834617e-05, + "loss": 0.0221, + "step": 2108 + }, + { + "epoch": 0.8576657177714518, + "grad_norm": 10.320104553949, + "learning_rate": 1.9932313856423085e-05, + "loss": 0.2026, + "step": 2109 + }, + { + "epoch": 0.8580723871492476, + "grad_norm": 21.307466183330195, + "learning_rate": 1.993219612105135e-05, + "loss": 0.7822, + "step": 2110 + }, + { + "epoch": 0.8584790565270435, + "grad_norm": 15.304896156852733, + "learning_rate": 1.9932078283720623e-05, + "loss": 0.483, + "step": 2111 + }, + { + "epoch": 0.8588857259048394, + "grad_norm": 6.814356398804827, + "learning_rate": 1.9931960344432113e-05, + "loss": 0.1295, + "step": 2112 + }, + { + "epoch": 0.8592923952826352, + "grad_norm": 13.183908218854162, + "learning_rate": 1.9931842303187033e-05, + "loss": 1.1454, + "step": 2113 + }, + { + "epoch": 0.8596990646604311, + "grad_norm": 4.7532185528838795, + "learning_rate": 1.993172415998659e-05, + "loss": 0.1345, + "step": 2114 + }, + { + "epoch": 0.8601057340382269, + "grad_norm": 8.587236244092798, + "learning_rate": 1.9931605914832003e-05, + "loss": 0.1171, + "step": 2115 + }, + { + "epoch": 0.8605124034160228, + "grad_norm": 22.171938050703307, + "learning_rate": 1.993148756772448e-05, + "loss": 1.2739, + "step": 2116 + }, + { + "epoch": 0.8609190727938186, + "grad_norm": 5.484294276997895, + "learning_rate": 1.9931369118665242e-05, + "loss": 0.2483, + "step": 2117 + }, + { + "epoch": 0.8613257421716145, + "grad_norm": 1.9767767133071443, + "learning_rate": 1.99312505676555e-05, + "loss": 0.0281, + "step": 2118 + }, + { + "epoch": 0.8617324115494104, + "grad_norm": 20.550605343602566, + "learning_rate": 1.9931131914696475e-05, + "loss": 0.915, + "step": 2119 + }, + { + "epoch": 0.8621390809272061, + "grad_norm": 9.298724439381829, + "learning_rate": 1.993101315978938e-05, 
+ "loss": 0.487, + "step": 2120 + }, + { + "epoch": 0.862545750305002, + "grad_norm": 2.6106621310711673, + "learning_rate": 1.9930894302935434e-05, + "loss": 0.033, + "step": 2121 + }, + { + "epoch": 0.8629524196827979, + "grad_norm": 8.569643756521948, + "learning_rate": 1.9930775344135863e-05, + "loss": 0.3399, + "step": 2122 + }, + { + "epoch": 0.8633590890605938, + "grad_norm": 25.27613574939992, + "learning_rate": 1.9930656283391886e-05, + "loss": 0.787, + "step": 2123 + }, + { + "epoch": 0.8637657584383895, + "grad_norm": 48.862229857316464, + "learning_rate": 1.9930537120704725e-05, + "loss": 1.7854, + "step": 2124 + }, + { + "epoch": 0.8641724278161854, + "grad_norm": 16.359779562975735, + "learning_rate": 1.99304178560756e-05, + "loss": 0.4016, + "step": 2125 + }, + { + "epoch": 0.8645790971939813, + "grad_norm": 7.445160449469062, + "learning_rate": 1.993029848950574e-05, + "loss": 0.2289, + "step": 2126 + }, + { + "epoch": 0.8649857665717772, + "grad_norm": 15.922082978060173, + "learning_rate": 1.9930179020996365e-05, + "loss": 0.7027, + "step": 2127 + }, + { + "epoch": 0.865392435949573, + "grad_norm": 15.6003907706361, + "learning_rate": 1.9930059450548705e-05, + "loss": 0.3599, + "step": 2128 + }, + { + "epoch": 0.8657991053273688, + "grad_norm": 11.645941099147622, + "learning_rate": 1.9929939778163992e-05, + "loss": 0.4324, + "step": 2129 + }, + { + "epoch": 0.8662057747051647, + "grad_norm": 16.113543704583083, + "learning_rate": 1.9929820003843447e-05, + "loss": 0.4996, + "step": 2130 + }, + { + "epoch": 0.8666124440829606, + "grad_norm": 10.346569552719773, + "learning_rate": 1.9929700127588302e-05, + "loss": 0.6475, + "step": 2131 + }, + { + "epoch": 0.8670191134607564, + "grad_norm": 25.780087369799862, + "learning_rate": 1.9929580149399785e-05, + "loss": 1.2015, + "step": 2132 + }, + { + "epoch": 0.8674257828385522, + "grad_norm": 9.060583081068238, + "learning_rate": 1.9929460069279133e-05, + "loss": 0.1801, + "step": 2133 + }, + { + "epoch": 0.8678324522163481, + "grad_norm": 12.983010758690911, + "learning_rate": 1.9929339887227573e-05, + "loss": 0.5748, + "step": 2134 + }, + { + "epoch": 0.868239121594144, + "grad_norm": 17.981143816918504, + "learning_rate": 1.9929219603246346e-05, + "loss": 0.3618, + "step": 2135 + }, + { + "epoch": 0.8686457909719398, + "grad_norm": 17.281937496315468, + "learning_rate": 1.992909921733668e-05, + "loss": 1.0228, + "step": 2136 + }, + { + "epoch": 0.8690524603497357, + "grad_norm": 19.029864657084282, + "learning_rate": 1.9928978729499816e-05, + "loss": 0.85, + "step": 2137 + }, + { + "epoch": 0.8694591297275315, + "grad_norm": 20.811224345950286, + "learning_rate": 1.9928858139736984e-05, + "loss": 0.5055, + "step": 2138 + }, + { + "epoch": 0.8698657991053274, + "grad_norm": 1.10626344213499, + "learning_rate": 1.9928737448049427e-05, + "loss": 0.0158, + "step": 2139 + }, + { + "epoch": 0.8702724684831232, + "grad_norm": 9.37943144694748, + "learning_rate": 1.9928616654438385e-05, + "loss": 0.2789, + "step": 2140 + }, + { + "epoch": 0.8706791378609191, + "grad_norm": 14.781184673038954, + "learning_rate": 1.9928495758905095e-05, + "loss": 0.7796, + "step": 2141 + }, + { + "epoch": 0.8710858072387149, + "grad_norm": 1.8200146443076421, + "learning_rate": 1.99283747614508e-05, + "loss": 0.0235, + "step": 2142 + }, + { + "epoch": 0.8714924766165107, + "grad_norm": 4.218597115386158, + "learning_rate": 1.992825366207674e-05, + "loss": 0.0767, + "step": 2143 + }, + { + "epoch": 0.8718991459943066, + "grad_norm": 5.440321785910101, 
+ "learning_rate": 1.992813246078416e-05, + "loss": 0.0578, + "step": 2144 + }, + { + "epoch": 0.8723058153721025, + "grad_norm": 7.485988556510832, + "learning_rate": 1.9928011157574305e-05, + "loss": 0.1245, + "step": 2145 + }, + { + "epoch": 0.8727124847498984, + "grad_norm": 16.145239332690995, + "learning_rate": 1.9927889752448416e-05, + "loss": 0.8726, + "step": 2146 + }, + { + "epoch": 0.8731191541276941, + "grad_norm": 13.958276548269373, + "learning_rate": 1.992776824540774e-05, + "loss": 0.6422, + "step": 2147 + }, + { + "epoch": 0.87352582350549, + "grad_norm": 20.167724991735973, + "learning_rate": 1.9927646636453534e-05, + "loss": 0.8707, + "step": 2148 + }, + { + "epoch": 0.8739324928832859, + "grad_norm": 14.63755605019186, + "learning_rate": 1.9927524925587034e-05, + "loss": 0.5072, + "step": 2149 + }, + { + "epoch": 0.8743391622610818, + "grad_norm": 22.522890748041068, + "learning_rate": 1.9927403112809494e-05, + "loss": 1.1675, + "step": 2150 + }, + { + "epoch": 0.8747458316388776, + "grad_norm": 12.537192323702675, + "learning_rate": 1.9927281198122165e-05, + "loss": 0.3653, + "step": 2151 + }, + { + "epoch": 0.8751525010166734, + "grad_norm": 8.93703042528573, + "learning_rate": 1.9927159181526302e-05, + "loss": 0.3075, + "step": 2152 + }, + { + "epoch": 0.8755591703944693, + "grad_norm": 51.401212776497886, + "learning_rate": 1.9927037063023146e-05, + "loss": 0.6563, + "step": 2153 + }, + { + "epoch": 0.8759658397722652, + "grad_norm": 16.452659137991887, + "learning_rate": 1.9926914842613965e-05, + "loss": 0.32, + "step": 2154 + }, + { + "epoch": 0.876372509150061, + "grad_norm": 8.778652743075163, + "learning_rate": 1.9926792520300005e-05, + "loss": 0.1908, + "step": 2155 + }, + { + "epoch": 0.8767791785278568, + "grad_norm": 3.7583398632013836, + "learning_rate": 1.9926670096082523e-05, + "loss": 0.0282, + "step": 2156 + }, + { + "epoch": 0.8771858479056527, + "grad_norm": 17.902396837787233, + "learning_rate": 1.9926547569962776e-05, + "loss": 0.7322, + "step": 2157 + }, + { + "epoch": 0.8775925172834486, + "grad_norm": 5.768736888251965, + "learning_rate": 1.9926424941942025e-05, + "loss": 0.2059, + "step": 2158 + }, + { + "epoch": 0.8779991866612444, + "grad_norm": 22.359019295769087, + "learning_rate": 1.9926302212021523e-05, + "loss": 1.5518, + "step": 2159 + }, + { + "epoch": 0.8784058560390403, + "grad_norm": 13.9974608926407, + "learning_rate": 1.9926179380202538e-05, + "loss": 0.4049, + "step": 2160 + }, + { + "epoch": 0.8788125254168361, + "grad_norm": 28.060455353978533, + "learning_rate": 1.9926056446486322e-05, + "loss": 0.8947, + "step": 2161 + }, + { + "epoch": 0.879219194794632, + "grad_norm": 17.606757137806888, + "learning_rate": 1.992593341087414e-05, + "loss": 0.6415, + "step": 2162 + }, + { + "epoch": 0.8796258641724278, + "grad_norm": 3.6929027726119616, + "learning_rate": 1.9925810273367255e-05, + "loss": 0.0402, + "step": 2163 + }, + { + "epoch": 0.8800325335502237, + "grad_norm": 7.295708118304406, + "learning_rate": 1.9925687033966935e-05, + "loss": 0.1367, + "step": 2164 + }, + { + "epoch": 0.8804392029280195, + "grad_norm": 7.234051202042791, + "learning_rate": 1.9925563692674443e-05, + "loss": 0.1506, + "step": 2165 + }, + { + "epoch": 0.8808458723058153, + "grad_norm": 16.629754465917106, + "learning_rate": 1.992544024949104e-05, + "loss": 1.1415, + "step": 2166 + }, + { + "epoch": 0.8812525416836112, + "grad_norm": 15.71969427189399, + "learning_rate": 1.9925316704418e-05, + "loss": 1.0036, + "step": 2167 + }, + { + "epoch": 
0.8816592110614071, + "grad_norm": 14.772521177163878, + "learning_rate": 1.992519305745659e-05, + "loss": 0.4746, + "step": 2168 + }, + { + "epoch": 0.882065880439203, + "grad_norm": 10.73842522022741, + "learning_rate": 1.9925069308608077e-05, + "loss": 0.7222, + "step": 2169 + }, + { + "epoch": 0.8824725498169987, + "grad_norm": 14.743434682585818, + "learning_rate": 1.9924945457873732e-05, + "loss": 0.3648, + "step": 2170 + }, + { + "epoch": 0.8828792191947946, + "grad_norm": 38.09745855640946, + "learning_rate": 1.9924821505254828e-05, + "loss": 0.6036, + "step": 2171 + }, + { + "epoch": 0.8832858885725905, + "grad_norm": 8.678944161231255, + "learning_rate": 1.9924697450752636e-05, + "loss": 0.1038, + "step": 2172 + }, + { + "epoch": 0.8836925579503864, + "grad_norm": 29.20128646929139, + "learning_rate": 1.9924573294368427e-05, + "loss": 2.1396, + "step": 2173 + }, + { + "epoch": 0.8840992273281821, + "grad_norm": 17.33439597412145, + "learning_rate": 1.992444903610348e-05, + "loss": 0.7308, + "step": 2174 + }, + { + "epoch": 0.884505896705978, + "grad_norm": 9.394336481452127, + "learning_rate": 1.992432467595907e-05, + "loss": 0.1835, + "step": 2175 + }, + { + "epoch": 0.8849125660837739, + "grad_norm": 19.330627191937456, + "learning_rate": 1.992420021393647e-05, + "loss": 0.8347, + "step": 2176 + }, + { + "epoch": 0.8853192354615698, + "grad_norm": 4.722639427791571, + "learning_rate": 1.992407565003696e-05, + "loss": 0.0877, + "step": 2177 + }, + { + "epoch": 0.8857259048393656, + "grad_norm": 7.995578693199461, + "learning_rate": 1.9923950984261823e-05, + "loss": 0.1306, + "step": 2178 + }, + { + "epoch": 0.8861325742171614, + "grad_norm": 14.769478061058644, + "learning_rate": 1.992382621661233e-05, + "loss": 0.2586, + "step": 2179 + }, + { + "epoch": 0.8865392435949573, + "grad_norm": 10.254459663261647, + "learning_rate": 1.9923701347089768e-05, + "loss": 0.3613, + "step": 2180 + }, + { + "epoch": 0.8869459129727532, + "grad_norm": 9.920896371408828, + "learning_rate": 1.9923576375695414e-05, + "loss": 0.1685, + "step": 2181 + }, + { + "epoch": 0.887352582350549, + "grad_norm": 14.80559966421379, + "learning_rate": 1.992345130243056e-05, + "loss": 1.1234, + "step": 2182 + }, + { + "epoch": 0.8877592517283448, + "grad_norm": 69.47815731598428, + "learning_rate": 1.992332612729648e-05, + "loss": 0.9145, + "step": 2183 + }, + { + "epoch": 0.8881659211061407, + "grad_norm": 13.731016102024133, + "learning_rate": 1.9923200850294464e-05, + "loss": 0.3693, + "step": 2184 + }, + { + "epoch": 0.8885725904839366, + "grad_norm": 19.22281646143345, + "learning_rate": 1.9923075471425796e-05, + "loss": 1.4127, + "step": 2185 + }, + { + "epoch": 0.8889792598617324, + "grad_norm": 14.310160511565837, + "learning_rate": 1.9922949990691767e-05, + "loss": 0.593, + "step": 2186 + }, + { + "epoch": 0.8893859292395283, + "grad_norm": 7.583591635337189, + "learning_rate": 1.9922824408093658e-05, + "loss": 0.1552, + "step": 2187 + }, + { + "epoch": 0.8897925986173241, + "grad_norm": 14.025142187784931, + "learning_rate": 1.992269872363277e-05, + "loss": 0.101, + "step": 2188 + }, + { + "epoch": 0.89019926799512, + "grad_norm": 8.624367642399209, + "learning_rate": 1.9922572937310377e-05, + "loss": 0.2272, + "step": 2189 + }, + { + "epoch": 0.8906059373729158, + "grad_norm": 3.5109311013709363, + "learning_rate": 1.992244704912778e-05, + "loss": 0.0769, + "step": 2190 + }, + { + "epoch": 0.8910126067507117, + "grad_norm": 20.585250229012743, + "learning_rate": 1.9922321059086272e-05, + "loss": 
1.7423, + "step": 2191 + }, + { + "epoch": 0.8914192761285076, + "grad_norm": 6.809188503829421, + "learning_rate": 1.9922194967187147e-05, + "loss": 0.0361, + "step": 2192 + }, + { + "epoch": 0.8918259455063033, + "grad_norm": 15.785312308791065, + "learning_rate": 1.9922068773431696e-05, + "loss": 0.5956, + "step": 2193 + }, + { + "epoch": 0.8922326148840992, + "grad_norm": 10.633583393909731, + "learning_rate": 1.9921942477821214e-05, + "loss": 0.3601, + "step": 2194 + }, + { + "epoch": 0.8926392842618951, + "grad_norm": 16.920948071752925, + "learning_rate": 1.9921816080357e-05, + "loss": 0.602, + "step": 2195 + }, + { + "epoch": 0.893045953639691, + "grad_norm": 5.88325610976127, + "learning_rate": 1.992168958104035e-05, + "loss": 0.0815, + "step": 2196 + }, + { + "epoch": 0.8934526230174867, + "grad_norm": 6.389994032573293, + "learning_rate": 1.992156297987256e-05, + "loss": 0.1351, + "step": 2197 + }, + { + "epoch": 0.8938592923952826, + "grad_norm": 2.2174736318690966, + "learning_rate": 1.9921436276854935e-05, + "loss": 0.0173, + "step": 2198 + }, + { + "epoch": 0.8942659617730785, + "grad_norm": 14.30322065798344, + "learning_rate": 1.9921309471988774e-05, + "loss": 1.0326, + "step": 2199 + }, + { + "epoch": 0.8946726311508744, + "grad_norm": 19.98573066935293, + "learning_rate": 1.9921182565275378e-05, + "loss": 0.8138, + "step": 2200 + }, + { + "epoch": 0.8950793005286702, + "grad_norm": 16.78557964612242, + "learning_rate": 1.992105555671605e-05, + "loss": 0.6017, + "step": 2201 + }, + { + "epoch": 0.895485969906466, + "grad_norm": 4.782045592460802, + "learning_rate": 1.992092844631209e-05, + "loss": 0.0655, + "step": 2202 + }, + { + "epoch": 0.8958926392842619, + "grad_norm": 31.15757271915579, + "learning_rate": 1.992080123406481e-05, + "loss": 1.6663, + "step": 2203 + }, + { + "epoch": 0.8962993086620578, + "grad_norm": 16.334481402210567, + "learning_rate": 1.992067391997551e-05, + "loss": 0.6512, + "step": 2204 + }, + { + "epoch": 0.8967059780398536, + "grad_norm": 15.188109909328519, + "learning_rate": 1.9920546504045505e-05, + "loss": 0.5634, + "step": 2205 + }, + { + "epoch": 0.8971126474176494, + "grad_norm": 29.971344798935654, + "learning_rate": 1.992041898627609e-05, + "loss": 0.7362, + "step": 2206 + }, + { + "epoch": 0.8975193167954453, + "grad_norm": 3.9123230993365055, + "learning_rate": 1.9920291366668584e-05, + "loss": 0.0422, + "step": 2207 + }, + { + "epoch": 0.8979259861732412, + "grad_norm": 5.795371890474043, + "learning_rate": 1.9920163645224296e-05, + "loss": 0.1476, + "step": 2208 + }, + { + "epoch": 0.898332655551037, + "grad_norm": 13.005888783266544, + "learning_rate": 1.992003582194453e-05, + "loss": 0.4547, + "step": 2209 + }, + { + "epoch": 0.8987393249288329, + "grad_norm": 10.401462587802452, + "learning_rate": 1.9919907896830612e-05, + "loss": 0.1508, + "step": 2210 + }, + { + "epoch": 0.8991459943066287, + "grad_norm": 45.95903513341711, + "learning_rate": 1.991977986988384e-05, + "loss": 0.4568, + "step": 2211 + }, + { + "epoch": 0.8995526636844245, + "grad_norm": 7.482020469471845, + "learning_rate": 1.9919651741105535e-05, + "loss": 0.3338, + "step": 2212 + }, + { + "epoch": 0.8999593330622204, + "grad_norm": 9.066596619985106, + "learning_rate": 1.9919523510497017e-05, + "loss": 0.1367, + "step": 2213 + }, + { + "epoch": 0.9003660024400163, + "grad_norm": 13.865777728876155, + "learning_rate": 1.9919395178059594e-05, + "loss": 0.5002, + "step": 2214 + }, + { + "epoch": 0.9007726718178121, + "grad_norm": 12.408720121596994, + 
"learning_rate": 1.991926674379459e-05, + "loss": 0.3994, + "step": 2215 + }, + { + "epoch": 0.9011793411956079, + "grad_norm": 10.23598625743683, + "learning_rate": 1.9919138207703322e-05, + "loss": 0.271, + "step": 2216 + }, + { + "epoch": 0.9015860105734038, + "grad_norm": 10.24569237759494, + "learning_rate": 1.9919009569787105e-05, + "loss": 0.3743, + "step": 2217 + }, + { + "epoch": 0.9019926799511997, + "grad_norm": 14.098535554195935, + "learning_rate": 1.9918880830047262e-05, + "loss": 0.5599, + "step": 2218 + }, + { + "epoch": 0.9023993493289956, + "grad_norm": 16.361113817198728, + "learning_rate": 1.9918751988485117e-05, + "loss": 0.7408, + "step": 2219 + }, + { + "epoch": 0.9028060187067913, + "grad_norm": 15.617703487603634, + "learning_rate": 1.991862304510199e-05, + "loss": 0.783, + "step": 2220 + }, + { + "epoch": 0.9032126880845872, + "grad_norm": 11.73333491737993, + "learning_rate": 1.9918493999899206e-05, + "loss": 0.577, + "step": 2221 + }, + { + "epoch": 0.9036193574623831, + "grad_norm": 18.141541450193703, + "learning_rate": 1.9918364852878088e-05, + "loss": 0.7813, + "step": 2222 + }, + { + "epoch": 0.904026026840179, + "grad_norm": 13.879185587468584, + "learning_rate": 1.9918235604039965e-05, + "loss": 0.6351, + "step": 2223 + }, + { + "epoch": 0.9044326962179748, + "grad_norm": 16.451527991228325, + "learning_rate": 1.991810625338616e-05, + "loss": 0.6411, + "step": 2224 + }, + { + "epoch": 0.9048393655957706, + "grad_norm": 4.549091977493769, + "learning_rate": 1.9917976800918003e-05, + "loss": 0.0495, + "step": 2225 + }, + { + "epoch": 0.9052460349735665, + "grad_norm": 17.93334176819827, + "learning_rate": 1.9917847246636827e-05, + "loss": 0.3748, + "step": 2226 + }, + { + "epoch": 0.9056527043513624, + "grad_norm": 17.03956685785701, + "learning_rate": 1.991771759054395e-05, + "loss": 0.552, + "step": 2227 + }, + { + "epoch": 0.9060593737291582, + "grad_norm": 5.734131185426635, + "learning_rate": 1.9917587832640714e-05, + "loss": 0.1815, + "step": 2228 + }, + { + "epoch": 0.906466043106954, + "grad_norm": 14.716692910083468, + "learning_rate": 1.9917457972928446e-05, + "loss": 0.8331, + "step": 2229 + }, + { + "epoch": 0.9068727124847499, + "grad_norm": 11.731423146398319, + "learning_rate": 1.9917328011408483e-05, + "loss": 0.488, + "step": 2230 + }, + { + "epoch": 0.9072793818625458, + "grad_norm": 7.059727408312487, + "learning_rate": 1.9917197948082157e-05, + "loss": 0.0479, + "step": 2231 + }, + { + "epoch": 0.9076860512403416, + "grad_norm": 9.315269802363824, + "learning_rate": 1.9917067782950798e-05, + "loss": 0.1493, + "step": 2232 + }, + { + "epoch": 0.9080927206181375, + "grad_norm": 21.638614696127927, + "learning_rate": 1.991693751601575e-05, + "loss": 1.7409, + "step": 2233 + }, + { + "epoch": 0.9084993899959333, + "grad_norm": 8.640441460818767, + "learning_rate": 1.991680714727835e-05, + "loss": 0.2316, + "step": 2234 + }, + { + "epoch": 0.9089060593737291, + "grad_norm": 11.758846727617437, + "learning_rate": 1.991667667673993e-05, + "loss": 0.2001, + "step": 2235 + }, + { + "epoch": 0.909312728751525, + "grad_norm": 15.122367478161598, + "learning_rate": 1.9916546104401836e-05, + "loss": 0.595, + "step": 2236 + }, + { + "epoch": 0.9097193981293209, + "grad_norm": 17.378857160032894, + "learning_rate": 1.9916415430265404e-05, + "loss": 1.0186, + "step": 2237 + }, + { + "epoch": 0.9101260675071167, + "grad_norm": 24.04598312756523, + "learning_rate": 1.9916284654331973e-05, + "loss": 1.1282, + "step": 2238 + }, + { + "epoch": 
0.9105327368849125, + "grad_norm": 17.354636098724267, + "learning_rate": 1.9916153776602895e-05, + "loss": 0.6211, + "step": 2239 + }, + { + "epoch": 0.9109394062627084, + "grad_norm": 9.039458496858193, + "learning_rate": 1.9916022797079505e-05, + "loss": 0.265, + "step": 2240 + }, + { + "epoch": 0.9113460756405043, + "grad_norm": 10.634563371260606, + "learning_rate": 1.991589171576315e-05, + "loss": 0.3207, + "step": 2241 + }, + { + "epoch": 0.9117527450183002, + "grad_norm": 9.732655227580516, + "learning_rate": 1.991576053265518e-05, + "loss": 0.1459, + "step": 2242 + }, + { + "epoch": 0.9121594143960959, + "grad_norm": 14.150545251816283, + "learning_rate": 1.9915629247756933e-05, + "loss": 0.3214, + "step": 2243 + }, + { + "epoch": 0.9125660837738918, + "grad_norm": 18.483233690699127, + "learning_rate": 1.991549786106976e-05, + "loss": 1.1412, + "step": 2244 + }, + { + "epoch": 0.9129727531516877, + "grad_norm": 2.9129885109321783, + "learning_rate": 1.9915366372595017e-05, + "loss": 0.0402, + "step": 2245 + }, + { + "epoch": 0.9133794225294836, + "grad_norm": 18.427204764598862, + "learning_rate": 1.9915234782334045e-05, + "loss": 1.241, + "step": 2246 + }, + { + "epoch": 0.9137860919072793, + "grad_norm": 9.833415017792232, + "learning_rate": 1.9915103090288196e-05, + "loss": 0.173, + "step": 2247 + }, + { + "epoch": 0.9141927612850752, + "grad_norm": 7.23616684207515, + "learning_rate": 1.9914971296458823e-05, + "loss": 0.1584, + "step": 2248 + }, + { + "epoch": 0.9145994306628711, + "grad_norm": 15.095861375188752, + "learning_rate": 1.9914839400847285e-05, + "loss": 0.5369, + "step": 2249 + }, + { + "epoch": 0.915006100040667, + "grad_norm": 18.998536780748797, + "learning_rate": 1.991470740345493e-05, + "loss": 0.7062, + "step": 2250 + }, + { + "epoch": 0.9154127694184628, + "grad_norm": 11.330411375761011, + "learning_rate": 1.9914575304283107e-05, + "loss": 0.4126, + "step": 2251 + }, + { + "epoch": 0.9158194387962586, + "grad_norm": 7.852220948594977, + "learning_rate": 1.9914443103333183e-05, + "loss": 0.561, + "step": 2252 + }, + { + "epoch": 0.9162261081740545, + "grad_norm": 15.464049575688872, + "learning_rate": 1.991431080060651e-05, + "loss": 0.3156, + "step": 2253 + }, + { + "epoch": 0.9166327775518504, + "grad_norm": 8.278767483760198, + "learning_rate": 1.9914178396104447e-05, + "loss": 0.2507, + "step": 2254 + }, + { + "epoch": 0.9170394469296462, + "grad_norm": 12.853188873364784, + "learning_rate": 1.9914045889828354e-05, + "loss": 0.2859, + "step": 2255 + }, + { + "epoch": 0.917446116307442, + "grad_norm": 14.022549987129382, + "learning_rate": 1.9913913281779586e-05, + "loss": 0.5613, + "step": 2256 + }, + { + "epoch": 0.9178527856852379, + "grad_norm": 6.365199626128816, + "learning_rate": 1.9913780571959514e-05, + "loss": 0.1021, + "step": 2257 + }, + { + "epoch": 0.9182594550630337, + "grad_norm": 20.870928059183882, + "learning_rate": 1.991364776036949e-05, + "loss": 0.5852, + "step": 2258 + }, + { + "epoch": 0.9186661244408296, + "grad_norm": 25.91951438187393, + "learning_rate": 1.9913514847010887e-05, + "loss": 1.1151, + "step": 2259 + }, + { + "epoch": 0.9190727938186255, + "grad_norm": 16.451343597526712, + "learning_rate": 1.991338183188506e-05, + "loss": 0.8435, + "step": 2260 + }, + { + "epoch": 0.9194794631964213, + "grad_norm": 10.231898741439748, + "learning_rate": 1.991324871499338e-05, + "loss": 0.2231, + "step": 2261 + }, + { + "epoch": 0.9198861325742171, + "grad_norm": 10.528953973812603, + "learning_rate": 1.991311549633722e-05, + 
"loss": 0.4295, + "step": 2262 + }, + { + "epoch": 0.920292801952013, + "grad_norm": 14.048665830119436, + "learning_rate": 1.9912982175917933e-05, + "loss": 0.4628, + "step": 2263 + }, + { + "epoch": 0.9206994713298089, + "grad_norm": 12.878380326202485, + "learning_rate": 1.9912848753736894e-05, + "loss": 0.3509, + "step": 2264 + }, + { + "epoch": 0.9211061407076048, + "grad_norm": 16.13172842596463, + "learning_rate": 1.991271522979548e-05, + "loss": 0.441, + "step": 2265 + }, + { + "epoch": 0.9215128100854005, + "grad_norm": 7.252107644815567, + "learning_rate": 1.9912581604095053e-05, + "loss": 0.2915, + "step": 2266 + }, + { + "epoch": 0.9219194794631964, + "grad_norm": 17.94229753627952, + "learning_rate": 1.9912447876636984e-05, + "loss": 0.7213, + "step": 2267 + }, + { + "epoch": 0.9223261488409923, + "grad_norm": 12.677891430126603, + "learning_rate": 1.9912314047422648e-05, + "loss": 0.4693, + "step": 2268 + }, + { + "epoch": 0.9227328182187882, + "grad_norm": 15.240991079734659, + "learning_rate": 1.991218011645342e-05, + "loss": 0.2337, + "step": 2269 + }, + { + "epoch": 0.9231394875965839, + "grad_norm": 16.843954986586997, + "learning_rate": 1.9912046083730678e-05, + "loss": 0.5684, + "step": 2270 + }, + { + "epoch": 0.9235461569743798, + "grad_norm": 10.129545932260294, + "learning_rate": 1.9911911949255792e-05, + "loss": 0.2936, + "step": 2271 + }, + { + "epoch": 0.9239528263521757, + "grad_norm": 8.585427953191182, + "learning_rate": 1.991177771303014e-05, + "loss": 0.1548, + "step": 2272 + }, + { + "epoch": 0.9243594957299716, + "grad_norm": 5.281157117512158, + "learning_rate": 1.991164337505511e-05, + "loss": 0.067, + "step": 2273 + }, + { + "epoch": 0.9247661651077674, + "grad_norm": 13.472239584520041, + "learning_rate": 1.991150893533206e-05, + "loss": 0.568, + "step": 2274 + }, + { + "epoch": 0.9251728344855632, + "grad_norm": 1.8290776044323482, + "learning_rate": 1.9911374393862388e-05, + "loss": 0.0216, + "step": 2275 + }, + { + "epoch": 0.9255795038633591, + "grad_norm": 13.303465285165224, + "learning_rate": 1.991123975064747e-05, + "loss": 0.66, + "step": 2276 + }, + { + "epoch": 0.925986173241155, + "grad_norm": 11.382273988873527, + "learning_rate": 1.9911105005688687e-05, + "loss": 0.4959, + "step": 2277 + }, + { + "epoch": 0.9263928426189508, + "grad_norm": 16.530694887468137, + "learning_rate": 1.9910970158987424e-05, + "loss": 0.3851, + "step": 2278 + }, + { + "epoch": 0.9267995119967466, + "grad_norm": 0.6582261323555657, + "learning_rate": 1.991083521054506e-05, + "loss": 0.0131, + "step": 2279 + }, + { + "epoch": 0.9272061813745425, + "grad_norm": 11.873828985330475, + "learning_rate": 1.991070016036299e-05, + "loss": 0.4243, + "step": 2280 + }, + { + "epoch": 0.9276128507523383, + "grad_norm": 13.19408807633938, + "learning_rate": 1.9910565008442594e-05, + "loss": 0.3634, + "step": 2281 + }, + { + "epoch": 0.9280195201301342, + "grad_norm": 5.060443171287354, + "learning_rate": 1.991042975478526e-05, + "loss": 0.0786, + "step": 2282 + }, + { + "epoch": 0.9284261895079301, + "grad_norm": 14.147413649250902, + "learning_rate": 1.9910294399392374e-05, + "loss": 0.6292, + "step": 2283 + }, + { + "epoch": 0.9288328588857259, + "grad_norm": 3.3413581283877307, + "learning_rate": 1.991015894226533e-05, + "loss": 0.0604, + "step": 2284 + }, + { + "epoch": 0.9292395282635217, + "grad_norm": 12.658085886158208, + "learning_rate": 1.9910023383405517e-05, + "loss": 0.3572, + "step": 2285 + }, + { + "epoch": 0.9296461976413176, + "grad_norm": 7.176103317237001, 
+ "learning_rate": 1.9909887722814324e-05, + "loss": 0.2363, + "step": 2286 + }, + { + "epoch": 0.9300528670191135, + "grad_norm": 3.7715413107110547, + "learning_rate": 1.990975196049315e-05, + "loss": 0.1101, + "step": 2287 + }, + { + "epoch": 0.9304595363969093, + "grad_norm": 18.474138087469882, + "learning_rate": 1.9909616096443382e-05, + "loss": 0.5077, + "step": 2288 + }, + { + "epoch": 0.9308662057747051, + "grad_norm": 9.712524442062037, + "learning_rate": 1.9909480130666418e-05, + "loss": 0.1923, + "step": 2289 + }, + { + "epoch": 0.931272875152501, + "grad_norm": 0.6616062944371331, + "learning_rate": 1.9909344063163653e-05, + "loss": 0.0097, + "step": 2290 + }, + { + "epoch": 0.9316795445302969, + "grad_norm": 9.218441410173302, + "learning_rate": 1.9909207893936483e-05, + "loss": 0.23, + "step": 2291 + }, + { + "epoch": 0.9320862139080928, + "grad_norm": 9.027012820055532, + "learning_rate": 1.990907162298631e-05, + "loss": 0.1935, + "step": 2292 + }, + { + "epoch": 0.9324928832858885, + "grad_norm": 8.103218427797762, + "learning_rate": 1.9908935250314524e-05, + "loss": 0.2742, + "step": 2293 + }, + { + "epoch": 0.9328995526636844, + "grad_norm": 3.3914341950495497, + "learning_rate": 1.990879877592254e-05, + "loss": 0.18, + "step": 2294 + }, + { + "epoch": 0.9333062220414803, + "grad_norm": 9.760657223169, + "learning_rate": 1.990866219981174e-05, + "loss": 0.2778, + "step": 2295 + }, + { + "epoch": 0.9337128914192762, + "grad_norm": 7.638329180961594, + "learning_rate": 1.9908525521983538e-05, + "loss": 0.3294, + "step": 2296 + }, + { + "epoch": 0.934119560797072, + "grad_norm": 7.936256642523004, + "learning_rate": 1.9908388742439338e-05, + "loss": 0.394, + "step": 2297 + }, + { + "epoch": 0.9345262301748678, + "grad_norm": 23.874235956557875, + "learning_rate": 1.990825186118054e-05, + "loss": 0.3715, + "step": 2298 + }, + { + "epoch": 0.9349328995526637, + "grad_norm": 7.308457794897202, + "learning_rate": 1.9908114878208545e-05, + "loss": 0.1364, + "step": 2299 + }, + { + "epoch": 0.9353395689304596, + "grad_norm": 11.407044875931255, + "learning_rate": 1.990797779352477e-05, + "loss": 0.5398, + "step": 2300 + }, + { + "epoch": 0.9357462383082554, + "grad_norm": 9.280295927472478, + "learning_rate": 1.9907840607130617e-05, + "loss": 0.2007, + "step": 2301 + }, + { + "epoch": 0.9361529076860512, + "grad_norm": 20.618412692337476, + "learning_rate": 1.990770331902749e-05, + "loss": 0.7581, + "step": 2302 + }, + { + "epoch": 0.9365595770638471, + "grad_norm": 7.335251411727146, + "learning_rate": 1.99075659292168e-05, + "loss": 0.1971, + "step": 2303 + }, + { + "epoch": 0.936966246441643, + "grad_norm": 29.813517487154947, + "learning_rate": 1.9907428437699964e-05, + "loss": 1.9747, + "step": 2304 + }, + { + "epoch": 0.9373729158194388, + "grad_norm": 1.6345820505059923, + "learning_rate": 1.9907290844478385e-05, + "loss": 0.0281, + "step": 2305 + }, + { + "epoch": 0.9377795851972347, + "grad_norm": 3.312064513053262, + "learning_rate": 1.9907153149553482e-05, + "loss": 0.124, + "step": 2306 + }, + { + "epoch": 0.9381862545750305, + "grad_norm": 8.145251341330976, + "learning_rate": 1.9907015352926664e-05, + "loss": 0.14, + "step": 2307 + }, + { + "epoch": 0.9385929239528263, + "grad_norm": 19.647737646057543, + "learning_rate": 1.990687745459935e-05, + "loss": 0.5438, + "step": 2308 + }, + { + "epoch": 0.9389995933306222, + "grad_norm": 15.20576532032338, + "learning_rate": 1.990673945457295e-05, + "loss": 0.5096, + "step": 2309 + }, + { + "epoch": 0.9394062627084181, + 
"grad_norm": 12.701000106249253, + "learning_rate": 1.9906601352848884e-05, + "loss": 0.479, + "step": 2310 + }, + { + "epoch": 0.9398129320862139, + "grad_norm": 11.363288002390805, + "learning_rate": 1.990646314942857e-05, + "loss": 0.3404, + "step": 2311 + }, + { + "epoch": 0.9402196014640097, + "grad_norm": 8.89714526775571, + "learning_rate": 1.9906324844313424e-05, + "loss": 0.3006, + "step": 2312 + }, + { + "epoch": 0.9406262708418056, + "grad_norm": 2.5560475191416936, + "learning_rate": 1.990618643750487e-05, + "loss": 0.0336, + "step": 2313 + }, + { + "epoch": 0.9410329402196015, + "grad_norm": 11.579638525836167, + "learning_rate": 1.9906047929004328e-05, + "loss": 0.2991, + "step": 2314 + }, + { + "epoch": 0.9414396095973974, + "grad_norm": 12.289030008794487, + "learning_rate": 1.9905909318813216e-05, + "loss": 0.3956, + "step": 2315 + }, + { + "epoch": 0.9418462789751931, + "grad_norm": 4.822201160166208, + "learning_rate": 1.990577060693296e-05, + "loss": 0.0913, + "step": 2316 + }, + { + "epoch": 0.942252948352989, + "grad_norm": 9.766884298418775, + "learning_rate": 1.9905631793364983e-05, + "loss": 0.3243, + "step": 2317 + }, + { + "epoch": 0.9426596177307849, + "grad_norm": 17.90390664629689, + "learning_rate": 1.990549287811071e-05, + "loss": 1.657, + "step": 2318 + }, + { + "epoch": 0.9430662871085808, + "grad_norm": 20.241906074678873, + "learning_rate": 1.990535386117157e-05, + "loss": 0.5393, + "step": 2319 + }, + { + "epoch": 0.9434729564863765, + "grad_norm": 22.836591001031742, + "learning_rate": 1.9905214742548982e-05, + "loss": 0.8688, + "step": 2320 + }, + { + "epoch": 0.9438796258641724, + "grad_norm": 17.49236636285456, + "learning_rate": 1.9905075522244383e-05, + "loss": 1.2791, + "step": 2321 + }, + { + "epoch": 0.9442862952419683, + "grad_norm": 17.530036066775274, + "learning_rate": 1.99049362002592e-05, + "loss": 0.5494, + "step": 2322 + }, + { + "epoch": 0.9446929646197642, + "grad_norm": 51.100728459587486, + "learning_rate": 1.9904796776594858e-05, + "loss": 0.8748, + "step": 2323 + }, + { + "epoch": 0.94509963399756, + "grad_norm": 17.306998904602693, + "learning_rate": 1.9904657251252796e-05, + "loss": 0.6639, + "step": 2324 + }, + { + "epoch": 0.9455063033753558, + "grad_norm": 26.382050221935742, + "learning_rate": 1.9904517624234438e-05, + "loss": 1.0723, + "step": 2325 + }, + { + "epoch": 0.9459129727531517, + "grad_norm": 17.889772964994524, + "learning_rate": 1.9904377895541226e-05, + "loss": 0.7925, + "step": 2326 + }, + { + "epoch": 0.9463196421309475, + "grad_norm": 8.31447127115715, + "learning_rate": 1.9904238065174586e-05, + "loss": 0.2557, + "step": 2327 + }, + { + "epoch": 0.9467263115087434, + "grad_norm": 11.816694881464768, + "learning_rate": 1.9904098133135964e-05, + "loss": 0.4626, + "step": 2328 + }, + { + "epoch": 0.9471329808865392, + "grad_norm": 10.217904655881291, + "learning_rate": 1.9903958099426784e-05, + "loss": 0.5692, + "step": 2329 + }, + { + "epoch": 0.9475396502643351, + "grad_norm": 17.343227976598083, + "learning_rate": 1.990381796404849e-05, + "loss": 0.9618, + "step": 2330 + }, + { + "epoch": 0.9479463196421309, + "grad_norm": 3.825384048161816, + "learning_rate": 1.9903677727002525e-05, + "loss": 0.0533, + "step": 2331 + }, + { + "epoch": 0.9483529890199268, + "grad_norm": 1.0961103471819813, + "learning_rate": 1.9903537388290316e-05, + "loss": 0.0152, + "step": 2332 + }, + { + "epoch": 0.9487596583977227, + "grad_norm": 28.366880579692957, + "learning_rate": 1.9903396947913317e-05, + "loss": 1.6647, + "step": 
2333 + }, + { + "epoch": 0.9491663277755185, + "grad_norm": 9.37739382011478, + "learning_rate": 1.9903256405872963e-05, + "loss": 0.2276, + "step": 2334 + }, + { + "epoch": 0.9495729971533143, + "grad_norm": 6.645416322445524, + "learning_rate": 1.99031157621707e-05, + "loss": 0.2063, + "step": 2335 + }, + { + "epoch": 0.9499796665311102, + "grad_norm": 8.907531280685378, + "learning_rate": 1.9902975016807967e-05, + "loss": 0.1869, + "step": 2336 + }, + { + "epoch": 0.9503863359089061, + "grad_norm": 3.937890198094938, + "learning_rate": 1.990283416978621e-05, + "loss": 0.1289, + "step": 2337 + }, + { + "epoch": 0.950793005286702, + "grad_norm": 21.795754601774682, + "learning_rate": 1.9902693221106878e-05, + "loss": 0.4172, + "step": 2338 + }, + { + "epoch": 0.9511996746644977, + "grad_norm": 9.199225072671252, + "learning_rate": 1.9902552170771417e-05, + "loss": 0.4287, + "step": 2339 + }, + { + "epoch": 0.9516063440422936, + "grad_norm": 8.104353120389577, + "learning_rate": 1.9902411018781273e-05, + "loss": 0.2945, + "step": 2340 + }, + { + "epoch": 0.9520130134200895, + "grad_norm": 16.132954618529617, + "learning_rate": 1.9902269765137895e-05, + "loss": 0.6502, + "step": 2341 + }, + { + "epoch": 0.9524196827978854, + "grad_norm": 5.172387256925061, + "learning_rate": 1.9902128409842733e-05, + "loss": 0.1104, + "step": 2342 + }, + { + "epoch": 0.9528263521756811, + "grad_norm": 9.263469751685149, + "learning_rate": 1.9901986952897245e-05, + "loss": 0.4399, + "step": 2343 + }, + { + "epoch": 0.953233021553477, + "grad_norm": 10.600204189205728, + "learning_rate": 1.990184539430287e-05, + "loss": 0.1567, + "step": 2344 + }, + { + "epoch": 0.9536396909312729, + "grad_norm": 62.41438798703757, + "learning_rate": 1.9901703734061074e-05, + "loss": 1.1578, + "step": 2345 + }, + { + "epoch": 0.9540463603090688, + "grad_norm": 0.7725008592059357, + "learning_rate": 1.9901561972173303e-05, + "loss": 0.009, + "step": 2346 + }, + { + "epoch": 0.9544530296868646, + "grad_norm": 30.915350526859704, + "learning_rate": 1.990142010864102e-05, + "loss": 1.6256, + "step": 2347 + }, + { + "epoch": 0.9548596990646604, + "grad_norm": 8.448641640214316, + "learning_rate": 1.990127814346567e-05, + "loss": 0.1097, + "step": 2348 + }, + { + "epoch": 0.9552663684424563, + "grad_norm": 14.788233112731278, + "learning_rate": 1.990113607664872e-05, + "loss": 0.6332, + "step": 2349 + }, + { + "epoch": 0.9556730378202521, + "grad_norm": 11.811199796215858, + "learning_rate": 1.9900993908191625e-05, + "loss": 0.5308, + "step": 2350 + }, + { + "epoch": 0.956079707198048, + "grad_norm": 3.139164269032694, + "learning_rate": 1.9900851638095842e-05, + "loss": 0.0406, + "step": 2351 + }, + { + "epoch": 0.9564863765758438, + "grad_norm": 12.913548066836858, + "learning_rate": 1.9900709266362834e-05, + "loss": 0.7247, + "step": 2352 + }, + { + "epoch": 0.9568930459536397, + "grad_norm": 11.812183521272303, + "learning_rate": 1.9900566792994065e-05, + "loss": 0.3578, + "step": 2353 + }, + { + "epoch": 0.9572997153314355, + "grad_norm": 8.637286866506102, + "learning_rate": 1.9900424217990992e-05, + "loss": 0.0918, + "step": 2354 + }, + { + "epoch": 0.9577063847092314, + "grad_norm": 16.98503184797614, + "learning_rate": 1.9900281541355085e-05, + "loss": 0.5981, + "step": 2355 + }, + { + "epoch": 0.9581130540870273, + "grad_norm": 17.461505571694385, + "learning_rate": 1.9900138763087805e-05, + "loss": 1.0405, + "step": 2356 + }, + { + "epoch": 0.9585197234648231, + "grad_norm": 38.91798959413893, + "learning_rate": 
1.9899995883190618e-05, + "loss": 0.9851, + "step": 2357 + }, + { + "epoch": 0.9589263928426189, + "grad_norm": 10.404369766827884, + "learning_rate": 1.9899852901664988e-05, + "loss": 0.1314, + "step": 2358 + }, + { + "epoch": 0.9593330622204148, + "grad_norm": 22.371061598147026, + "learning_rate": 1.9899709818512386e-05, + "loss": 0.808, + "step": 2359 + }, + { + "epoch": 0.9597397315982107, + "grad_norm": 7.97365642774499, + "learning_rate": 1.989956663373428e-05, + "loss": 0.4312, + "step": 2360 + }, + { + "epoch": 0.9601464009760065, + "grad_norm": 25.20266038279562, + "learning_rate": 1.9899423347332142e-05, + "loss": 1.1662, + "step": 2361 + }, + { + "epoch": 0.9605530703538023, + "grad_norm": 12.72661594516639, + "learning_rate": 1.9899279959307443e-05, + "loss": 0.4102, + "step": 2362 + }, + { + "epoch": 0.9609597397315982, + "grad_norm": 5.152415561573249, + "learning_rate": 1.989913646966165e-05, + "loss": 0.174, + "step": 2363 + }, + { + "epoch": 0.9613664091093941, + "grad_norm": 12.955009646891073, + "learning_rate": 1.9898992878396238e-05, + "loss": 0.7011, + "step": 2364 + }, + { + "epoch": 0.96177307848719, + "grad_norm": 8.736112783490238, + "learning_rate": 1.9898849185512684e-05, + "loss": 0.2088, + "step": 2365 + }, + { + "epoch": 0.9621797478649857, + "grad_norm": 5.610313450403301, + "learning_rate": 1.9898705391012462e-05, + "loss": 0.1945, + "step": 2366 + }, + { + "epoch": 0.9625864172427816, + "grad_norm": 14.269231583220964, + "learning_rate": 1.9898561494897044e-05, + "loss": 0.5619, + "step": 2367 + }, + { + "epoch": 0.9629930866205775, + "grad_norm": 16.96666710014505, + "learning_rate": 1.9898417497167912e-05, + "loss": 0.8661, + "step": 2368 + }, + { + "epoch": 0.9633997559983734, + "grad_norm": 9.473773180994527, + "learning_rate": 1.9898273397826545e-05, + "loss": 0.3491, + "step": 2369 + }, + { + "epoch": 0.9638064253761692, + "grad_norm": 2.9059130663008848, + "learning_rate": 1.9898129196874416e-05, + "loss": 0.0515, + "step": 2370 + }, + { + "epoch": 0.964213094753965, + "grad_norm": 4.8629964302003215, + "learning_rate": 1.9897984894313013e-05, + "loss": 0.2047, + "step": 2371 + }, + { + "epoch": 0.9646197641317609, + "grad_norm": 18.682504811740817, + "learning_rate": 1.989784049014381e-05, + "loss": 1.0156, + "step": 2372 + }, + { + "epoch": 0.9650264335095567, + "grad_norm": 20.38938988785188, + "learning_rate": 1.9897695984368295e-05, + "loss": 0.2861, + "step": 2373 + }, + { + "epoch": 0.9654331028873526, + "grad_norm": 12.258244374959691, + "learning_rate": 1.9897551376987948e-05, + "loss": 0.2318, + "step": 2374 + }, + { + "epoch": 0.9658397722651484, + "grad_norm": 8.322126473866367, + "learning_rate": 1.9897406668004257e-05, + "loss": 0.238, + "step": 2375 + }, + { + "epoch": 0.9662464416429443, + "grad_norm": 6.7392964522813275, + "learning_rate": 1.9897261857418703e-05, + "loss": 0.2994, + "step": 2376 + }, + { + "epoch": 0.9666531110207401, + "grad_norm": 3.8989036965058217, + "learning_rate": 1.9897116945232774e-05, + "loss": 0.0766, + "step": 2377 + }, + { + "epoch": 0.967059780398536, + "grad_norm": 6.2074712422698965, + "learning_rate": 1.9896971931447963e-05, + "loss": 0.1092, + "step": 2378 + }, + { + "epoch": 0.9674664497763319, + "grad_norm": 17.80488555182075, + "learning_rate": 1.989682681606575e-05, + "loss": 0.7535, + "step": 2379 + }, + { + "epoch": 0.9678731191541277, + "grad_norm": 15.478681093167845, + "learning_rate": 1.989668159908763e-05, + "loss": 0.5928, + "step": 2380 + }, + { + "epoch": 0.9682797885319235, + 
"grad_norm": 8.859388135823767, + "learning_rate": 1.9896536280515093e-05, + "loss": 0.3612, + "step": 2381 + }, + { + "epoch": 0.9686864579097194, + "grad_norm": 8.88067566832072, + "learning_rate": 1.989639086034963e-05, + "loss": 0.276, + "step": 2382 + }, + { + "epoch": 0.9690931272875153, + "grad_norm": 9.819842766846739, + "learning_rate": 1.9896245338592734e-05, + "loss": 0.2192, + "step": 2383 + }, + { + "epoch": 0.969499796665311, + "grad_norm": 3.805637303434447, + "learning_rate": 1.98960997152459e-05, + "loss": 0.0673, + "step": 2384 + }, + { + "epoch": 0.9699064660431069, + "grad_norm": 8.366193861193643, + "learning_rate": 1.989595399031062e-05, + "loss": 0.6018, + "step": 2385 + }, + { + "epoch": 0.9703131354209028, + "grad_norm": 8.66750010622022, + "learning_rate": 1.989580816378839e-05, + "loss": 0.2172, + "step": 2386 + }, + { + "epoch": 0.9707198047986987, + "grad_norm": 14.100051575445477, + "learning_rate": 1.9895662235680713e-05, + "loss": 0.4554, + "step": 2387 + }, + { + "epoch": 0.9711264741764946, + "grad_norm": 10.95554590927815, + "learning_rate": 1.9895516205989082e-05, + "loss": 0.6074, + "step": 2388 + }, + { + "epoch": 0.9715331435542903, + "grad_norm": 12.069090777358818, + "learning_rate": 1.9895370074714995e-05, + "loss": 0.234, + "step": 2389 + }, + { + "epoch": 0.9719398129320862, + "grad_norm": 3.5997061797994645, + "learning_rate": 1.9895223841859953e-05, + "loss": 0.0615, + "step": 2390 + }, + { + "epoch": 0.9723464823098821, + "grad_norm": 2.0564114202979153, + "learning_rate": 1.989507750742546e-05, + "loss": 0.0254, + "step": 2391 + }, + { + "epoch": 0.972753151687678, + "grad_norm": 1.2407681617907753, + "learning_rate": 1.989493107141302e-05, + "loss": 0.0169, + "step": 2392 + }, + { + "epoch": 0.9731598210654737, + "grad_norm": 3.7753734878856857, + "learning_rate": 1.9894784533824122e-05, + "loss": 0.0446, + "step": 2393 + }, + { + "epoch": 0.9735664904432696, + "grad_norm": 11.744461390355662, + "learning_rate": 1.989463789466029e-05, + "loss": 0.3709, + "step": 2394 + }, + { + "epoch": 0.9739731598210655, + "grad_norm": 0.8616883919347805, + "learning_rate": 1.9894491153923016e-05, + "loss": 0.0113, + "step": 2395 + }, + { + "epoch": 0.9743798291988613, + "grad_norm": 4.689284552090733, + "learning_rate": 1.9894344311613813e-05, + "loss": 0.0842, + "step": 2396 + }, + { + "epoch": 0.9747864985766572, + "grad_norm": 8.358596345927838, + "learning_rate": 1.9894197367734186e-05, + "loss": 0.1474, + "step": 2397 + }, + { + "epoch": 0.975193167954453, + "grad_norm": 12.444576200507415, + "learning_rate": 1.989405032228564e-05, + "loss": 0.2479, + "step": 2398 + }, + { + "epoch": 0.9755998373322489, + "grad_norm": 14.153255208943618, + "learning_rate": 1.989390317526969e-05, + "loss": 0.8417, + "step": 2399 + }, + { + "epoch": 0.9760065067100447, + "grad_norm": 9.654689370565084, + "learning_rate": 1.9893755926687847e-05, + "loss": 0.3838, + "step": 2400 + }, + { + "epoch": 0.9764131760878406, + "grad_norm": 4.554119366409659, + "learning_rate": 1.9893608576541616e-05, + "loss": 0.0947, + "step": 2401 + }, + { + "epoch": 0.9768198454656364, + "grad_norm": 11.863893272879345, + "learning_rate": 1.9893461124832516e-05, + "loss": 0.6395, + "step": 2402 + }, + { + "epoch": 0.9772265148434323, + "grad_norm": 3.2229258403792502, + "learning_rate": 1.9893313571562055e-05, + "loss": 0.0462, + "step": 2403 + }, + { + "epoch": 0.9776331842212281, + "grad_norm": 6.4151324485158, + "learning_rate": 1.9893165916731757e-05, + "loss": 0.1999, + "step": 2404 + 
}, + { + "epoch": 0.978039853599024, + "grad_norm": 11.63955058085429, + "learning_rate": 1.9893018160343127e-05, + "loss": 0.4127, + "step": 2405 + }, + { + "epoch": 0.9784465229768199, + "grad_norm": 13.91369816597201, + "learning_rate": 1.9892870302397686e-05, + "loss": 0.291, + "step": 2406 + }, + { + "epoch": 0.9788531923546157, + "grad_norm": 9.04539130030017, + "learning_rate": 1.9892722342896955e-05, + "loss": 0.1836, + "step": 2407 + }, + { + "epoch": 0.9792598617324115, + "grad_norm": 1.1925458549357348, + "learning_rate": 1.989257428184245e-05, + "loss": 0.0156, + "step": 2408 + }, + { + "epoch": 0.9796665311102074, + "grad_norm": 21.346471622140097, + "learning_rate": 1.989242611923569e-05, + "loss": 0.9048, + "step": 2409 + }, + { + "epoch": 0.9800732004880033, + "grad_norm": 11.38416524713684, + "learning_rate": 1.9892277855078198e-05, + "loss": 0.3447, + "step": 2410 + }, + { + "epoch": 0.9804798698657992, + "grad_norm": 1.902082268850442, + "learning_rate": 1.9892129489371494e-05, + "loss": 0.0089, + "step": 2411 + }, + { + "epoch": 0.9808865392435949, + "grad_norm": 15.918183126585724, + "learning_rate": 1.9891981022117104e-05, + "loss": 0.6716, + "step": 2412 + }, + { + "epoch": 0.9812932086213908, + "grad_norm": 18.402751755326147, + "learning_rate": 1.989183245331655e-05, + "loss": 1.2409, + "step": 2413 + }, + { + "epoch": 0.9816998779991867, + "grad_norm": 13.823451525910112, + "learning_rate": 1.9891683782971356e-05, + "loss": 0.3865, + "step": 2414 + }, + { + "epoch": 0.9821065473769826, + "grad_norm": 7.323652566849245, + "learning_rate": 1.989153501108305e-05, + "loss": 0.2114, + "step": 2415 + }, + { + "epoch": 0.9825132167547783, + "grad_norm": 12.922502697485003, + "learning_rate": 1.989138613765316e-05, + "loss": 0.3451, + "step": 2416 + }, + { + "epoch": 0.9829198861325742, + "grad_norm": 17.343531640929232, + "learning_rate": 1.989123716268321e-05, + "loss": 0.4826, + "step": 2417 + }, + { + "epoch": 0.9833265555103701, + "grad_norm": 18.819641365416157, + "learning_rate": 1.9891088086174734e-05, + "loss": 1.0571, + "step": 2418 + }, + { + "epoch": 0.983733224888166, + "grad_norm": 15.32998732118205, + "learning_rate": 1.989093890812926e-05, + "loss": 0.483, + "step": 2419 + }, + { + "epoch": 0.9841398942659618, + "grad_norm": 13.76046074985593, + "learning_rate": 1.9890789628548323e-05, + "loss": 0.7841, + "step": 2420 + }, + { + "epoch": 0.9845465636437576, + "grad_norm": 16.18567651411569, + "learning_rate": 1.9890640247433454e-05, + "loss": 0.1337, + "step": 2421 + }, + { + "epoch": 0.9849532330215535, + "grad_norm": 5.11227708998539, + "learning_rate": 1.9890490764786178e-05, + "loss": 0.0393, + "step": 2422 + }, + { + "epoch": 0.9853599023993493, + "grad_norm": 9.953321293119343, + "learning_rate": 1.989034118060804e-05, + "loss": 0.3907, + "step": 2423 + }, + { + "epoch": 0.9857665717771452, + "grad_norm": 20.06511555650121, + "learning_rate": 1.9890191494900575e-05, + "loss": 0.764, + "step": 2424 + }, + { + "epoch": 0.986173241154941, + "grad_norm": 4.773974997515339, + "learning_rate": 1.9890041707665314e-05, + "loss": 0.0658, + "step": 2425 + }, + { + "epoch": 0.9865799105327369, + "grad_norm": 6.588387315020807, + "learning_rate": 1.9889891818903796e-05, + "loss": 0.2892, + "step": 2426 + }, + { + "epoch": 0.9869865799105327, + "grad_norm": 5.311098921982844, + "learning_rate": 1.9889741828617562e-05, + "loss": 0.1774, + "step": 2427 + }, + { + "epoch": 0.9873932492883286, + "grad_norm": 1.932839015787581, + "learning_rate": 
1.9889591736808152e-05, + "loss": 0.0383, + "step": 2428 + }, + { + "epoch": 0.9877999186661245, + "grad_norm": 37.104870538179355, + "learning_rate": 1.9889441543477103e-05, + "loss": 1.2887, + "step": 2429 + }, + { + "epoch": 0.9882065880439203, + "grad_norm": 5.635344932250223, + "learning_rate": 1.988929124862596e-05, + "loss": 0.1515, + "step": 2430 + }, + { + "epoch": 0.9886132574217161, + "grad_norm": 2.9662194133135116, + "learning_rate": 1.988914085225627e-05, + "loss": 0.0463, + "step": 2431 + }, + { + "epoch": 0.989019926799512, + "grad_norm": 1.9699175861828884, + "learning_rate": 1.9888990354369567e-05, + "loss": 0.0316, + "step": 2432 + }, + { + "epoch": 0.9894265961773079, + "grad_norm": 8.098206640291574, + "learning_rate": 1.9888839754967403e-05, + "loss": 0.2816, + "step": 2433 + }, + { + "epoch": 0.9898332655551036, + "grad_norm": 4.3875803854923205, + "learning_rate": 1.988868905405132e-05, + "loss": 0.0677, + "step": 2434 + }, + { + "epoch": 0.9902399349328995, + "grad_norm": 3.803735409757071, + "learning_rate": 1.988853825162287e-05, + "loss": 0.0474, + "step": 2435 + }, + { + "epoch": 0.9906466043106954, + "grad_norm": 12.845621392661547, + "learning_rate": 1.9888387347683595e-05, + "loss": 0.4733, + "step": 2436 + }, + { + "epoch": 0.9910532736884913, + "grad_norm": 6.822935294666262, + "learning_rate": 1.988823634223505e-05, + "loss": 0.1645, + "step": 2437 + }, + { + "epoch": 0.9914599430662872, + "grad_norm": 17.49117709653834, + "learning_rate": 1.988808523527878e-05, + "loss": 0.8336, + "step": 2438 + }, + { + "epoch": 0.9918666124440829, + "grad_norm": 20.342401157291146, + "learning_rate": 1.9887934026816337e-05, + "loss": 1.7416, + "step": 2439 + }, + { + "epoch": 0.9922732818218788, + "grad_norm": 4.72503297503122, + "learning_rate": 1.988778271684928e-05, + "loss": 0.1049, + "step": 2440 + }, + { + "epoch": 0.9926799511996747, + "grad_norm": 7.040958082325064, + "learning_rate": 1.9887631305379153e-05, + "loss": 0.1915, + "step": 2441 + }, + { + "epoch": 0.9930866205774705, + "grad_norm": 8.445286289419863, + "learning_rate": 1.9887479792407513e-05, + "loss": 0.1905, + "step": 2442 + }, + { + "epoch": 0.9934932899552664, + "grad_norm": 19.59872395825318, + "learning_rate": 1.9887328177935918e-05, + "loss": 0.5607, + "step": 2443 + }, + { + "epoch": 0.9938999593330622, + "grad_norm": 0.6385239644195327, + "learning_rate": 1.9887176461965924e-05, + "loss": 0.0128, + "step": 2444 + }, + { + "epoch": 0.9943066287108581, + "grad_norm": 16.398053023668552, + "learning_rate": 1.988702464449909e-05, + "loss": 0.3442, + "step": 2445 + }, + { + "epoch": 0.9947132980886539, + "grad_norm": 5.265114954685035, + "learning_rate": 1.9886872725536968e-05, + "loss": 0.1322, + "step": 2446 + }, + { + "epoch": 0.9951199674664498, + "grad_norm": 5.706625404718848, + "learning_rate": 1.988672070508112e-05, + "loss": 0.1062, + "step": 2447 + }, + { + "epoch": 0.9955266368442456, + "grad_norm": 7.436423799422732, + "learning_rate": 1.9886568583133113e-05, + "loss": 0.1594, + "step": 2448 + }, + { + "epoch": 0.9959333062220415, + "grad_norm": 5.114945452954158, + "learning_rate": 1.98864163596945e-05, + "loss": 0.0418, + "step": 2449 + }, + { + "epoch": 0.9963399755998373, + "grad_norm": 11.84086754999865, + "learning_rate": 1.988626403476685e-05, + "loss": 0.3715, + "step": 2450 + }, + { + "epoch": 0.9967466449776332, + "grad_norm": 0.44322184204381593, + "learning_rate": 1.9886111608351723e-05, + "loss": 0.0077, + "step": 2451 + }, + { + "epoch": 0.9971533143554291, + 
"grad_norm": 4.770463001358974, + "learning_rate": 1.9885959080450685e-05, + "loss": 0.1084, + "step": 2452 + }, + { + "epoch": 0.9975599837332249, + "grad_norm": 13.727820684107767, + "learning_rate": 1.98858064510653e-05, + "loss": 0.4583, + "step": 2453 + }, + { + "epoch": 0.9979666531110207, + "grad_norm": 17.0745257649218, + "learning_rate": 1.988565372019714e-05, + "loss": 0.5311, + "step": 2454 + }, + { + "epoch": 0.9983733224888166, + "grad_norm": 6.838019325999088, + "learning_rate": 1.9885500887847766e-05, + "loss": 0.113, + "step": 2455 + }, + { + "epoch": 0.9987799918666125, + "grad_norm": 9.374934661508405, + "learning_rate": 1.988534795401875e-05, + "loss": 0.2037, + "step": 2456 + }, + { + "epoch": 0.9991866612444082, + "grad_norm": 0.04021426756073328, + "learning_rate": 1.9885194918711665e-05, + "loss": 0.0005, + "step": 2457 + }, + { + "epoch": 0.9995933306222041, + "grad_norm": 9.287269588320983, + "learning_rate": 1.9885041781928076e-05, + "loss": 0.2696, + "step": 2458 + }, + { + "epoch": 1.0, + "grad_norm": 14.184127073174754, + "learning_rate": 1.9884888543669558e-05, + "loss": 0.8749, + "step": 2459 + }, + { + "epoch": 1.0004066693777958, + "grad_norm": 12.271747713179272, + "learning_rate": 1.9884735203937685e-05, + "loss": 0.4147, + "step": 2460 + }, + { + "epoch": 1.0008133387555918, + "grad_norm": 14.179451766382952, + "learning_rate": 1.9884581762734034e-05, + "loss": 0.4211, + "step": 2461 + }, + { + "epoch": 1.0012200081333875, + "grad_norm": 23.30612259690859, + "learning_rate": 1.988442822006017e-05, + "loss": 0.7861, + "step": 2462 + }, + { + "epoch": 1.0016266775111835, + "grad_norm": 1.1355280148564861, + "learning_rate": 1.9884274575917678e-05, + "loss": 0.0152, + "step": 2463 + }, + { + "epoch": 1.0020333468889793, + "grad_norm": 0.23804674462042477, + "learning_rate": 1.9884120830308132e-05, + "loss": 0.003, + "step": 2464 + }, + { + "epoch": 1.002440016266775, + "grad_norm": 23.794420773088472, + "learning_rate": 1.9883966983233116e-05, + "loss": 1.3661, + "step": 2465 + }, + { + "epoch": 1.002846685644571, + "grad_norm": 6.706528139176097, + "learning_rate": 1.9883813034694198e-05, + "loss": 0.089, + "step": 2466 + }, + { + "epoch": 1.0032533550223668, + "grad_norm": 11.354312308382227, + "learning_rate": 1.9883658984692968e-05, + "loss": 0.6185, + "step": 2467 + }, + { + "epoch": 1.0036600244001628, + "grad_norm": 11.449821519515176, + "learning_rate": 1.9883504833231004e-05, + "loss": 0.2586, + "step": 2468 + }, + { + "epoch": 1.0040666937779585, + "grad_norm": 19.55023078562721, + "learning_rate": 1.988335058030989e-05, + "loss": 0.5155, + "step": 2469 + }, + { + "epoch": 1.0044733631557543, + "grad_norm": 11.405196393354837, + "learning_rate": 1.9883196225931204e-05, + "loss": 0.6324, + "step": 2470 + }, + { + "epoch": 1.0048800325335503, + "grad_norm": 12.876913758499352, + "learning_rate": 1.988304177009654e-05, + "loss": 0.4377, + "step": 2471 + }, + { + "epoch": 1.005286701911346, + "grad_norm": 14.778736950711423, + "learning_rate": 1.9882887212807477e-05, + "loss": 0.5058, + "step": 2472 + }, + { + "epoch": 1.0056933712891418, + "grad_norm": 12.89298238959854, + "learning_rate": 1.98827325540656e-05, + "loss": 0.3447, + "step": 2473 + }, + { + "epoch": 1.0061000406669378, + "grad_norm": 17.008666846318143, + "learning_rate": 1.98825777938725e-05, + "loss": 0.0767, + "step": 2474 + }, + { + "epoch": 1.0065067100447336, + "grad_norm": 14.380762284752999, + "learning_rate": 1.9882422932229765e-05, + "loss": 0.6423, + "step": 2475 + }, + { + 
"epoch": 1.0069133794225296, + "grad_norm": 15.043597823243202, + "learning_rate": 1.9882267969138985e-05, + "loss": 0.3135, + "step": 2476 + }, + { + "epoch": 1.0073200488003253, + "grad_norm": 12.683536392028767, + "learning_rate": 1.9882112904601755e-05, + "loss": 0.8187, + "step": 2477 + }, + { + "epoch": 1.007726718178121, + "grad_norm": 2.924680942425618, + "learning_rate": 1.9881957738619658e-05, + "loss": 0.0306, + "step": 2478 + }, + { + "epoch": 1.008133387555917, + "grad_norm": 7.832886143735547, + "learning_rate": 1.988180247119429e-05, + "loss": 0.3083, + "step": 2479 + }, + { + "epoch": 1.0085400569337128, + "grad_norm": 11.9856648021499, + "learning_rate": 1.988164710232725e-05, + "loss": 0.4557, + "step": 2480 + }, + { + "epoch": 1.0089467263115088, + "grad_norm": 3.9708453796671392, + "learning_rate": 1.9881491632020127e-05, + "loss": 0.0723, + "step": 2481 + }, + { + "epoch": 1.0093533956893046, + "grad_norm": 15.826662608181108, + "learning_rate": 1.988133606027452e-05, + "loss": 0.8067, + "step": 2482 + }, + { + "epoch": 1.0097600650671004, + "grad_norm": 14.094053200519676, + "learning_rate": 1.988118038709202e-05, + "loss": 0.3845, + "step": 2483 + }, + { + "epoch": 1.0101667344448964, + "grad_norm": 13.622960840179148, + "learning_rate": 1.988102461247424e-05, + "loss": 0.5549, + "step": 2484 + }, + { + "epoch": 1.0105734038226921, + "grad_norm": 11.577249585019128, + "learning_rate": 1.9880868736422764e-05, + "loss": 0.6419, + "step": 2485 + }, + { + "epoch": 1.010980073200488, + "grad_norm": 7.181844529546469, + "learning_rate": 1.9880712758939197e-05, + "loss": 0.1281, + "step": 2486 + }, + { + "epoch": 1.0113867425782839, + "grad_norm": 12.78390362957064, + "learning_rate": 1.9880556680025142e-05, + "loss": 0.3621, + "step": 2487 + }, + { + "epoch": 1.0117934119560796, + "grad_norm": 7.915688248403112, + "learning_rate": 1.9880400499682198e-05, + "loss": 0.1494, + "step": 2488 + }, + { + "epoch": 1.0122000813338756, + "grad_norm": 13.542606888255436, + "learning_rate": 1.9880244217911975e-05, + "loss": 0.3763, + "step": 2489 + }, + { + "epoch": 1.0126067507116714, + "grad_norm": 9.397792122231168, + "learning_rate": 1.9880087834716066e-05, + "loss": 0.1926, + "step": 2490 + }, + { + "epoch": 1.0130134200894672, + "grad_norm": 5.744066291201222, + "learning_rate": 1.987993135009609e-05, + "loss": 0.2321, + "step": 2491 + }, + { + "epoch": 1.0134200894672631, + "grad_norm": 10.367343789218799, + "learning_rate": 1.987977476405364e-05, + "loss": 0.2743, + "step": 2492 + }, + { + "epoch": 1.013826758845059, + "grad_norm": 9.014008239574352, + "learning_rate": 1.987961807659033e-05, + "loss": 0.142, + "step": 2493 + }, + { + "epoch": 1.014233428222855, + "grad_norm": 12.80797801249379, + "learning_rate": 1.987946128770777e-05, + "loss": 0.3238, + "step": 2494 + }, + { + "epoch": 1.0146400976006507, + "grad_norm": 2.7535276890862446, + "learning_rate": 1.987930439740757e-05, + "loss": 0.0417, + "step": 2495 + }, + { + "epoch": 1.0150467669784464, + "grad_norm": 10.48501120792601, + "learning_rate": 1.9879147405691335e-05, + "loss": 0.7319, + "step": 2496 + }, + { + "epoch": 1.0154534363562424, + "grad_norm": 7.925350730320746, + "learning_rate": 1.987899031256068e-05, + "loss": 0.1415, + "step": 2497 + }, + { + "epoch": 1.0158601057340382, + "grad_norm": 8.949674190158893, + "learning_rate": 1.987883311801722e-05, + "loss": 0.2194, + "step": 2498 + }, + { + "epoch": 1.0162667751118342, + "grad_norm": 10.211422447926747, + "learning_rate": 1.9878675822062563e-05, + 
"loss": 0.3742, + "step": 2499 + }, + { + "epoch": 1.01667344448963, + "grad_norm": 19.224869369489642, + "learning_rate": 1.9878518424698327e-05, + "loss": 0.2236, + "step": 2500 + }, + { + "epoch": 1.0170801138674257, + "grad_norm": 11.733209832645487, + "learning_rate": 1.987836092592613e-05, + "loss": 0.2083, + "step": 2501 + }, + { + "epoch": 1.0174867832452217, + "grad_norm": 18.989305399930476, + "learning_rate": 1.9878203325747584e-05, + "loss": 0.7668, + "step": 2502 + }, + { + "epoch": 1.0178934526230174, + "grad_norm": 14.740913493967426, + "learning_rate": 1.987804562416431e-05, + "loss": 0.767, + "step": 2503 + }, + { + "epoch": 1.0183001220008134, + "grad_norm": 57.90554562102686, + "learning_rate": 1.9877887821177927e-05, + "loss": 0.1752, + "step": 2504 + }, + { + "epoch": 1.0187067913786092, + "grad_norm": 6.923845066880764, + "learning_rate": 1.9877729916790055e-05, + "loss": 0.2201, + "step": 2505 + }, + { + "epoch": 1.019113460756405, + "grad_norm": 23.756821277090296, + "learning_rate": 1.987757191100231e-05, + "loss": 0.6438, + "step": 2506 + }, + { + "epoch": 1.019520130134201, + "grad_norm": 12.985964622989977, + "learning_rate": 1.9877413803816323e-05, + "loss": 0.5393, + "step": 2507 + }, + { + "epoch": 1.0199267995119967, + "grad_norm": 14.79721695891029, + "learning_rate": 1.987725559523371e-05, + "loss": 0.712, + "step": 2508 + }, + { + "epoch": 1.0203334688897927, + "grad_norm": 6.928481929124975, + "learning_rate": 1.9877097285256094e-05, + "loss": 0.3029, + "step": 2509 + }, + { + "epoch": 1.0207401382675885, + "grad_norm": 16.121465062267827, + "learning_rate": 1.987693887388511e-05, + "loss": 1.0435, + "step": 2510 + }, + { + "epoch": 1.0211468076453842, + "grad_norm": 14.44947734180091, + "learning_rate": 1.987678036112237e-05, + "loss": 0.321, + "step": 2511 + }, + { + "epoch": 1.0215534770231802, + "grad_norm": 20.40283233653533, + "learning_rate": 1.9876621746969516e-05, + "loss": 0.2456, + "step": 2512 + }, + { + "epoch": 1.021960146400976, + "grad_norm": 16.163732736085244, + "learning_rate": 1.9876463031428165e-05, + "loss": 0.7302, + "step": 2513 + }, + { + "epoch": 1.0223668157787718, + "grad_norm": 11.480261788639515, + "learning_rate": 1.987630421449995e-05, + "loss": 0.3718, + "step": 2514 + }, + { + "epoch": 1.0227734851565677, + "grad_norm": 7.827425171180507, + "learning_rate": 1.98761452961865e-05, + "loss": 0.1713, + "step": 2515 + }, + { + "epoch": 1.0231801545343635, + "grad_norm": 9.178028505141638, + "learning_rate": 1.9875986276489453e-05, + "loss": 0.2914, + "step": 2516 + }, + { + "epoch": 1.0235868239121595, + "grad_norm": 14.56303902149554, + "learning_rate": 1.987582715541043e-05, + "loss": 1.1287, + "step": 2517 + }, + { + "epoch": 1.0239934932899553, + "grad_norm": 9.01253958768126, + "learning_rate": 1.9875667932951077e-05, + "loss": 0.2535, + "step": 2518 + }, + { + "epoch": 1.024400162667751, + "grad_norm": 11.87323927013569, + "learning_rate": 1.9875508609113022e-05, + "loss": 0.1605, + "step": 2519 + }, + { + "epoch": 1.024806832045547, + "grad_norm": 3.5241406146097063, + "learning_rate": 1.98753491838979e-05, + "loss": 0.0634, + "step": 2520 + }, + { + "epoch": 1.0252135014233428, + "grad_norm": 6.279396286566081, + "learning_rate": 1.9875189657307344e-05, + "loss": 0.1212, + "step": 2521 + }, + { + "epoch": 1.0256201708011388, + "grad_norm": 14.575589693698657, + "learning_rate": 1.9875030029343e-05, + "loss": 1.086, + "step": 2522 + }, + { + "epoch": 1.0260268401789345, + "grad_norm": 5.837864857879128, + 
"learning_rate": 1.9874870300006503e-05, + "loss": 0.2052, + "step": 2523 + }, + { + "epoch": 1.0264335095567303, + "grad_norm": 10.960049062180172, + "learning_rate": 1.987471046929949e-05, + "loss": 0.5112, + "step": 2524 + }, + { + "epoch": 1.0268401789345263, + "grad_norm": 21.654333208847046, + "learning_rate": 1.9874550537223607e-05, + "loss": 0.5248, + "step": 2525 + }, + { + "epoch": 1.027246848312322, + "grad_norm": 10.88979901370975, + "learning_rate": 1.9874390503780492e-05, + "loss": 0.5029, + "step": 2526 + }, + { + "epoch": 1.027653517690118, + "grad_norm": 23.874415612371095, + "learning_rate": 1.987423036897179e-05, + "loss": 1.2619, + "step": 2527 + }, + { + "epoch": 1.0280601870679138, + "grad_norm": 19.207747646406318, + "learning_rate": 1.9874070132799138e-05, + "loss": 0.6386, + "step": 2528 + }, + { + "epoch": 1.0284668564457096, + "grad_norm": 26.169108464146372, + "learning_rate": 1.9873909795264192e-05, + "loss": 0.614, + "step": 2529 + }, + { + "epoch": 1.0288735258235056, + "grad_norm": 14.254532704082038, + "learning_rate": 1.9873749356368594e-05, + "loss": 0.2569, + "step": 2530 + }, + { + "epoch": 1.0292801952013013, + "grad_norm": 15.497612278979709, + "learning_rate": 1.9873588816113988e-05, + "loss": 0.7144, + "step": 2531 + }, + { + "epoch": 1.029686864579097, + "grad_norm": 82.27500888090685, + "learning_rate": 1.987342817450202e-05, + "loss": 0.2271, + "step": 2532 + }, + { + "epoch": 1.030093533956893, + "grad_norm": 20.384101623856747, + "learning_rate": 1.9873267431534343e-05, + "loss": 0.3837, + "step": 2533 + }, + { + "epoch": 1.0305002033346888, + "grad_norm": 6.827476829061492, + "learning_rate": 1.9873106587212612e-05, + "loss": 0.1169, + "step": 2534 + }, + { + "epoch": 1.0309068727124848, + "grad_norm": 9.211379430362568, + "learning_rate": 1.987294564153847e-05, + "loss": 0.1188, + "step": 2535 + }, + { + "epoch": 1.0313135420902806, + "grad_norm": 4.823821925771231, + "learning_rate": 1.987278459451357e-05, + "loss": 0.149, + "step": 2536 + }, + { + "epoch": 1.0317202114680764, + "grad_norm": 11.02417886349473, + "learning_rate": 1.9872623446139568e-05, + "loss": 0.3116, + "step": 2537 + }, + { + "epoch": 1.0321268808458723, + "grad_norm": 15.152913403700511, + "learning_rate": 1.9872462196418117e-05, + "loss": 0.4314, + "step": 2538 + }, + { + "epoch": 1.032533550223668, + "grad_norm": 16.544826473012364, + "learning_rate": 1.9872300845350875e-05, + "loss": 0.8471, + "step": 2539 + }, + { + "epoch": 1.032940219601464, + "grad_norm": 5.341189193245412, + "learning_rate": 1.9872139392939495e-05, + "loss": 0.0585, + "step": 2540 + }, + { + "epoch": 1.0333468889792599, + "grad_norm": 8.288680601715408, + "learning_rate": 1.9871977839185637e-05, + "loss": 0.493, + "step": 2541 + }, + { + "epoch": 1.0337535583570556, + "grad_norm": 6.068481807863349, + "learning_rate": 1.9871816184090958e-05, + "loss": 0.1632, + "step": 2542 + }, + { + "epoch": 1.0341602277348516, + "grad_norm": 2.395134085235187, + "learning_rate": 1.9871654427657115e-05, + "loss": 0.0362, + "step": 2543 + }, + { + "epoch": 1.0345668971126474, + "grad_norm": 13.063705077464343, + "learning_rate": 1.9871492569885772e-05, + "loss": 0.3405, + "step": 2544 + }, + { + "epoch": 1.0349735664904434, + "grad_norm": 13.504686490708437, + "learning_rate": 1.9871330610778588e-05, + "loss": 0.3119, + "step": 2545 + }, + { + "epoch": 1.0353802358682391, + "grad_norm": 1.9746449274319195, + "learning_rate": 1.987116855033723e-05, + "loss": 0.0265, + "step": 2546 + }, + { + "epoch": 
1.035786905246035, + "grad_norm": 14.155634117597439, + "learning_rate": 1.9871006388563357e-05, + "loss": 0.3328, + "step": 2547 + }, + { + "epoch": 1.0361935746238309, + "grad_norm": 1.2183647152074066, + "learning_rate": 1.987084412545864e-05, + "loss": 0.0266, + "step": 2548 + }, + { + "epoch": 1.0366002440016266, + "grad_norm": 10.057956764221345, + "learning_rate": 1.9870681761024735e-05, + "loss": 0.3191, + "step": 2549 + }, + { + "epoch": 1.0370069133794226, + "grad_norm": 7.461187815726835, + "learning_rate": 1.9870519295263316e-05, + "loss": 0.1381, + "step": 2550 + }, + { + "epoch": 1.0374135827572184, + "grad_norm": 3.106403443711801, + "learning_rate": 1.987035672817605e-05, + "loss": 0.1142, + "step": 2551 + }, + { + "epoch": 1.0378202521350142, + "grad_norm": 11.950487518962031, + "learning_rate": 1.9870194059764604e-05, + "loss": 0.2523, + "step": 2552 + }, + { + "epoch": 1.0382269215128102, + "grad_norm": 15.092417184876147, + "learning_rate": 1.9870031290030643e-05, + "loss": 0.3434, + "step": 2553 + }, + { + "epoch": 1.038633590890606, + "grad_norm": 0.9729359712250901, + "learning_rate": 1.9869868418975852e-05, + "loss": 0.009, + "step": 2554 + }, + { + "epoch": 1.0390402602684017, + "grad_norm": 8.416123553006965, + "learning_rate": 1.9869705446601887e-05, + "loss": 0.236, + "step": 2555 + }, + { + "epoch": 1.0394469296461977, + "grad_norm": 18.381238505221408, + "learning_rate": 1.986954237291043e-05, + "loss": 0.6708, + "step": 2556 + }, + { + "epoch": 1.0398535990239934, + "grad_norm": 11.626526983770402, + "learning_rate": 1.9869379197903152e-05, + "loss": 0.3761, + "step": 2557 + }, + { + "epoch": 1.0402602684017894, + "grad_norm": 13.906730038683401, + "learning_rate": 1.9869215921581734e-05, + "loss": 0.5537, + "step": 2558 + }, + { + "epoch": 1.0406669377795852, + "grad_norm": 19.9809496488898, + "learning_rate": 1.9869052543947846e-05, + "loss": 0.6069, + "step": 2559 + }, + { + "epoch": 1.041073607157381, + "grad_norm": 24.42466912177254, + "learning_rate": 1.9868889065003164e-05, + "loss": 1.1007, + "step": 2560 + }, + { + "epoch": 1.041480276535177, + "grad_norm": 1.701390618066504, + "learning_rate": 1.9868725484749373e-05, + "loss": 0.0293, + "step": 2561 + }, + { + "epoch": 1.0418869459129727, + "grad_norm": 14.640383397433132, + "learning_rate": 1.986856180318815e-05, + "loss": 0.5238, + "step": 2562 + }, + { + "epoch": 1.0422936152907687, + "grad_norm": 7.633137332118226, + "learning_rate": 1.9868398020321163e-05, + "loss": 0.1138, + "step": 2563 + }, + { + "epoch": 1.0427002846685645, + "grad_norm": 10.568801614718785, + "learning_rate": 1.986823413615011e-05, + "loss": 0.1621, + "step": 2564 + }, + { + "epoch": 1.0431069540463602, + "grad_norm": 2.2330130359055933, + "learning_rate": 1.9868070150676667e-05, + "loss": 0.0264, + "step": 2565 + }, + { + "epoch": 1.0435136234241562, + "grad_norm": 16.7146296245627, + "learning_rate": 1.9867906063902517e-05, + "loss": 0.7246, + "step": 2566 + }, + { + "epoch": 1.043920292801952, + "grad_norm": 20.51617428980504, + "learning_rate": 1.986774187582935e-05, + "loss": 1.0569, + "step": 2567 + }, + { + "epoch": 1.044326962179748, + "grad_norm": 10.024446797770375, + "learning_rate": 1.9867577586458836e-05, + "loss": 0.2628, + "step": 2568 + }, + { + "epoch": 1.0447336315575437, + "grad_norm": 22.73599092141548, + "learning_rate": 1.986741319579268e-05, + "loss": 0.4669, + "step": 2569 + }, + { + "epoch": 1.0451403009353395, + "grad_norm": 4.4206481766749235, + "learning_rate": 1.9867248703832554e-05, + "loss": 
0.0926, + "step": 2570 + }, + { + "epoch": 1.0455469703131355, + "grad_norm": 9.432333129071644, + "learning_rate": 1.9867084110580163e-05, + "loss": 0.2851, + "step": 2571 + }, + { + "epoch": 1.0459536396909312, + "grad_norm": 5.478927548311535, + "learning_rate": 1.9866919416037183e-05, + "loss": 0.151, + "step": 2572 + }, + { + "epoch": 1.046360309068727, + "grad_norm": 9.255824825019532, + "learning_rate": 1.986675462020531e-05, + "loss": 0.2029, + "step": 2573 + }, + { + "epoch": 1.046766978446523, + "grad_norm": 16.17594638721515, + "learning_rate": 1.9866589723086234e-05, + "loss": 0.9668, + "step": 2574 + }, + { + "epoch": 1.0471736478243188, + "grad_norm": 14.91458068210229, + "learning_rate": 1.9866424724681646e-05, + "loss": 0.461, + "step": 2575 + }, + { + "epoch": 1.0475803172021148, + "grad_norm": 10.356249246580356, + "learning_rate": 1.9866259624993246e-05, + "loss": 0.6219, + "step": 2576 + }, + { + "epoch": 1.0479869865799105, + "grad_norm": 6.8671206618215335, + "learning_rate": 1.9866094424022725e-05, + "loss": 0.1381, + "step": 2577 + }, + { + "epoch": 1.0483936559577063, + "grad_norm": 23.875364732702526, + "learning_rate": 1.986592912177178e-05, + "loss": 0.4521, + "step": 2578 + }, + { + "epoch": 1.0488003253355023, + "grad_norm": 8.076339167461686, + "learning_rate": 1.9865763718242107e-05, + "loss": 0.1994, + "step": 2579 + }, + { + "epoch": 1.049206994713298, + "grad_norm": 13.03035691168155, + "learning_rate": 1.9865598213435402e-05, + "loss": 0.3512, + "step": 2580 + }, + { + "epoch": 1.049613664091094, + "grad_norm": 18.02321167772044, + "learning_rate": 1.986543260735337e-05, + "loss": 0.153, + "step": 2581 + }, + { + "epoch": 1.0500203334688898, + "grad_norm": 9.555556229249495, + "learning_rate": 1.9865266899997705e-05, + "loss": 0.5077, + "step": 2582 + }, + { + "epoch": 1.0504270028466856, + "grad_norm": 6.424168712677872, + "learning_rate": 1.9865101091370108e-05, + "loss": 0.0802, + "step": 2583 + }, + { + "epoch": 1.0508336722244815, + "grad_norm": 14.318201872595877, + "learning_rate": 1.986493518147229e-05, + "loss": 0.623, + "step": 2584 + }, + { + "epoch": 1.0512403416022773, + "grad_norm": 5.126450507992535, + "learning_rate": 1.9864769170305937e-05, + "loss": 0.1125, + "step": 2585 + }, + { + "epoch": 1.0516470109800733, + "grad_norm": 17.445405395179762, + "learning_rate": 1.986460305787277e-05, + "loss": 0.6228, + "step": 2586 + }, + { + "epoch": 1.052053680357869, + "grad_norm": 9.431198834011626, + "learning_rate": 1.9864436844174486e-05, + "loss": 0.3739, + "step": 2587 + }, + { + "epoch": 1.0524603497356648, + "grad_norm": 8.475436563468858, + "learning_rate": 1.9864270529212796e-05, + "loss": 0.3666, + "step": 2588 + }, + { + "epoch": 1.0528670191134608, + "grad_norm": 6.379497645932018, + "learning_rate": 1.98641041129894e-05, + "loss": 0.1308, + "step": 2589 + }, + { + "epoch": 1.0532736884912566, + "grad_norm": 5.448526870980353, + "learning_rate": 1.9863937595506015e-05, + "loss": 0.1665, + "step": 2590 + }, + { + "epoch": 1.0536803578690526, + "grad_norm": 9.241547744001592, + "learning_rate": 1.9863770976764346e-05, + "loss": 0.3736, + "step": 2591 + }, + { + "epoch": 1.0540870272468483, + "grad_norm": 12.496764165135085, + "learning_rate": 1.98636042567661e-05, + "loss": 0.4725, + "step": 2592 + }, + { + "epoch": 1.054493696624644, + "grad_norm": 15.154727673273301, + "learning_rate": 1.9863437435512996e-05, + "loss": 0.5377, + "step": 2593 + }, + { + "epoch": 1.05490036600244, + "grad_norm": 6.2745912522129395, + "learning_rate": 
1.986327051300674e-05, + "loss": 0.1765, + "step": 2594 + }, + { + "epoch": 1.0553070353802358, + "grad_norm": 17.339565093428263, + "learning_rate": 1.986310348924905e-05, + "loss": 0.6371, + "step": 2595 + }, + { + "epoch": 1.0557137047580318, + "grad_norm": 11.260907904292417, + "learning_rate": 1.9862936364241638e-05, + "loss": 0.1586, + "step": 2596 + }, + { + "epoch": 1.0561203741358276, + "grad_norm": 8.022919978053151, + "learning_rate": 1.9862769137986218e-05, + "loss": 0.3634, + "step": 2597 + }, + { + "epoch": 1.0565270435136234, + "grad_norm": 34.28894311580357, + "learning_rate": 1.9862601810484514e-05, + "loss": 0.4828, + "step": 2598 + }, + { + "epoch": 1.0569337128914194, + "grad_norm": 19.787713313956072, + "learning_rate": 1.9862434381738235e-05, + "loss": 0.6898, + "step": 2599 + }, + { + "epoch": 1.0573403822692151, + "grad_norm": 2.294940559138041, + "learning_rate": 1.9862266851749103e-05, + "loss": 0.0359, + "step": 2600 + }, + { + "epoch": 1.0577470516470109, + "grad_norm": 10.957909145392064, + "learning_rate": 1.986209922051884e-05, + "loss": 0.2531, + "step": 2601 + }, + { + "epoch": 1.0581537210248069, + "grad_norm": 5.301853582696262, + "learning_rate": 1.986193148804916e-05, + "loss": 0.0667, + "step": 2602 + }, + { + "epoch": 1.0585603904026026, + "grad_norm": 10.51689592019991, + "learning_rate": 1.9861763654341797e-05, + "loss": 0.1915, + "step": 2603 + }, + { + "epoch": 1.0589670597803986, + "grad_norm": 23.97075141585459, + "learning_rate": 1.9861595719398465e-05, + "loss": 0.3198, + "step": 2604 + }, + { + "epoch": 1.0593737291581944, + "grad_norm": 25.34698540288796, + "learning_rate": 1.9861427683220888e-05, + "loss": 0.9041, + "step": 2605 + }, + { + "epoch": 1.0597803985359902, + "grad_norm": 3.1520393801041915, + "learning_rate": 1.9861259545810793e-05, + "loss": 0.0161, + "step": 2606 + }, + { + "epoch": 1.0601870679137861, + "grad_norm": 17.393086858631182, + "learning_rate": 1.9861091307169906e-05, + "loss": 0.5094, + "step": 2607 + }, + { + "epoch": 1.060593737291582, + "grad_norm": 28.21259095352015, + "learning_rate": 1.9860922967299952e-05, + "loss": 0.3768, + "step": 2608 + }, + { + "epoch": 1.061000406669378, + "grad_norm": 5.124936627048064, + "learning_rate": 1.9860754526202663e-05, + "loss": 0.1413, + "step": 2609 + }, + { + "epoch": 1.0614070760471737, + "grad_norm": 21.401047495552383, + "learning_rate": 1.9860585983879766e-05, + "loss": 0.5132, + "step": 2610 + }, + { + "epoch": 1.0618137454249694, + "grad_norm": 9.567295869650229, + "learning_rate": 1.986041734033299e-05, + "loss": 0.4127, + "step": 2611 + }, + { + "epoch": 1.0622204148027654, + "grad_norm": 17.526625961743747, + "learning_rate": 1.9860248595564066e-05, + "loss": 0.6024, + "step": 2612 + }, + { + "epoch": 1.0626270841805612, + "grad_norm": 6.680430631298237, + "learning_rate": 1.986007974957473e-05, + "loss": 0.2212, + "step": 2613 + }, + { + "epoch": 1.063033753558357, + "grad_norm": 18.344237136048786, + "learning_rate": 1.9859910802366715e-05, + "loss": 1.1616, + "step": 2614 + }, + { + "epoch": 1.063440422936153, + "grad_norm": 9.943325216269878, + "learning_rate": 1.985974175394175e-05, + "loss": 0.1176, + "step": 2615 + }, + { + "epoch": 1.0638470923139487, + "grad_norm": 0.9979522817041366, + "learning_rate": 1.9859572604301574e-05, + "loss": 0.0122, + "step": 2616 + }, + { + "epoch": 1.0642537616917447, + "grad_norm": 8.972101898248518, + "learning_rate": 1.9859403353447923e-05, + "loss": 0.3582, + "step": 2617 + }, + { + "epoch": 1.0646604310695404, + 
"grad_norm": 11.42903185813273, + "learning_rate": 1.9859234001382535e-05, + "loss": 0.62, + "step": 2618 + }, + { + "epoch": 1.0650671004473362, + "grad_norm": 13.34278622648772, + "learning_rate": 1.985906454810715e-05, + "loss": 0.3875, + "step": 2619 + }, + { + "epoch": 1.0654737698251322, + "grad_norm": 12.896553210950458, + "learning_rate": 1.9858894993623502e-05, + "loss": 0.2671, + "step": 2620 + }, + { + "epoch": 1.065880439202928, + "grad_norm": 7.024633354053173, + "learning_rate": 1.9858725337933334e-05, + "loss": 0.0781, + "step": 2621 + }, + { + "epoch": 1.066287108580724, + "grad_norm": 7.941746614516993, + "learning_rate": 1.985855558103839e-05, + "loss": 0.2982, + "step": 2622 + }, + { + "epoch": 1.0666937779585197, + "grad_norm": 18.5420357529913, + "learning_rate": 1.9858385722940413e-05, + "loss": 0.6015, + "step": 2623 + }, + { + "epoch": 1.0671004473363155, + "grad_norm": 10.423694109723053, + "learning_rate": 1.9858215763641144e-05, + "loss": 0.2758, + "step": 2624 + }, + { + "epoch": 1.0675071167141115, + "grad_norm": 2.6403058262569936, + "learning_rate": 1.985804570314233e-05, + "loss": 0.0458, + "step": 2625 + }, + { + "epoch": 1.0679137860919072, + "grad_norm": 10.229753177927833, + "learning_rate": 1.9857875541445713e-05, + "loss": 0.3153, + "step": 2626 + }, + { + "epoch": 1.0683204554697032, + "grad_norm": 22.56367546147423, + "learning_rate": 1.9857705278553044e-05, + "loss": 0.1225, + "step": 2627 + }, + { + "epoch": 1.068727124847499, + "grad_norm": 9.640187229011996, + "learning_rate": 1.9857534914466064e-05, + "loss": 0.0947, + "step": 2628 + }, + { + "epoch": 1.0691337942252948, + "grad_norm": 6.723086025614594, + "learning_rate": 1.9857364449186534e-05, + "loss": 0.3864, + "step": 2629 + }, + { + "epoch": 1.0695404636030907, + "grad_norm": 9.472815633593077, + "learning_rate": 1.9857193882716193e-05, + "loss": 0.1441, + "step": 2630 + }, + { + "epoch": 1.0699471329808865, + "grad_norm": 18.120756930234556, + "learning_rate": 1.98570232150568e-05, + "loss": 0.4318, + "step": 2631 + }, + { + "epoch": 1.0703538023586825, + "grad_norm": 17.046043428598246, + "learning_rate": 1.9856852446210097e-05, + "loss": 0.3108, + "step": 2632 + }, + { + "epoch": 1.0707604717364783, + "grad_norm": 26.72044515251691, + "learning_rate": 1.9856681576177846e-05, + "loss": 0.4974, + "step": 2633 + }, + { + "epoch": 1.071167141114274, + "grad_norm": 12.225834642475663, + "learning_rate": 1.9856510604961796e-05, + "loss": 0.2471, + "step": 2634 + }, + { + "epoch": 1.07157381049207, + "grad_norm": 1.590011966355084, + "learning_rate": 1.9856339532563704e-05, + "loss": 0.021, + "step": 2635 + }, + { + "epoch": 1.0719804798698658, + "grad_norm": 11.419875664782241, + "learning_rate": 1.985616835898533e-05, + "loss": 0.3015, + "step": 2636 + }, + { + "epoch": 1.0723871492476618, + "grad_norm": 16.319687497980084, + "learning_rate": 1.9855997084228424e-05, + "loss": 0.4749, + "step": 2637 + }, + { + "epoch": 1.0727938186254575, + "grad_norm": 17.84466153376299, + "learning_rate": 1.9855825708294746e-05, + "loss": 0.7991, + "step": 2638 + }, + { + "epoch": 1.0732004880032533, + "grad_norm": 13.975047098791201, + "learning_rate": 1.985565423118606e-05, + "loss": 0.1973, + "step": 2639 + }, + { + "epoch": 1.0736071573810493, + "grad_norm": 8.678232929444675, + "learning_rate": 1.985548265290412e-05, + "loss": 0.556, + "step": 2640 + }, + { + "epoch": 1.074013826758845, + "grad_norm": 0.47653887654667876, + "learning_rate": 1.9855310973450694e-05, + "loss": 0.0049, + "step": 2641 + }, 
+ { + "epoch": 1.0744204961366408, + "grad_norm": 28.364863039252295, + "learning_rate": 1.985513919282754e-05, + "loss": 0.7811, + "step": 2642 + }, + { + "epoch": 1.0748271655144368, + "grad_norm": 12.027391872359292, + "learning_rate": 1.9854967311036423e-05, + "loss": 0.4395, + "step": 2643 + }, + { + "epoch": 1.0752338348922326, + "grad_norm": 5.2505631687610155, + "learning_rate": 1.98547953280791e-05, + "loss": 0.0578, + "step": 2644 + }, + { + "epoch": 1.0756405042700286, + "grad_norm": 12.898895874029199, + "learning_rate": 1.9854623243957348e-05, + "loss": 0.434, + "step": 2645 + }, + { + "epoch": 1.0760471736478243, + "grad_norm": 18.92649014663912, + "learning_rate": 1.9854451058672932e-05, + "loss": 0.6619, + "step": 2646 + }, + { + "epoch": 1.07645384302562, + "grad_norm": 16.983495823156133, + "learning_rate": 1.9854278772227612e-05, + "loss": 1.0419, + "step": 2647 + }, + { + "epoch": 1.076860512403416, + "grad_norm": 6.181577392943238, + "learning_rate": 1.9854106384623163e-05, + "loss": 0.0889, + "step": 2648 + }, + { + "epoch": 1.0772671817812118, + "grad_norm": 17.1664286752682, + "learning_rate": 1.9853933895861347e-05, + "loss": 0.3146, + "step": 2649 + }, + { + "epoch": 1.0776738511590078, + "grad_norm": 13.710948778620589, + "learning_rate": 1.9853761305943946e-05, + "loss": 0.4887, + "step": 2650 + }, + { + "epoch": 1.0780805205368036, + "grad_norm": 3.602611851263039, + "learning_rate": 1.9853588614872726e-05, + "loss": 0.0612, + "step": 2651 + }, + { + "epoch": 1.0784871899145994, + "grad_norm": 5.453642765622593, + "learning_rate": 1.9853415822649457e-05, + "loss": 0.0541, + "step": 2652 + }, + { + "epoch": 1.0788938592923953, + "grad_norm": 26.328299136250955, + "learning_rate": 1.9853242929275916e-05, + "loss": 0.8096, + "step": 2653 + }, + { + "epoch": 1.079300528670191, + "grad_norm": 0.8845299196670235, + "learning_rate": 1.9853069934753878e-05, + "loss": 0.0198, + "step": 2654 + }, + { + "epoch": 1.0797071980479869, + "grad_norm": 8.063553805122488, + "learning_rate": 1.985289683908512e-05, + "loss": 0.1887, + "step": 2655 + }, + { + "epoch": 1.0801138674257829, + "grad_norm": 46.29709410671897, + "learning_rate": 1.9852723642271413e-05, + "loss": 1.2946, + "step": 2656 + }, + { + "epoch": 1.0805205368035786, + "grad_norm": 12.804857491209424, + "learning_rate": 1.9852550344314543e-05, + "loss": 0.3371, + "step": 2657 + }, + { + "epoch": 1.0809272061813746, + "grad_norm": 8.058377406620487, + "learning_rate": 1.985237694521628e-05, + "loss": 0.1744, + "step": 2658 + }, + { + "epoch": 1.0813338755591704, + "grad_norm": 28.129463881458925, + "learning_rate": 1.9852203444978416e-05, + "loss": 0.5465, + "step": 2659 + }, + { + "epoch": 1.0817405449369661, + "grad_norm": 5.921778593223796, + "learning_rate": 1.985202984360272e-05, + "loss": 0.104, + "step": 2660 + }, + { + "epoch": 1.0821472143147621, + "grad_norm": 1.954907780127766, + "learning_rate": 1.9851856141090978e-05, + "loss": 0.0207, + "step": 2661 + }, + { + "epoch": 1.082553883692558, + "grad_norm": 17.129762601378967, + "learning_rate": 1.985168233744498e-05, + "loss": 0.3948, + "step": 2662 + }, + { + "epoch": 1.0829605530703539, + "grad_norm": 22.01859956103064, + "learning_rate": 1.98515084326665e-05, + "loss": 0.9311, + "step": 2663 + }, + { + "epoch": 1.0833672224481496, + "grad_norm": 7.738123182283522, + "learning_rate": 1.985133442675733e-05, + "loss": 0.1677, + "step": 2664 + }, + { + "epoch": 1.0837738918259454, + "grad_norm": 9.722115128698155, + "learning_rate": 1.985116031971925e-05, 
+ "loss": 0.2128, + "step": 2665 + }, + { + "epoch": 1.0841805612037414, + "grad_norm": 7.648193618060714, + "learning_rate": 1.9850986111554058e-05, + "loss": 0.1875, + "step": 2666 + }, + { + "epoch": 1.0845872305815372, + "grad_norm": 3.0694845071670103, + "learning_rate": 1.9850811802263532e-05, + "loss": 0.036, + "step": 2667 + }, + { + "epoch": 1.0849938999593332, + "grad_norm": 5.378405532787904, + "learning_rate": 1.9850637391849466e-05, + "loss": 0.0645, + "step": 2668 + }, + { + "epoch": 1.085400569337129, + "grad_norm": 9.296241967992119, + "learning_rate": 1.985046288031365e-05, + "loss": 0.133, + "step": 2669 + }, + { + "epoch": 1.0858072387149247, + "grad_norm": 9.873845456179035, + "learning_rate": 1.985028826765787e-05, + "loss": 0.3922, + "step": 2670 + }, + { + "epoch": 1.0862139080927207, + "grad_norm": 1.193785180828875, + "learning_rate": 1.9850113553883928e-05, + "loss": 0.0172, + "step": 2671 + }, + { + "epoch": 1.0866205774705164, + "grad_norm": 7.315425981213661, + "learning_rate": 1.9849938738993612e-05, + "loss": 0.2472, + "step": 2672 + }, + { + "epoch": 1.0870272468483124, + "grad_norm": 11.113045581651063, + "learning_rate": 1.9849763822988717e-05, + "loss": 0.2019, + "step": 2673 + }, + { + "epoch": 1.0874339162261082, + "grad_norm": 19.049370471559932, + "learning_rate": 1.984958880587104e-05, + "loss": 0.4567, + "step": 2674 + }, + { + "epoch": 1.087840585603904, + "grad_norm": 15.187516271323446, + "learning_rate": 1.9849413687642376e-05, + "loss": 0.8047, + "step": 2675 + }, + { + "epoch": 1.0882472549817, + "grad_norm": 8.56063161647455, + "learning_rate": 1.9849238468304523e-05, + "loss": 0.2299, + "step": 2676 + }, + { + "epoch": 1.0886539243594957, + "grad_norm": 9.628526992418555, + "learning_rate": 1.9849063147859282e-05, + "loss": 0.2212, + "step": 2677 + }, + { + "epoch": 1.0890605937372917, + "grad_norm": 8.506187061694304, + "learning_rate": 1.9848887726308448e-05, + "loss": 0.2665, + "step": 2678 + }, + { + "epoch": 1.0894672631150875, + "grad_norm": 10.135206998799319, + "learning_rate": 1.9848712203653827e-05, + "loss": 0.2319, + "step": 2679 + }, + { + "epoch": 1.0898739324928832, + "grad_norm": 6.652397500502513, + "learning_rate": 1.9848536579897218e-05, + "loss": 0.1086, + "step": 2680 + }, + { + "epoch": 1.0902806018706792, + "grad_norm": 24.209694440552894, + "learning_rate": 1.9848360855040423e-05, + "loss": 0.3489, + "step": 2681 + }, + { + "epoch": 1.090687271248475, + "grad_norm": 7.498579282233966, + "learning_rate": 1.984818502908525e-05, + "loss": 0.1067, + "step": 2682 + }, + { + "epoch": 1.0910939406262707, + "grad_norm": 12.87057552249501, + "learning_rate": 1.9848009102033496e-05, + "loss": 0.451, + "step": 2683 + }, + { + "epoch": 1.0915006100040667, + "grad_norm": 18.01633524094366, + "learning_rate": 1.984783307388698e-05, + "loss": 1.0297, + "step": 2684 + }, + { + "epoch": 1.0919072793818625, + "grad_norm": 11.715322430488712, + "learning_rate": 1.9847656944647493e-05, + "loss": 0.2024, + "step": 2685 + }, + { + "epoch": 1.0923139487596585, + "grad_norm": 8.906416741243431, + "learning_rate": 1.9847480714316855e-05, + "loss": 0.5019, + "step": 2686 + }, + { + "epoch": 1.0927206181374542, + "grad_norm": 14.055749288211924, + "learning_rate": 1.9847304382896873e-05, + "loss": 0.3289, + "step": 2687 + }, + { + "epoch": 1.09312728751525, + "grad_norm": 9.08013599583574, + "learning_rate": 1.9847127950389353e-05, + "loss": 0.1929, + "step": 2688 + }, + { + "epoch": 1.093533956893046, + "grad_norm": 14.540049639448256, + 
"learning_rate": 1.984695141679611e-05, + "loss": 0.6477, + "step": 2689 + }, + { + "epoch": 1.0939406262708418, + "grad_norm": 16.65143822470315, + "learning_rate": 1.9846774782118954e-05, + "loss": 0.7343, + "step": 2690 + }, + { + "epoch": 1.0943472956486378, + "grad_norm": 12.409744571357054, + "learning_rate": 1.98465980463597e-05, + "loss": 0.1226, + "step": 2691 + }, + { + "epoch": 1.0947539650264335, + "grad_norm": 13.512901822756097, + "learning_rate": 1.984642120952016e-05, + "loss": 0.567, + "step": 2692 + }, + { + "epoch": 1.0951606344042293, + "grad_norm": 11.919651806625428, + "learning_rate": 1.984624427160215e-05, + "loss": 0.2672, + "step": 2693 + }, + { + "epoch": 1.0955673037820253, + "grad_norm": 15.410058062691451, + "learning_rate": 1.984606723260749e-05, + "loss": 0.3654, + "step": 2694 + }, + { + "epoch": 1.095973973159821, + "grad_norm": 11.008086373505742, + "learning_rate": 1.9845890092537992e-05, + "loss": 0.4258, + "step": 2695 + }, + { + "epoch": 1.0963806425376168, + "grad_norm": 20.877071502778904, + "learning_rate": 1.984571285139548e-05, + "loss": 0.7387, + "step": 2696 + }, + { + "epoch": 1.0967873119154128, + "grad_norm": 0.5907990423643519, + "learning_rate": 1.9845535509181765e-05, + "loss": 0.0075, + "step": 2697 + }, + { + "epoch": 1.0971939812932086, + "grad_norm": 9.934553345264328, + "learning_rate": 1.9845358065898675e-05, + "loss": 0.296, + "step": 2698 + }, + { + "epoch": 1.0976006506710045, + "grad_norm": 18.319763881993023, + "learning_rate": 1.9845180521548035e-05, + "loss": 0.8418, + "step": 2699 + }, + { + "epoch": 1.0980073200488003, + "grad_norm": 7.392919966269195, + "learning_rate": 1.9845002876131656e-05, + "loss": 0.2008, + "step": 2700 + }, + { + "epoch": 1.098413989426596, + "grad_norm": 4.461882791672582, + "learning_rate": 1.9844825129651368e-05, + "loss": 0.1221, + "step": 2701 + }, + { + "epoch": 1.098820658804392, + "grad_norm": 14.066607212584534, + "learning_rate": 1.9844647282108997e-05, + "loss": 0.5567, + "step": 2702 + }, + { + "epoch": 1.0992273281821878, + "grad_norm": 9.964801680013629, + "learning_rate": 1.9844469333506366e-05, + "loss": 0.1323, + "step": 2703 + }, + { + "epoch": 1.0996339975599838, + "grad_norm": 11.689234521014084, + "learning_rate": 1.9844291283845302e-05, + "loss": 0.1153, + "step": 2704 + }, + { + "epoch": 1.1000406669377796, + "grad_norm": 1.4770740319719518, + "learning_rate": 1.9844113133127637e-05, + "loss": 0.0176, + "step": 2705 + }, + { + "epoch": 1.1004473363155753, + "grad_norm": 9.798188049566642, + "learning_rate": 1.984393488135519e-05, + "loss": 0.2182, + "step": 2706 + }, + { + "epoch": 1.1008540056933713, + "grad_norm": 3.2816828812315553, + "learning_rate": 1.9843756528529803e-05, + "loss": 0.0265, + "step": 2707 + }, + { + "epoch": 1.101260675071167, + "grad_norm": 19.295760456130314, + "learning_rate": 1.9843578074653297e-05, + "loss": 0.557, + "step": 2708 + }, + { + "epoch": 1.101667344448963, + "grad_norm": 22.04015938207401, + "learning_rate": 1.9843399519727508e-05, + "loss": 0.2887, + "step": 2709 + }, + { + "epoch": 1.1020740138267588, + "grad_norm": 5.07179297085069, + "learning_rate": 1.984322086375427e-05, + "loss": 0.0701, + "step": 2710 + }, + { + "epoch": 1.1024806832045546, + "grad_norm": 17.285193396679247, + "learning_rate": 1.9843042106735416e-05, + "loss": 0.8635, + "step": 2711 + }, + { + "epoch": 1.1028873525823506, + "grad_norm": 12.118980467365905, + "learning_rate": 1.984286324867278e-05, + "loss": 0.5988, + "step": 2712 + }, + { + "epoch": 
1.1032940219601464, + "grad_norm": 6.743979843481225, + "learning_rate": 1.9842684289568203e-05, + "loss": 0.0725, + "step": 2713 + }, + { + "epoch": 1.1037006913379424, + "grad_norm": 24.709728104147548, + "learning_rate": 1.9842505229423514e-05, + "loss": 0.6836, + "step": 2714 + }, + { + "epoch": 1.1041073607157381, + "grad_norm": 11.736031998961233, + "learning_rate": 1.9842326068240555e-05, + "loss": 0.3249, + "step": 2715 + }, + { + "epoch": 1.1045140300935339, + "grad_norm": 5.857177111282703, + "learning_rate": 1.9842146806021167e-05, + "loss": 0.0983, + "step": 2716 + }, + { + "epoch": 1.1049206994713299, + "grad_norm": 13.125065776399005, + "learning_rate": 1.984196744276719e-05, + "loss": 0.3675, + "step": 2717 + }, + { + "epoch": 1.1053273688491256, + "grad_norm": 8.461459242154481, + "learning_rate": 1.984178797848046e-05, + "loss": 0.1652, + "step": 2718 + }, + { + "epoch": 1.1057340382269216, + "grad_norm": 35.6300843358467, + "learning_rate": 1.9841608413162826e-05, + "loss": 0.2774, + "step": 2719 + }, + { + "epoch": 1.1061407076047174, + "grad_norm": 15.946235570911906, + "learning_rate": 1.9841428746816128e-05, + "loss": 0.5181, + "step": 2720 + }, + { + "epoch": 1.1065473769825132, + "grad_norm": 9.160142643253494, + "learning_rate": 1.984124897944221e-05, + "loss": 0.209, + "step": 2721 + }, + { + "epoch": 1.1069540463603091, + "grad_norm": 4.23144788153438, + "learning_rate": 1.984106911104292e-05, + "loss": 0.0647, + "step": 2722 + }, + { + "epoch": 1.107360715738105, + "grad_norm": 10.89610350697127, + "learning_rate": 1.9840889141620102e-05, + "loss": 0.4069, + "step": 2723 + }, + { + "epoch": 1.1077673851159007, + "grad_norm": 11.929002725383427, + "learning_rate": 1.9840709071175602e-05, + "loss": 0.1976, + "step": 2724 + }, + { + "epoch": 1.1081740544936967, + "grad_norm": 13.697385901120626, + "learning_rate": 1.9840528899711272e-05, + "loss": 0.3689, + "step": 2725 + }, + { + "epoch": 1.1085807238714924, + "grad_norm": 20.093048152412948, + "learning_rate": 1.984034862722896e-05, + "loss": 0.5281, + "step": 2726 + }, + { + "epoch": 1.1089873932492884, + "grad_norm": 5.571233670505109, + "learning_rate": 1.9840168253730518e-05, + "loss": 0.1125, + "step": 2727 + }, + { + "epoch": 1.1093940626270842, + "grad_norm": 21.419176839571044, + "learning_rate": 1.9839987779217796e-05, + "loss": 0.6158, + "step": 2728 + }, + { + "epoch": 1.10980073200488, + "grad_norm": 9.245380246801531, + "learning_rate": 1.983980720369265e-05, + "loss": 0.2379, + "step": 2729 + }, + { + "epoch": 1.110207401382676, + "grad_norm": 9.293775352744959, + "learning_rate": 1.9839626527156925e-05, + "loss": 0.2187, + "step": 2730 + }, + { + "epoch": 1.1106140707604717, + "grad_norm": 13.947735462413567, + "learning_rate": 1.983944574961248e-05, + "loss": 0.7365, + "step": 2731 + }, + { + "epoch": 1.1110207401382677, + "grad_norm": 2.3176476794754386, + "learning_rate": 1.9839264871061178e-05, + "loss": 0.035, + "step": 2732 + }, + { + "epoch": 1.1114274095160634, + "grad_norm": 6.5590516072968175, + "learning_rate": 1.9839083891504865e-05, + "loss": 0.1907, + "step": 2733 + }, + { + "epoch": 1.1118340788938592, + "grad_norm": 17.478227947398445, + "learning_rate": 1.983890281094541e-05, + "loss": 0.5466, + "step": 2734 + }, + { + "epoch": 1.1122407482716552, + "grad_norm": 7.781476438700461, + "learning_rate": 1.983872162938466e-05, + "loss": 0.1773, + "step": 2735 + }, + { + "epoch": 1.112647417649451, + "grad_norm": 7.318486336914227, + "learning_rate": 1.9838540346824487e-05, + "loss": 
0.0696, + "step": 2736 + }, + { + "epoch": 1.113054087027247, + "grad_norm": 11.618584159784069, + "learning_rate": 1.9838358963266743e-05, + "loss": 0.6656, + "step": 2737 + }, + { + "epoch": 1.1134607564050427, + "grad_norm": 10.196981330376227, + "learning_rate": 1.983817747871329e-05, + "loss": 0.2986, + "step": 2738 + }, + { + "epoch": 1.1138674257828385, + "grad_norm": 2.3418488321938904, + "learning_rate": 1.9837995893166e-05, + "loss": 0.0306, + "step": 2739 + }, + { + "epoch": 1.1142740951606345, + "grad_norm": 2.8903960368355412, + "learning_rate": 1.9837814206626725e-05, + "loss": 0.0498, + "step": 2740 + }, + { + "epoch": 1.1146807645384302, + "grad_norm": 8.52676990498788, + "learning_rate": 1.9837632419097336e-05, + "loss": 0.0818, + "step": 2741 + }, + { + "epoch": 1.115087433916226, + "grad_norm": 15.490761523772914, + "learning_rate": 1.98374505305797e-05, + "loss": 0.9141, + "step": 2742 + }, + { + "epoch": 1.115494103294022, + "grad_norm": 14.027779162492294, + "learning_rate": 1.9837268541075688e-05, + "loss": 0.5342, + "step": 2743 + }, + { + "epoch": 1.1159007726718178, + "grad_norm": 1.9863533213264777, + "learning_rate": 1.9837086450587157e-05, + "loss": 0.0324, + "step": 2744 + }, + { + "epoch": 1.1163074420496137, + "grad_norm": 11.733453512092284, + "learning_rate": 1.9836904259115985e-05, + "loss": 0.4442, + "step": 2745 + }, + { + "epoch": 1.1167141114274095, + "grad_norm": 10.500803708605252, + "learning_rate": 1.9836721966664042e-05, + "loss": 0.5166, + "step": 2746 + }, + { + "epoch": 1.1171207808052053, + "grad_norm": 24.16585837297387, + "learning_rate": 1.9836539573233195e-05, + "loss": 0.5111, + "step": 2747 + }, + { + "epoch": 1.1175274501830013, + "grad_norm": 9.219910127573861, + "learning_rate": 1.9836357078825317e-05, + "loss": 0.4302, + "step": 2748 + }, + { + "epoch": 1.117934119560797, + "grad_norm": 13.054334179787178, + "learning_rate": 1.9836174483442287e-05, + "loss": 0.6596, + "step": 2749 + }, + { + "epoch": 1.118340788938593, + "grad_norm": 6.395262029785598, + "learning_rate": 1.9835991787085972e-05, + "loss": 0.0669, + "step": 2750 + }, + { + "epoch": 1.1187474583163888, + "grad_norm": 11.516125086506786, + "learning_rate": 1.9835808989758256e-05, + "loss": 0.5709, + "step": 2751 + }, + { + "epoch": 1.1191541276941845, + "grad_norm": 14.527674390895768, + "learning_rate": 1.9835626091461003e-05, + "loss": 0.6314, + "step": 2752 + }, + { + "epoch": 1.1195607970719805, + "grad_norm": 4.1204067859356055, + "learning_rate": 1.98354430921961e-05, + "loss": 0.1425, + "step": 2753 + }, + { + "epoch": 1.1199674664497763, + "grad_norm": 10.444700571608461, + "learning_rate": 1.983525999196543e-05, + "loss": 0.2658, + "step": 2754 + }, + { + "epoch": 1.1203741358275723, + "grad_norm": 35.12118885085534, + "learning_rate": 1.983507679077086e-05, + "loss": 1.0056, + "step": 2755 + }, + { + "epoch": 1.120780805205368, + "grad_norm": 13.10933125793243, + "learning_rate": 1.983489348861428e-05, + "loss": 0.4382, + "step": 2756 + }, + { + "epoch": 1.1211874745831638, + "grad_norm": 7.102179891613271, + "learning_rate": 1.9834710085497563e-05, + "loss": 0.1696, + "step": 2757 + }, + { + "epoch": 1.1215941439609598, + "grad_norm": 5.311841792443667, + "learning_rate": 1.98345265814226e-05, + "loss": 0.1997, + "step": 2758 + }, + { + "epoch": 1.1220008133387556, + "grad_norm": 1.1707757358789954, + "learning_rate": 1.9834342976391276e-05, + "loss": 0.0201, + "step": 2759 + }, + { + "epoch": 1.1224074827165516, + "grad_norm": 5.96213667109918, + 
"learning_rate": 1.9834159270405472e-05, + "loss": 0.1185, + "step": 2760 + }, + { + "epoch": 1.1228141520943473, + "grad_norm": 6.343485432064383, + "learning_rate": 1.983397546346707e-05, + "loss": 0.1218, + "step": 2761 + }, + { + "epoch": 1.123220821472143, + "grad_norm": 21.41149226592672, + "learning_rate": 1.9833791555577958e-05, + "loss": 1.8534, + "step": 2762 + }, + { + "epoch": 1.123627490849939, + "grad_norm": 12.595994272311259, + "learning_rate": 1.9833607546740033e-05, + "loss": 0.4844, + "step": 2763 + }, + { + "epoch": 1.1240341602277348, + "grad_norm": 7.336785435040935, + "learning_rate": 1.983342343695517e-05, + "loss": 0.1536, + "step": 2764 + }, + { + "epoch": 1.1244408296055306, + "grad_norm": 7.249991535139949, + "learning_rate": 1.9833239226225273e-05, + "loss": 0.4178, + "step": 2765 + }, + { + "epoch": 1.1248474989833266, + "grad_norm": 12.304529952637676, + "learning_rate": 1.9833054914552226e-05, + "loss": 0.44, + "step": 2766 + }, + { + "epoch": 1.1252541683611224, + "grad_norm": 9.354908335521273, + "learning_rate": 1.9832870501937917e-05, + "loss": 0.3893, + "step": 2767 + }, + { + "epoch": 1.1256608377389183, + "grad_norm": 8.728203485058675, + "learning_rate": 1.9832685988384247e-05, + "loss": 0.1676, + "step": 2768 + }, + { + "epoch": 1.126067507116714, + "grad_norm": 17.730047563423433, + "learning_rate": 1.9832501373893106e-05, + "loss": 0.2848, + "step": 2769 + }, + { + "epoch": 1.1264741764945099, + "grad_norm": 7.247412446136838, + "learning_rate": 1.9832316658466388e-05, + "loss": 0.1917, + "step": 2770 + }, + { + "epoch": 1.1268808458723059, + "grad_norm": 3.09011254063337, + "learning_rate": 1.983213184210599e-05, + "loss": 0.0435, + "step": 2771 + }, + { + "epoch": 1.1272875152501016, + "grad_norm": 17.013058912699886, + "learning_rate": 1.9831946924813813e-05, + "loss": 0.7373, + "step": 2772 + }, + { + "epoch": 1.1276941846278976, + "grad_norm": 23.240009741655683, + "learning_rate": 1.9831761906591756e-05, + "loss": 0.884, + "step": 2773 + }, + { + "epoch": 1.1281008540056934, + "grad_norm": 16.939366693000203, + "learning_rate": 1.9831576787441712e-05, + "loss": 0.6602, + "step": 2774 + }, + { + "epoch": 1.1285075233834891, + "grad_norm": 6.305452994646382, + "learning_rate": 1.9831391567365582e-05, + "loss": 0.1776, + "step": 2775 + }, + { + "epoch": 1.1289141927612851, + "grad_norm": 4.266549556305099, + "learning_rate": 1.983120624636527e-05, + "loss": 0.0511, + "step": 2776 + }, + { + "epoch": 1.129320862139081, + "grad_norm": 8.105698633467236, + "learning_rate": 1.983102082444268e-05, + "loss": 0.2562, + "step": 2777 + }, + { + "epoch": 1.1297275315168767, + "grad_norm": 10.66868840198516, + "learning_rate": 1.983083530159971e-05, + "loss": 0.1037, + "step": 2778 + }, + { + "epoch": 1.1301342008946726, + "grad_norm": 13.902276338004922, + "learning_rate": 1.983064967783827e-05, + "loss": 0.3492, + "step": 2779 + }, + { + "epoch": 1.1305408702724684, + "grad_norm": 8.483908808686294, + "learning_rate": 1.983046395316026e-05, + "loss": 0.1359, + "step": 2780 + }, + { + "epoch": 1.1309475396502644, + "grad_norm": 9.248829078525237, + "learning_rate": 1.9830278127567596e-05, + "loss": 0.1768, + "step": 2781 + }, + { + "epoch": 1.1313542090280602, + "grad_norm": 14.968698473344679, + "learning_rate": 1.9830092201062176e-05, + "loss": 0.5219, + "step": 2782 + }, + { + "epoch": 1.131760878405856, + "grad_norm": 18.71066755765061, + "learning_rate": 1.9829906173645913e-05, + "loss": 0.2224, + "step": 2783 + }, + { + "epoch": 1.132167547783652, 
+ "grad_norm": 14.187211252666149, + "learning_rate": 1.9829720045320714e-05, + "loss": 0.4849, + "step": 2784 + }, + { + "epoch": 1.1325742171614477, + "grad_norm": 32.39154554624037, + "learning_rate": 1.9829533816088493e-05, + "loss": 0.8263, + "step": 2785 + }, + { + "epoch": 1.1329808865392437, + "grad_norm": 0.2580248072247233, + "learning_rate": 1.982934748595116e-05, + "loss": 0.0031, + "step": 2786 + }, + { + "epoch": 1.1333875559170394, + "grad_norm": 13.921900555233856, + "learning_rate": 1.9829161054910632e-05, + "loss": 0.6279, + "step": 2787 + }, + { + "epoch": 1.1337942252948352, + "grad_norm": 12.313059565447494, + "learning_rate": 1.9828974522968812e-05, + "loss": 0.3017, + "step": 2788 + }, + { + "epoch": 1.1342008946726312, + "grad_norm": 11.066802957098274, + "learning_rate": 1.982878789012763e-05, + "loss": 0.5836, + "step": 2789 + }, + { + "epoch": 1.134607564050427, + "grad_norm": 12.151049996996276, + "learning_rate": 1.9828601156388985e-05, + "loss": 0.4926, + "step": 2790 + }, + { + "epoch": 1.135014233428223, + "grad_norm": 8.20300516879117, + "learning_rate": 1.9828414321754806e-05, + "loss": 0.2029, + "step": 2791 + }, + { + "epoch": 1.1354209028060187, + "grad_norm": 11.462203702414024, + "learning_rate": 1.982822738622701e-05, + "loss": 0.4111, + "step": 2792 + }, + { + "epoch": 1.1358275721838145, + "grad_norm": 15.132055916161212, + "learning_rate": 1.982804034980751e-05, + "loss": 0.325, + "step": 2793 + }, + { + "epoch": 1.1362342415616105, + "grad_norm": 6.634052415968836, + "learning_rate": 1.982785321249823e-05, + "loss": 0.1048, + "step": 2794 + }, + { + "epoch": 1.1366409109394062, + "grad_norm": 13.613489069747997, + "learning_rate": 1.982766597430109e-05, + "loss": 0.4628, + "step": 2795 + }, + { + "epoch": 1.1370475803172022, + "grad_norm": 0.9828458752410794, + "learning_rate": 1.9827478635218013e-05, + "loss": 0.0186, + "step": 2796 + }, + { + "epoch": 1.137454249694998, + "grad_norm": 12.288265157767627, + "learning_rate": 1.9827291195250924e-05, + "loss": 0.2679, + "step": 2797 + }, + { + "epoch": 1.1378609190727937, + "grad_norm": 13.065199201219958, + "learning_rate": 1.982710365440174e-05, + "loss": 0.9517, + "step": 2798 + }, + { + "epoch": 1.1382675884505897, + "grad_norm": 13.789209328320874, + "learning_rate": 1.9826916012672398e-05, + "loss": 0.2675, + "step": 2799 + }, + { + "epoch": 1.1386742578283855, + "grad_norm": 30.8528558247052, + "learning_rate": 1.982672827006481e-05, + "loss": 0.8946, + "step": 2800 + }, + { + "epoch": 1.1390809272061815, + "grad_norm": 11.314601044961304, + "learning_rate": 1.9826540426580917e-05, + "loss": 0.4296, + "step": 2801 + }, + { + "epoch": 1.1394875965839772, + "grad_norm": 7.295447837196024, + "learning_rate": 1.982635248222264e-05, + "loss": 0.1147, + "step": 2802 + }, + { + "epoch": 1.139894265961773, + "grad_norm": 27.931880660213018, + "learning_rate": 1.9826164436991905e-05, + "loss": 0.522, + "step": 2803 + }, + { + "epoch": 1.140300935339569, + "grad_norm": 3.3679130138159676, + "learning_rate": 1.982597629089065e-05, + "loss": 0.0585, + "step": 2804 + }, + { + "epoch": 1.1407076047173648, + "grad_norm": 19.96399325978629, + "learning_rate": 1.9825788043920803e-05, + "loss": 0.9787, + "step": 2805 + }, + { + "epoch": 1.1411142740951608, + "grad_norm": 4.192665962099296, + "learning_rate": 1.9825599696084296e-05, + "loss": 0.0801, + "step": 2806 + }, + { + "epoch": 1.1415209434729565, + "grad_norm": 7.66211708886728, + "learning_rate": 1.9825411247383062e-05, + "loss": 0.1286, + "step": 
2807 + }, + { + "epoch": 1.1419276128507523, + "grad_norm": 2.6085421398714894, + "learning_rate": 1.982522269781904e-05, + "loss": 0.0372, + "step": 2808 + }, + { + "epoch": 1.1423342822285483, + "grad_norm": 12.497963740523447, + "learning_rate": 1.9825034047394157e-05, + "loss": 0.4366, + "step": 2809 + }, + { + "epoch": 1.142740951606344, + "grad_norm": 16.625458185794457, + "learning_rate": 1.9824845296110355e-05, + "loss": 0.5202, + "step": 2810 + }, + { + "epoch": 1.1431476209841398, + "grad_norm": 6.145096864268652, + "learning_rate": 1.9824656443969578e-05, + "loss": 0.0856, + "step": 2811 + }, + { + "epoch": 1.1435542903619358, + "grad_norm": 3.710830351841784, + "learning_rate": 1.982446749097375e-05, + "loss": 0.1114, + "step": 2812 + }, + { + "epoch": 1.1439609597397316, + "grad_norm": 12.352303082455606, + "learning_rate": 1.9824278437124823e-05, + "loss": 0.4633, + "step": 2813 + }, + { + "epoch": 1.1443676291175275, + "grad_norm": 10.663472563085557, + "learning_rate": 1.982408928242473e-05, + "loss": 0.1709, + "step": 2814 + }, + { + "epoch": 1.1447742984953233, + "grad_norm": 17.619132185175857, + "learning_rate": 1.982390002687542e-05, + "loss": 1.3669, + "step": 2815 + }, + { + "epoch": 1.145180967873119, + "grad_norm": 6.5950253656997, + "learning_rate": 1.982371067047883e-05, + "loss": 0.2576, + "step": 2816 + }, + { + "epoch": 1.145587637250915, + "grad_norm": 25.784506680675122, + "learning_rate": 1.9823521213236907e-05, + "loss": 1.024, + "step": 2817 + }, + { + "epoch": 1.1459943066287108, + "grad_norm": 4.595520514905329, + "learning_rate": 1.982333165515159e-05, + "loss": 0.054, + "step": 2818 + }, + { + "epoch": 1.1464009760065066, + "grad_norm": 3.5322054773969724, + "learning_rate": 1.9823141996224837e-05, + "loss": 0.0568, + "step": 2819 + }, + { + "epoch": 1.1468076453843026, + "grad_norm": 8.960347764723585, + "learning_rate": 1.9822952236458583e-05, + "loss": 0.4721, + "step": 2820 + }, + { + "epoch": 1.1472143147620983, + "grad_norm": 7.9292960070730505, + "learning_rate": 1.9822762375854782e-05, + "loss": 0.1414, + "step": 2821 + }, + { + "epoch": 1.1476209841398943, + "grad_norm": 6.140719943844023, + "learning_rate": 1.982257241441538e-05, + "loss": 0.1225, + "step": 2822 + }, + { + "epoch": 1.14802765351769, + "grad_norm": 15.347520661804936, + "learning_rate": 1.9822382352142333e-05, + "loss": 0.8068, + "step": 2823 + }, + { + "epoch": 1.1484343228954859, + "grad_norm": 20.634940330715406, + "learning_rate": 1.982219218903758e-05, + "loss": 0.8063, + "step": 2824 + }, + { + "epoch": 1.1488409922732818, + "grad_norm": 12.244140591076029, + "learning_rate": 1.9822001925103087e-05, + "loss": 0.4381, + "step": 2825 + }, + { + "epoch": 1.1492476616510776, + "grad_norm": 38.6034055309236, + "learning_rate": 1.98218115603408e-05, + "loss": 0.1604, + "step": 2826 + }, + { + "epoch": 1.1496543310288736, + "grad_norm": 6.248532788468195, + "learning_rate": 1.982162109475267e-05, + "loss": 0.0972, + "step": 2827 + }, + { + "epoch": 1.1500610004066694, + "grad_norm": 9.029836894725479, + "learning_rate": 1.982143052834066e-05, + "loss": 0.1751, + "step": 2828 + }, + { + "epoch": 1.1504676697844651, + "grad_norm": 15.582740898545403, + "learning_rate": 1.9821239861106724e-05, + "loss": 0.5523, + "step": 2829 + }, + { + "epoch": 1.1508743391622611, + "grad_norm": 11.348870474087365, + "learning_rate": 1.9821049093052816e-05, + "loss": 0.5952, + "step": 2830 + }, + { + "epoch": 1.1512810085400569, + "grad_norm": 12.532639429906835, + "learning_rate": 
1.9820858224180894e-05, + "loss": 0.7812, + "step": 2831 + }, + { + "epoch": 1.1516876779178529, + "grad_norm": 18.333387487279136, + "learning_rate": 1.9820667254492924e-05, + "loss": 1.0978, + "step": 2832 + }, + { + "epoch": 1.1520943472956486, + "grad_norm": 13.307494560038382, + "learning_rate": 1.982047618399086e-05, + "loss": 0.2706, + "step": 2833 + }, + { + "epoch": 1.1525010166734444, + "grad_norm": 17.48663206101558, + "learning_rate": 1.9820285012676664e-05, + "loss": 0.7715, + "step": 2834 + }, + { + "epoch": 1.1529076860512404, + "grad_norm": 14.12041528952888, + "learning_rate": 1.98200937405523e-05, + "loss": 0.6048, + "step": 2835 + }, + { + "epoch": 1.1533143554290362, + "grad_norm": 4.947647488232754, + "learning_rate": 1.9819902367619735e-05, + "loss": 0.1055, + "step": 2836 + }, + { + "epoch": 1.1537210248068321, + "grad_norm": 8.112729372500786, + "learning_rate": 1.9819710893880927e-05, + "loss": 0.1549, + "step": 2837 + }, + { + "epoch": 1.154127694184628, + "grad_norm": 12.160340926265766, + "learning_rate": 1.9819519319337846e-05, + "loss": 0.4768, + "step": 2838 + }, + { + "epoch": 1.1545343635624237, + "grad_norm": 6.505170942828164, + "learning_rate": 1.9819327643992454e-05, + "loss": 0.1185, + "step": 2839 + }, + { + "epoch": 1.1549410329402197, + "grad_norm": 3.2433242183438966, + "learning_rate": 1.9819135867846722e-05, + "loss": 0.0486, + "step": 2840 + }, + { + "epoch": 1.1553477023180154, + "grad_norm": 4.238577135916189, + "learning_rate": 1.9818943990902624e-05, + "loss": 0.0874, + "step": 2841 + }, + { + "epoch": 1.1557543716958114, + "grad_norm": 13.718800801392733, + "learning_rate": 1.981875201316212e-05, + "loss": 0.4659, + "step": 2842 + }, + { + "epoch": 1.1561610410736072, + "grad_norm": 13.584050008459611, + "learning_rate": 1.9818559934627184e-05, + "loss": 0.4787, + "step": 2843 + }, + { + "epoch": 1.156567710451403, + "grad_norm": 6.371264570081237, + "learning_rate": 1.981836775529979e-05, + "loss": 0.1335, + "step": 2844 + }, + { + "epoch": 1.156974379829199, + "grad_norm": 15.034579647798758, + "learning_rate": 1.981817547518191e-05, + "loss": 0.2637, + "step": 2845 + }, + { + "epoch": 1.1573810492069947, + "grad_norm": 4.683830272004993, + "learning_rate": 1.981798309427552e-05, + "loss": 0.0661, + "step": 2846 + }, + { + "epoch": 1.1577877185847907, + "grad_norm": 5.653567486086086, + "learning_rate": 1.9817790612582587e-05, + "loss": 0.0939, + "step": 2847 + }, + { + "epoch": 1.1581943879625864, + "grad_norm": 6.073025284154953, + "learning_rate": 1.9817598030105092e-05, + "loss": 0.2704, + "step": 2848 + }, + { + "epoch": 1.1586010573403822, + "grad_norm": 9.124515673381923, + "learning_rate": 1.9817405346845014e-05, + "loss": 0.1361, + "step": 2849 + }, + { + "epoch": 1.1590077267181782, + "grad_norm": 11.233010123445828, + "learning_rate": 1.981721256280433e-05, + "loss": 0.3785, + "step": 2850 + }, + { + "epoch": 1.159414396095974, + "grad_norm": 9.087074241886752, + "learning_rate": 1.9817019677985017e-05, + "loss": 0.1406, + "step": 2851 + }, + { + "epoch": 1.1598210654737697, + "grad_norm": 2.7554769454676133, + "learning_rate": 1.981682669238906e-05, + "loss": 0.0484, + "step": 2852 + }, + { + "epoch": 1.1602277348515657, + "grad_norm": 41.85381785424317, + "learning_rate": 1.9816633606018433e-05, + "loss": 1.3121, + "step": 2853 + }, + { + "epoch": 1.1606344042293615, + "grad_norm": 11.47506841830965, + "learning_rate": 1.981644041887512e-05, + "loss": 0.5636, + "step": 2854 + }, + { + "epoch": 1.1610410736071575, + 
"grad_norm": 1.4231530705956983, + "learning_rate": 1.981624713096111e-05, + "loss": 0.0358, + "step": 2855 + }, + { + "epoch": 1.1614477429849532, + "grad_norm": 17.800997313263235, + "learning_rate": 1.981605374227838e-05, + "loss": 0.5332, + "step": 2856 + }, + { + "epoch": 1.161854412362749, + "grad_norm": 10.279570392104157, + "learning_rate": 1.9815860252828915e-05, + "loss": 0.4527, + "step": 2857 + }, + { + "epoch": 1.162261081740545, + "grad_norm": 1.0065416954159836, + "learning_rate": 1.981566666261471e-05, + "loss": 0.0151, + "step": 2858 + }, + { + "epoch": 1.1626677511183408, + "grad_norm": 8.871467322525943, + "learning_rate": 1.9815472971637742e-05, + "loss": 0.3316, + "step": 2859 + }, + { + "epoch": 1.1630744204961365, + "grad_norm": 2.436061787684322, + "learning_rate": 1.9815279179900005e-05, + "loss": 0.0296, + "step": 2860 + }, + { + "epoch": 1.1634810898739325, + "grad_norm": 1.037891101876026, + "learning_rate": 1.9815085287403488e-05, + "loss": 0.0129, + "step": 2861 + }, + { + "epoch": 1.1638877592517283, + "grad_norm": 15.167766839219029, + "learning_rate": 1.9814891294150183e-05, + "loss": 0.5383, + "step": 2862 + }, + { + "epoch": 1.1642944286295243, + "grad_norm": 11.999647210676603, + "learning_rate": 1.9814697200142078e-05, + "loss": 0.293, + "step": 2863 + }, + { + "epoch": 1.16470109800732, + "grad_norm": 0.5628069747352404, + "learning_rate": 1.9814503005381166e-05, + "loss": 0.0068, + "step": 2864 + }, + { + "epoch": 1.1651077673851158, + "grad_norm": 7.108767042473237, + "learning_rate": 1.9814308709869436e-05, + "loss": 0.1675, + "step": 2865 + }, + { + "epoch": 1.1655144367629118, + "grad_norm": 4.929620033444732, + "learning_rate": 1.9814114313608896e-05, + "loss": 0.1386, + "step": 2866 + }, + { + "epoch": 1.1659211061407075, + "grad_norm": 3.934395333763552, + "learning_rate": 1.981391981660153e-05, + "loss": 0.0552, + "step": 2867 + }, + { + "epoch": 1.1663277755185035, + "grad_norm": 15.191813035970199, + "learning_rate": 1.9813725218849337e-05, + "loss": 0.4468, + "step": 2868 + }, + { + "epoch": 1.1667344448962993, + "grad_norm": 3.047134700993606, + "learning_rate": 1.9813530520354313e-05, + "loss": 0.05, + "step": 2869 + }, + { + "epoch": 1.167141114274095, + "grad_norm": 18.264904995352232, + "learning_rate": 1.9813335721118462e-05, + "loss": 0.3712, + "step": 2870 + }, + { + "epoch": 1.167547783651891, + "grad_norm": 5.249408475814902, + "learning_rate": 1.9813140821143783e-05, + "loss": 0.0665, + "step": 2871 + }, + { + "epoch": 1.1679544530296868, + "grad_norm": 4.562595400982473, + "learning_rate": 1.981294582043227e-05, + "loss": 0.1268, + "step": 2872 + }, + { + "epoch": 1.1683611224074828, + "grad_norm": 3.1566297523097333, + "learning_rate": 1.9812750718985934e-05, + "loss": 0.0429, + "step": 2873 + }, + { + "epoch": 1.1687677917852786, + "grad_norm": 13.975481539526852, + "learning_rate": 1.981255551680677e-05, + "loss": 0.4915, + "step": 2874 + }, + { + "epoch": 1.1691744611630743, + "grad_norm": 15.286838867949452, + "learning_rate": 1.9812360213896786e-05, + "loss": 0.7181, + "step": 2875 + }, + { + "epoch": 1.1695811305408703, + "grad_norm": 15.181897436168732, + "learning_rate": 1.9812164810257987e-05, + "loss": 0.8588, + "step": 2876 + }, + { + "epoch": 1.169987799918666, + "grad_norm": 20.14798036717175, + "learning_rate": 1.9811969305892377e-05, + "loss": 1.265, + "step": 2877 + }, + { + "epoch": 1.170394469296462, + "grad_norm": 8.986235521497905, + "learning_rate": 1.9811773700801964e-05, + "loss": 0.3244, + "step": 2878 + 
}, + { + "epoch": 1.1708011386742578, + "grad_norm": 8.815143292506692, + "learning_rate": 1.9811577994988755e-05, + "loss": 0.2492, + "step": 2879 + }, + { + "epoch": 1.1712078080520536, + "grad_norm": 11.043832397419195, + "learning_rate": 1.9811382188454763e-05, + "loss": 0.6519, + "step": 2880 + }, + { + "epoch": 1.1716144774298496, + "grad_norm": 18.016213881164212, + "learning_rate": 1.9811186281201992e-05, + "loss": 0.6529, + "step": 2881 + }, + { + "epoch": 1.1720211468076454, + "grad_norm": 19.82853573182277, + "learning_rate": 1.9810990273232456e-05, + "loss": 0.884, + "step": 2882 + }, + { + "epoch": 1.1724278161854413, + "grad_norm": 6.926680609722943, + "learning_rate": 1.9810794164548165e-05, + "loss": 0.3337, + "step": 2883 + }, + { + "epoch": 1.172834485563237, + "grad_norm": 24.94839275671697, + "learning_rate": 1.9810597955151138e-05, + "loss": 0.7739, + "step": 2884 + }, + { + "epoch": 1.1732411549410329, + "grad_norm": 10.00709246413382, + "learning_rate": 1.9810401645043384e-05, + "loss": 0.1721, + "step": 2885 + }, + { + "epoch": 1.1736478243188289, + "grad_norm": 4.4239368686142155, + "learning_rate": 1.981020523422692e-05, + "loss": 0.0437, + "step": 2886 + }, + { + "epoch": 1.1740544936966246, + "grad_norm": 5.658758627062879, + "learning_rate": 1.981000872270376e-05, + "loss": 0.1059, + "step": 2887 + }, + { + "epoch": 1.1744611630744206, + "grad_norm": 24.58556465930508, + "learning_rate": 1.9809812110475926e-05, + "loss": 0.6561, + "step": 2888 + }, + { + "epoch": 1.1748678324522164, + "grad_norm": 19.543787786769297, + "learning_rate": 1.9809615397545432e-05, + "loss": 0.698, + "step": 2889 + }, + { + "epoch": 1.1752745018300121, + "grad_norm": 1.6457494398579982, + "learning_rate": 1.9809418583914298e-05, + "loss": 0.0221, + "step": 2890 + }, + { + "epoch": 1.1756811712078081, + "grad_norm": 15.024139886311026, + "learning_rate": 1.980922166958455e-05, + "loss": 0.2657, + "step": 2891 + }, + { + "epoch": 1.176087840585604, + "grad_norm": 7.989314312785944, + "learning_rate": 1.9809024654558197e-05, + "loss": 0.1274, + "step": 2892 + }, + { + "epoch": 1.1764945099633997, + "grad_norm": 13.542637350768276, + "learning_rate": 1.9808827538837275e-05, + "loss": 0.6253, + "step": 2893 + }, + { + "epoch": 1.1769011793411956, + "grad_norm": 2.4717386729220636, + "learning_rate": 1.9808630322423797e-05, + "loss": 0.0452, + "step": 2894 + }, + { + "epoch": 1.1773078487189914, + "grad_norm": 5.591272869783841, + "learning_rate": 1.9808433005319796e-05, + "loss": 0.069, + "step": 2895 + }, + { + "epoch": 1.1777145180967874, + "grad_norm": 4.440332066883205, + "learning_rate": 1.9808235587527294e-05, + "loss": 0.1042, + "step": 2896 + }, + { + "epoch": 1.1781211874745832, + "grad_norm": 7.804053251035937, + "learning_rate": 1.9808038069048315e-05, + "loss": 0.2507, + "step": 2897 + }, + { + "epoch": 1.178527856852379, + "grad_norm": 4.653701769318493, + "learning_rate": 1.980784044988489e-05, + "loss": 0.1042, + "step": 2898 + }, + { + "epoch": 1.178934526230175, + "grad_norm": 10.767401339134722, + "learning_rate": 1.9807642730039047e-05, + "loss": 0.5056, + "step": 2899 + }, + { + "epoch": 1.1793411956079707, + "grad_norm": 24.31860571232435, + "learning_rate": 1.9807444909512818e-05, + "loss": 1.0839, + "step": 2900 + }, + { + "epoch": 1.1797478649857664, + "grad_norm": 5.050783862462147, + "learning_rate": 1.9807246988308226e-05, + "loss": 0.1321, + "step": 2901 + }, + { + "epoch": 1.1801545343635624, + "grad_norm": 10.683332239146162, + "learning_rate": 
1.980704896642731e-05, + "loss": 0.5853, + "step": 2902 + }, + { + "epoch": 1.1805612037413582, + "grad_norm": 19.84734553104155, + "learning_rate": 1.98068508438721e-05, + "loss": 0.8363, + "step": 2903 + }, + { + "epoch": 1.1809678731191542, + "grad_norm": 19.5040620568025, + "learning_rate": 1.980665262064463e-05, + "loss": 0.693, + "step": 2904 + }, + { + "epoch": 1.18137454249695, + "grad_norm": 7.569659291274166, + "learning_rate": 1.9806454296746936e-05, + "loss": 0.2248, + "step": 2905 + }, + { + "epoch": 1.1817812118747457, + "grad_norm": 15.851765796538432, + "learning_rate": 1.980625587218105e-05, + "loss": 0.6367, + "step": 2906 + }, + { + "epoch": 1.1821878812525417, + "grad_norm": 0.3328071848552989, + "learning_rate": 1.9806057346949017e-05, + "loss": 0.0059, + "step": 2907 + }, + { + "epoch": 1.1825945506303375, + "grad_norm": 7.277360020546121, + "learning_rate": 1.9805858721052868e-05, + "loss": 0.2294, + "step": 2908 + }, + { + "epoch": 1.1830012200081335, + "grad_norm": 21.732632398678444, + "learning_rate": 1.9805659994494645e-05, + "loss": 0.6406, + "step": 2909 + }, + { + "epoch": 1.1834078893859292, + "grad_norm": 4.919376187806003, + "learning_rate": 1.980546116727639e-05, + "loss": 0.0438, + "step": 2910 + }, + { + "epoch": 1.183814558763725, + "grad_norm": 15.827290682089616, + "learning_rate": 1.9805262239400135e-05, + "loss": 0.4876, + "step": 2911 + }, + { + "epoch": 1.184221228141521, + "grad_norm": 3.399096671108493, + "learning_rate": 1.980506321086793e-05, + "loss": 0.0737, + "step": 2912 + }, + { + "epoch": 1.1846278975193167, + "grad_norm": 9.370845954326718, + "learning_rate": 1.9804864081681815e-05, + "loss": 0.3366, + "step": 2913 + }, + { + "epoch": 1.1850345668971127, + "grad_norm": 6.539852425474146, + "learning_rate": 1.980466485184384e-05, + "loss": 0.184, + "step": 2914 + }, + { + "epoch": 1.1854412362749085, + "grad_norm": 11.465853567995476, + "learning_rate": 1.980446552135604e-05, + "loss": 0.1305, + "step": 2915 + }, + { + "epoch": 1.1858479056527043, + "grad_norm": 11.43528520904096, + "learning_rate": 1.980426609022047e-05, + "loss": 0.8852, + "step": 2916 + }, + { + "epoch": 1.1862545750305002, + "grad_norm": 1.2708172352360632, + "learning_rate": 1.9804066558439175e-05, + "loss": 0.0212, + "step": 2917 + }, + { + "epoch": 1.186661244408296, + "grad_norm": 12.112115192976319, + "learning_rate": 1.9803866926014205e-05, + "loss": 0.4346, + "step": 2918 + }, + { + "epoch": 1.187067913786092, + "grad_norm": 20.33792415059022, + "learning_rate": 1.98036671929476e-05, + "loss": 0.6556, + "step": 2919 + }, + { + "epoch": 1.1874745831638878, + "grad_norm": 17.84994321716083, + "learning_rate": 1.9803467359241422e-05, + "loss": 1.7607, + "step": 2920 + }, + { + "epoch": 1.1878812525416835, + "grad_norm": 2.8235715077297576, + "learning_rate": 1.9803267424897717e-05, + "loss": 0.0475, + "step": 2921 + }, + { + "epoch": 1.1882879219194795, + "grad_norm": 9.25930121301106, + "learning_rate": 1.980306738991854e-05, + "loss": 0.2637, + "step": 2922 + }, + { + "epoch": 1.1886945912972753, + "grad_norm": 33.842845168702965, + "learning_rate": 1.9802867254305938e-05, + "loss": 0.8155, + "step": 2923 + }, + { + "epoch": 1.1891012606750713, + "grad_norm": 1.0881804793444152, + "learning_rate": 1.980266701806197e-05, + "loss": 0.0111, + "step": 2924 + }, + { + "epoch": 1.189507930052867, + "grad_norm": 11.665600158359094, + "learning_rate": 1.9802466681188693e-05, + "loss": 0.3402, + "step": 2925 + }, + { + "epoch": 1.1899145994306628, + "grad_norm": 
8.927913099022838, + "learning_rate": 1.9802266243688166e-05, + "loss": 0.202, + "step": 2926 + }, + { + "epoch": 1.1903212688084588, + "grad_norm": 11.680044816445745, + "learning_rate": 1.9802065705562435e-05, + "loss": 0.4301, + "step": 2927 + }, + { + "epoch": 1.1907279381862546, + "grad_norm": 7.892557476661442, + "learning_rate": 1.980186506681357e-05, + "loss": 0.2431, + "step": 2928 + }, + { + "epoch": 1.1911346075640505, + "grad_norm": 13.119695989993689, + "learning_rate": 1.980166432744363e-05, + "loss": 0.3753, + "step": 2929 + }, + { + "epoch": 1.1915412769418463, + "grad_norm": 3.827484398151878, + "learning_rate": 1.980146348745467e-05, + "loss": 0.0486, + "step": 2930 + }, + { + "epoch": 1.191947946319642, + "grad_norm": 7.878338116222518, + "learning_rate": 1.9801262546848756e-05, + "loss": 0.2223, + "step": 2931 + }, + { + "epoch": 1.192354615697438, + "grad_norm": 8.432176717040234, + "learning_rate": 1.9801061505627945e-05, + "loss": 0.3331, + "step": 2932 + }, + { + "epoch": 1.1927612850752338, + "grad_norm": 28.227533314784647, + "learning_rate": 1.9800860363794308e-05, + "loss": 0.8906, + "step": 2933 + }, + { + "epoch": 1.1931679544530298, + "grad_norm": 8.751573718926224, + "learning_rate": 1.9800659121349907e-05, + "loss": 0.1632, + "step": 2934 + }, + { + "epoch": 1.1935746238308256, + "grad_norm": 11.74571642110638, + "learning_rate": 1.9800457778296803e-05, + "loss": 0.6066, + "step": 2935 + }, + { + "epoch": 1.1939812932086213, + "grad_norm": 12.708555158909567, + "learning_rate": 1.9800256334637073e-05, + "loss": 0.2528, + "step": 2936 + }, + { + "epoch": 1.1943879625864173, + "grad_norm": 7.032044220790961, + "learning_rate": 1.9800054790372777e-05, + "loss": 0.1831, + "step": 2937 + }, + { + "epoch": 1.194794631964213, + "grad_norm": 4.59716466801262, + "learning_rate": 1.9799853145505984e-05, + "loss": 0.0665, + "step": 2938 + }, + { + "epoch": 1.1952013013420089, + "grad_norm": 6.399649212974649, + "learning_rate": 1.9799651400038772e-05, + "loss": 0.1492, + "step": 2939 + }, + { + "epoch": 1.1956079707198048, + "grad_norm": 6.978540667847402, + "learning_rate": 1.97994495539732e-05, + "loss": 0.1147, + "step": 2940 + }, + { + "epoch": 1.1960146400976006, + "grad_norm": 7.239821501855836, + "learning_rate": 1.979924760731135e-05, + "loss": 0.2512, + "step": 2941 + }, + { + "epoch": 1.1964213094753966, + "grad_norm": 10.66901216564576, + "learning_rate": 1.9799045560055292e-05, + "loss": 0.3585, + "step": 2942 + }, + { + "epoch": 1.1968279788531924, + "grad_norm": 8.635110972394147, + "learning_rate": 1.9798843412207098e-05, + "loss": 0.2077, + "step": 2943 + }, + { + "epoch": 1.1972346482309881, + "grad_norm": 18.57349250569016, + "learning_rate": 1.9798641163768842e-05, + "loss": 0.6815, + "step": 2944 + }, + { + "epoch": 1.1976413176087841, + "grad_norm": 0.46141856949679344, + "learning_rate": 1.979843881474261e-05, + "loss": 0.0058, + "step": 2945 + }, + { + "epoch": 1.1980479869865799, + "grad_norm": 18.64922523787469, + "learning_rate": 1.9798236365130467e-05, + "loss": 0.7958, + "step": 2946 + }, + { + "epoch": 1.1984546563643756, + "grad_norm": 18.099086492322552, + "learning_rate": 1.9798033814934498e-05, + "loss": 0.7972, + "step": 2947 + }, + { + "epoch": 1.1988613257421716, + "grad_norm": 15.081343584011396, + "learning_rate": 1.979783116415678e-05, + "loss": 0.3657, + "step": 2948 + }, + { + "epoch": 1.1992679951199674, + "grad_norm": 5.006006786162578, + "learning_rate": 1.9797628412799394e-05, + "loss": 0.0791, + "step": 2949 + }, + { + 
"epoch": 1.1996746644977634, + "grad_norm": 8.348002653739792, + "learning_rate": 1.9797425560864424e-05, + "loss": 0.2662, + "step": 2950 + }, + { + "epoch": 1.2000813338755592, + "grad_norm": 13.771495090205073, + "learning_rate": 1.9797222608353944e-05, + "loss": 0.6712, + "step": 2951 + }, + { + "epoch": 1.200488003253355, + "grad_norm": 9.83450875361603, + "learning_rate": 1.979701955527005e-05, + "loss": 0.2965, + "step": 2952 + }, + { + "epoch": 1.200894672631151, + "grad_norm": 1.0960830129972292, + "learning_rate": 1.979681640161481e-05, + "loss": 0.0213, + "step": 2953 + }, + { + "epoch": 1.2013013420089467, + "grad_norm": 28.955814880016714, + "learning_rate": 1.9796613147390328e-05, + "loss": 1.1431, + "step": 2954 + }, + { + "epoch": 1.2017080113867427, + "grad_norm": 7.778990053030239, + "learning_rate": 1.979640979259868e-05, + "loss": 0.2234, + "step": 2955 + }, + { + "epoch": 1.2021146807645384, + "grad_norm": 9.217540379925383, + "learning_rate": 1.979620633724195e-05, + "loss": 0.4105, + "step": 2956 + }, + { + "epoch": 1.2025213501423342, + "grad_norm": 21.27574746587229, + "learning_rate": 1.9796002781322236e-05, + "loss": 0.8064, + "step": 2957 + }, + { + "epoch": 1.2029280195201302, + "grad_norm": 8.499086025557236, + "learning_rate": 1.9795799124841623e-05, + "loss": 0.175, + "step": 2958 + }, + { + "epoch": 1.203334688897926, + "grad_norm": 15.946998086333197, + "learning_rate": 1.97955953678022e-05, + "loss": 0.5537, + "step": 2959 + }, + { + "epoch": 1.203741358275722, + "grad_norm": 4.1659422418633385, + "learning_rate": 1.979539151020606e-05, + "loss": 0.0686, + "step": 2960 + }, + { + "epoch": 1.2041480276535177, + "grad_norm": 19.16567304269657, + "learning_rate": 1.97951875520553e-05, + "loss": 1.0941, + "step": 2961 + }, + { + "epoch": 1.2045546970313135, + "grad_norm": 18.177135920835646, + "learning_rate": 1.9794983493352005e-05, + "loss": 0.4199, + "step": 2962 + }, + { + "epoch": 1.2049613664091094, + "grad_norm": 16.419602097497386, + "learning_rate": 1.979477933409828e-05, + "loss": 0.7292, + "step": 2963 + }, + { + "epoch": 1.2053680357869052, + "grad_norm": 8.089390326638286, + "learning_rate": 1.979457507429621e-05, + "loss": 0.1958, + "step": 2964 + }, + { + "epoch": 1.2057747051647012, + "grad_norm": 9.7976858857087, + "learning_rate": 1.9794370713947902e-05, + "loss": 0.406, + "step": 2965 + }, + { + "epoch": 1.206181374542497, + "grad_norm": 8.196393622654435, + "learning_rate": 1.9794166253055448e-05, + "loss": 0.286, + "step": 2966 + }, + { + "epoch": 1.2065880439202927, + "grad_norm": 6.636518973641589, + "learning_rate": 1.9793961691620945e-05, + "loss": 0.148, + "step": 2967 + }, + { + "epoch": 1.2069947132980887, + "grad_norm": 14.553361853829934, + "learning_rate": 1.97937570296465e-05, + "loss": 0.4817, + "step": 2968 + }, + { + "epoch": 1.2074013826758845, + "grad_norm": 9.846014105318737, + "learning_rate": 1.979355226713421e-05, + "loss": 0.2495, + "step": 2969 + }, + { + "epoch": 1.2078080520536805, + "grad_norm": 13.18825877976208, + "learning_rate": 1.9793347404086172e-05, + "loss": 0.6548, + "step": 2970 + }, + { + "epoch": 1.2082147214314762, + "grad_norm": 15.957019915687617, + "learning_rate": 1.9793142440504502e-05, + "loss": 0.1163, + "step": 2971 + }, + { + "epoch": 1.208621390809272, + "grad_norm": 21.449631320286827, + "learning_rate": 1.9792937376391294e-05, + "loss": 0.8676, + "step": 2972 + }, + { + "epoch": 1.209028060187068, + "grad_norm": 5.130915874503473, + "learning_rate": 1.9792732211748647e-05, + "loss": 0.093, 
+ "step": 2973 + }, + { + "epoch": 1.2094347295648638, + "grad_norm": 19.149080056180065, + "learning_rate": 1.9792526946578684e-05, + "loss": 0.5669, + "step": 2974 + }, + { + "epoch": 1.2098413989426597, + "grad_norm": 11.525172019675017, + "learning_rate": 1.97923215808835e-05, + "loss": 0.2982, + "step": 2975 + }, + { + "epoch": 1.2102480683204555, + "grad_norm": 24.018105032306842, + "learning_rate": 1.979211611466521e-05, + "loss": 0.3642, + "step": 2976 + }, + { + "epoch": 1.2106547376982513, + "grad_norm": 13.42070746040456, + "learning_rate": 1.9791910547925915e-05, + "loss": 0.5115, + "step": 2977 + }, + { + "epoch": 1.2110614070760473, + "grad_norm": 8.382647411950824, + "learning_rate": 1.979170488066773e-05, + "loss": 0.1834, + "step": 2978 + }, + { + "epoch": 1.211468076453843, + "grad_norm": 13.13155038309426, + "learning_rate": 1.979149911289277e-05, + "loss": 0.5175, + "step": 2979 + }, + { + "epoch": 1.2118747458316388, + "grad_norm": 11.468455189445265, + "learning_rate": 1.979129324460314e-05, + "loss": 0.3396, + "step": 2980 + }, + { + "epoch": 1.2122814152094348, + "grad_norm": 12.422399901675007, + "learning_rate": 1.979108727580096e-05, + "loss": 0.5023, + "step": 2981 + }, + { + "epoch": 1.2126880845872305, + "grad_norm": 0.3034883262872181, + "learning_rate": 1.9790881206488342e-05, + "loss": 0.0048, + "step": 2982 + }, + { + "epoch": 1.2130947539650265, + "grad_norm": 11.00007462857206, + "learning_rate": 1.9790675036667397e-05, + "loss": 0.4645, + "step": 2983 + }, + { + "epoch": 1.2135014233428223, + "grad_norm": 12.222651386620049, + "learning_rate": 1.979046876634025e-05, + "loss": 0.4576, + "step": 2984 + }, + { + "epoch": 1.213908092720618, + "grad_norm": 6.667188901542939, + "learning_rate": 1.979026239550901e-05, + "loss": 0.1337, + "step": 2985 + }, + { + "epoch": 1.214314762098414, + "grad_norm": 4.7591218331742065, + "learning_rate": 1.97900559241758e-05, + "loss": 0.1368, + "step": 2986 + }, + { + "epoch": 1.2147214314762098, + "grad_norm": 14.870954426848536, + "learning_rate": 1.9789849352342737e-05, + "loss": 0.7409, + "step": 2987 + }, + { + "epoch": 1.2151281008540056, + "grad_norm": 1.536471576640913, + "learning_rate": 1.9789642680011947e-05, + "loss": 0.0229, + "step": 2988 + }, + { + "epoch": 1.2155347702318016, + "grad_norm": 9.56203164518016, + "learning_rate": 1.9789435907185547e-05, + "loss": 0.2205, + "step": 2989 + }, + { + "epoch": 1.2159414396095973, + "grad_norm": 6.261865822533935, + "learning_rate": 1.978922903386566e-05, + "loss": 0.1831, + "step": 2990 + }, + { + "epoch": 1.2163481089873933, + "grad_norm": 13.27815232363884, + "learning_rate": 1.978902206005441e-05, + "loss": 0.4406, + "step": 2991 + }, + { + "epoch": 1.216754778365189, + "grad_norm": 21.236817361568825, + "learning_rate": 1.9788814985753923e-05, + "loss": 0.9691, + "step": 2992 + }, + { + "epoch": 1.2171614477429848, + "grad_norm": 17.51609480107211, + "learning_rate": 1.978860781096632e-05, + "loss": 0.7206, + "step": 2993 + }, + { + "epoch": 1.2175681171207808, + "grad_norm": 7.694124538148311, + "learning_rate": 1.978840053569373e-05, + "loss": 0.1863, + "step": 2994 + }, + { + "epoch": 1.2179747864985766, + "grad_norm": 11.941160675597496, + "learning_rate": 1.9788193159938288e-05, + "loss": 0.2831, + "step": 2995 + }, + { + "epoch": 1.2183814558763726, + "grad_norm": 5.962617788043496, + "learning_rate": 1.9787985683702115e-05, + "loss": 0.118, + "step": 2996 + }, + { + "epoch": 1.2187881252541684, + "grad_norm": 16.480673054411834, + "learning_rate": 
1.978777810698734e-05, + "loss": 0.9837, + "step": 2997 + }, + { + "epoch": 1.2191947946319641, + "grad_norm": 2.895943581421062, + "learning_rate": 1.9787570429796103e-05, + "loss": 0.0353, + "step": 2998 + }, + { + "epoch": 1.21960146400976, + "grad_norm": 6.933849179035831, + "learning_rate": 1.9787362652130523e-05, + "loss": 0.1163, + "step": 2999 + }, + { + "epoch": 1.2200081333875559, + "grad_norm": 14.085873677925816, + "learning_rate": 1.9787154773992744e-05, + "loss": 0.2586, + "step": 3000 + }, + { + "epoch": 1.2204148027653519, + "grad_norm": 11.58727842393831, + "learning_rate": 1.978694679538489e-05, + "loss": 0.3421, + "step": 3001 + }, + { + "epoch": 1.2208214721431476, + "grad_norm": 6.135724516914884, + "learning_rate": 1.9786738716309108e-05, + "loss": 0.124, + "step": 3002 + }, + { + "epoch": 1.2212281415209434, + "grad_norm": 2.3434928511638464, + "learning_rate": 1.978653053676752e-05, + "loss": 0.0585, + "step": 3003 + }, + { + "epoch": 1.2216348108987394, + "grad_norm": 4.584021642405044, + "learning_rate": 1.9786322256762277e-05, + "loss": 0.1702, + "step": 3004 + }, + { + "epoch": 1.2220414802765351, + "grad_norm": 33.01243042192851, + "learning_rate": 1.978611387629551e-05, + "loss": 0.5847, + "step": 3005 + }, + { + "epoch": 1.2224481496543311, + "grad_norm": 15.48563818778481, + "learning_rate": 1.9785905395369355e-05, + "loss": 0.7424, + "step": 3006 + }, + { + "epoch": 1.222854819032127, + "grad_norm": 12.021402151709191, + "learning_rate": 1.978569681398596e-05, + "loss": 0.6007, + "step": 3007 + }, + { + "epoch": 1.2232614884099227, + "grad_norm": 10.831318242608003, + "learning_rate": 1.978548813214746e-05, + "loss": 0.2633, + "step": 3008 + }, + { + "epoch": 1.2236681577877186, + "grad_norm": 13.178323196722449, + "learning_rate": 1.9785279349856e-05, + "loss": 0.7507, + "step": 3009 + }, + { + "epoch": 1.2240748271655144, + "grad_norm": 10.738855549939549, + "learning_rate": 1.9785070467113723e-05, + "loss": 0.6079, + "step": 3010 + }, + { + "epoch": 1.2244814965433104, + "grad_norm": 13.318510207165302, + "learning_rate": 1.978486148392277e-05, + "loss": 0.5561, + "step": 3011 + }, + { + "epoch": 1.2248881659211062, + "grad_norm": 10.551213020742873, + "learning_rate": 1.9784652400285293e-05, + "loss": 0.139, + "step": 3012 + }, + { + "epoch": 1.225294835298902, + "grad_norm": 1.1240807135183255, + "learning_rate": 1.9784443216203436e-05, + "loss": 0.0121, + "step": 3013 + }, + { + "epoch": 1.225701504676698, + "grad_norm": 7.4870223457531315, + "learning_rate": 1.978423393167934e-05, + "loss": 0.1004, + "step": 3014 + }, + { + "epoch": 1.2261081740544937, + "grad_norm": 2.3335354207135195, + "learning_rate": 1.978402454671516e-05, + "loss": 0.0493, + "step": 3015 + }, + { + "epoch": 1.2265148434322897, + "grad_norm": 27.746972229240843, + "learning_rate": 1.978381506131305e-05, + "loss": 1.0859, + "step": 3016 + }, + { + "epoch": 1.2269215128100854, + "grad_norm": 6.302340052801195, + "learning_rate": 1.9783605475475143e-05, + "loss": 0.2387, + "step": 3017 + }, + { + "epoch": 1.2273281821878812, + "grad_norm": 15.787617274916853, + "learning_rate": 1.978339578920361e-05, + "loss": 0.7862, + "step": 3018 + }, + { + "epoch": 1.2277348515656772, + "grad_norm": 7.889487028403832, + "learning_rate": 1.9783186002500593e-05, + "loss": 0.4006, + "step": 3019 + }, + { + "epoch": 1.228141520943473, + "grad_norm": 7.532878077478514, + "learning_rate": 1.978297611536825e-05, + "loss": 0.1329, + "step": 3020 + }, + { + "epoch": 1.2285481903212687, + "grad_norm": 
5.432392123076222, + "learning_rate": 1.978276612780873e-05, + "loss": 0.1203, + "step": 3021 + }, + { + "epoch": 1.2289548596990647, + "grad_norm": 15.208688180584693, + "learning_rate": 1.978255603982419e-05, + "loss": 0.8442, + "step": 3022 + }, + { + "epoch": 1.2293615290768605, + "grad_norm": 10.864768748066684, + "learning_rate": 1.9782345851416795e-05, + "loss": 0.2743, + "step": 3023 + }, + { + "epoch": 1.2297681984546565, + "grad_norm": 10.760134545827606, + "learning_rate": 1.9782135562588693e-05, + "loss": 0.2386, + "step": 3024 + }, + { + "epoch": 1.2301748678324522, + "grad_norm": 6.724378209343412, + "learning_rate": 1.978192517334205e-05, + "loss": 0.1766, + "step": 3025 + }, + { + "epoch": 1.230581537210248, + "grad_norm": 12.583230817067301, + "learning_rate": 1.9781714683679017e-05, + "loss": 0.3453, + "step": 3026 + }, + { + "epoch": 1.230988206588044, + "grad_norm": 6.986085136382818, + "learning_rate": 1.9781504093601763e-05, + "loss": 0.2902, + "step": 3027 + }, + { + "epoch": 1.2313948759658397, + "grad_norm": 8.33844666035487, + "learning_rate": 1.9781293403112443e-05, + "loss": 0.3338, + "step": 3028 + }, + { + "epoch": 1.2318015453436355, + "grad_norm": 15.112597682189438, + "learning_rate": 1.9781082612213226e-05, + "loss": 0.5348, + "step": 3029 + }, + { + "epoch": 1.2322082147214315, + "grad_norm": 18.299992797074474, + "learning_rate": 1.9780871720906275e-05, + "loss": 1.3259, + "step": 3030 + }, + { + "epoch": 1.2326148840992273, + "grad_norm": 16.84630460757215, + "learning_rate": 1.978066072919375e-05, + "loss": 0.5791, + "step": 3031 + }, + { + "epoch": 1.2330215534770232, + "grad_norm": 9.718732004938609, + "learning_rate": 1.9780449637077822e-05, + "loss": 0.2161, + "step": 3032 + }, + { + "epoch": 1.233428222854819, + "grad_norm": 21.66897522662391, + "learning_rate": 1.9780238444560656e-05, + "loss": 1.0683, + "step": 3033 + }, + { + "epoch": 1.2338348922326148, + "grad_norm": 2.1654210586817633, + "learning_rate": 1.9780027151644418e-05, + "loss": 0.0644, + "step": 3034 + }, + { + "epoch": 1.2342415616104108, + "grad_norm": 13.968139334899364, + "learning_rate": 1.977981575833128e-05, + "loss": 0.8868, + "step": 3035 + }, + { + "epoch": 1.2346482309882065, + "grad_norm": 6.805260081787598, + "learning_rate": 1.977960426462341e-05, + "loss": 0.2161, + "step": 3036 + }, + { + "epoch": 1.2350549003660025, + "grad_norm": 16.358731148545985, + "learning_rate": 1.9779392670522984e-05, + "loss": 0.5681, + "step": 3037 + }, + { + "epoch": 1.2354615697437983, + "grad_norm": 13.636186696951205, + "learning_rate": 1.977918097603217e-05, + "loss": 0.6009, + "step": 3038 + }, + { + "epoch": 1.235868239121594, + "grad_norm": 16.00898344461427, + "learning_rate": 1.9778969181153137e-05, + "loss": 0.8254, + "step": 3039 + }, + { + "epoch": 1.23627490849939, + "grad_norm": 3.905987060529981, + "learning_rate": 1.9778757285888067e-05, + "loss": 0.0461, + "step": 3040 + }, + { + "epoch": 1.2366815778771858, + "grad_norm": 32.34534924729732, + "learning_rate": 1.977854529023913e-05, + "loss": 0.8955, + "step": 3041 + }, + { + "epoch": 1.2370882472549818, + "grad_norm": 14.700540474308927, + "learning_rate": 1.9778333194208507e-05, + "loss": 1.0175, + "step": 3042 + }, + { + "epoch": 1.2374949166327776, + "grad_norm": 15.982171762600887, + "learning_rate": 1.9778120997798372e-05, + "loss": 0.7585, + "step": 3043 + }, + { + "epoch": 1.2379015860105733, + "grad_norm": 6.558596934214962, + "learning_rate": 1.9777908701010902e-05, + "loss": 0.2463, + "step": 3044 + }, + { + 
"epoch": 1.2383082553883693, + "grad_norm": 9.895767876529876, + "learning_rate": 1.977769630384828e-05, + "loss": 0.1617, + "step": 3045 + }, + { + "epoch": 1.238714924766165, + "grad_norm": 10.343345660612913, + "learning_rate": 1.9777483806312684e-05, + "loss": 0.2483, + "step": 3046 + }, + { + "epoch": 1.239121594143961, + "grad_norm": 11.233165705642035, + "learning_rate": 1.9777271208406296e-05, + "loss": 0.2667, + "step": 3047 + }, + { + "epoch": 1.2395282635217568, + "grad_norm": 10.444350205421065, + "learning_rate": 1.9777058510131294e-05, + "loss": 0.5534, + "step": 3048 + }, + { + "epoch": 1.2399349328995526, + "grad_norm": 9.025241869361182, + "learning_rate": 1.977684571148987e-05, + "loss": 0.1975, + "step": 3049 + }, + { + "epoch": 1.2403416022773486, + "grad_norm": 8.08341644916684, + "learning_rate": 1.9776632812484205e-05, + "loss": 0.2327, + "step": 3050 + }, + { + "epoch": 1.2407482716551443, + "grad_norm": 2.8059222042071235, + "learning_rate": 1.977641981311648e-05, + "loss": 0.0462, + "step": 3051 + }, + { + "epoch": 1.2411549410329403, + "grad_norm": 7.827179501765463, + "learning_rate": 1.9776206713388887e-05, + "loss": 0.27, + "step": 3052 + }, + { + "epoch": 1.241561610410736, + "grad_norm": 10.122190315944804, + "learning_rate": 1.9775993513303612e-05, + "loss": 0.552, + "step": 3053 + }, + { + "epoch": 1.2419682797885319, + "grad_norm": 5.797679172608584, + "learning_rate": 1.9775780212862842e-05, + "loss": 0.0956, + "step": 3054 + }, + { + "epoch": 1.2423749491663278, + "grad_norm": 6.409565514264635, + "learning_rate": 1.977556681206877e-05, + "loss": 0.3156, + "step": 3055 + }, + { + "epoch": 1.2427816185441236, + "grad_norm": 5.77984549684596, + "learning_rate": 1.9775353310923588e-05, + "loss": 0.1193, + "step": 3056 + }, + { + "epoch": 1.2431882879219196, + "grad_norm": 22.0377774485133, + "learning_rate": 1.9775139709429477e-05, + "loss": 1.0442, + "step": 3057 + }, + { + "epoch": 1.2435949572997154, + "grad_norm": 14.157595809104574, + "learning_rate": 1.9774926007588643e-05, + "loss": 0.501, + "step": 3058 + }, + { + "epoch": 1.2440016266775111, + "grad_norm": 11.186605716026637, + "learning_rate": 1.977471220540327e-05, + "loss": 0.3059, + "step": 3059 + }, + { + "epoch": 1.2444082960553071, + "grad_norm": 19.045973724762533, + "learning_rate": 1.9774498302875556e-05, + "loss": 0.6572, + "step": 3060 + }, + { + "epoch": 1.2448149654331029, + "grad_norm": 15.4151153353223, + "learning_rate": 1.9774284300007704e-05, + "loss": 0.7648, + "step": 3061 + }, + { + "epoch": 1.2452216348108986, + "grad_norm": 6.748562234998511, + "learning_rate": 1.97740701968019e-05, + "loss": 0.157, + "step": 3062 + }, + { + "epoch": 1.2456283041886946, + "grad_norm": 2.3672716978458777, + "learning_rate": 1.9773855993260345e-05, + "loss": 0.0407, + "step": 3063 + }, + { + "epoch": 1.2460349735664904, + "grad_norm": 13.047779492420522, + "learning_rate": 1.977364168938524e-05, + "loss": 0.3649, + "step": 3064 + }, + { + "epoch": 1.2464416429442864, + "grad_norm": 13.554226026697823, + "learning_rate": 1.977342728517879e-05, + "loss": 0.4405, + "step": 3065 + }, + { + "epoch": 1.2468483123220822, + "grad_norm": 17.111106314820013, + "learning_rate": 1.9773212780643184e-05, + "loss": 1.1241, + "step": 3066 + }, + { + "epoch": 1.247254981699878, + "grad_norm": 7.335861967100888, + "learning_rate": 1.9772998175780627e-05, + "loss": 0.1058, + "step": 3067 + }, + { + "epoch": 1.247661651077674, + "grad_norm": 11.544089645215672, + "learning_rate": 1.977278347059333e-05, + "loss": 
0.232, + "step": 3068 + }, + { + "epoch": 1.2480683204554697, + "grad_norm": 7.762952549617385, + "learning_rate": 1.977256866508349e-05, + "loss": 0.0714, + "step": 3069 + }, + { + "epoch": 1.2484749898332654, + "grad_norm": 1.3739812145222157, + "learning_rate": 1.977235375925332e-05, + "loss": 0.0192, + "step": 3070 + }, + { + "epoch": 1.2488816592110614, + "grad_norm": 10.790791792425287, + "learning_rate": 1.9772138753105014e-05, + "loss": 0.307, + "step": 3071 + }, + { + "epoch": 1.2492883285888572, + "grad_norm": 3.5647070455005503, + "learning_rate": 1.9771923646640784e-05, + "loss": 0.0655, + "step": 3072 + }, + { + "epoch": 1.2496949979666532, + "grad_norm": 16.550888692812304, + "learning_rate": 1.9771708439862845e-05, + "loss": 0.56, + "step": 3073 + }, + { + "epoch": 1.250101667344449, + "grad_norm": 33.07502669026436, + "learning_rate": 1.97714931327734e-05, + "loss": 1.0793, + "step": 3074 + }, + { + "epoch": 1.2505083367222447, + "grad_norm": 15.391013230229474, + "learning_rate": 1.9771277725374657e-05, + "loss": 0.3113, + "step": 3075 + }, + { + "epoch": 1.2509150061000407, + "grad_norm": 13.620685325304018, + "learning_rate": 1.9771062217668833e-05, + "loss": 0.612, + "step": 3076 + }, + { + "epoch": 1.2513216754778365, + "grad_norm": 8.913803969988166, + "learning_rate": 1.9770846609658133e-05, + "loss": 0.3183, + "step": 3077 + }, + { + "epoch": 1.2517283448556324, + "grad_norm": 8.267380115843999, + "learning_rate": 1.977063090134478e-05, + "loss": 0.3397, + "step": 3078 + }, + { + "epoch": 1.2521350142334282, + "grad_norm": 20.27745712642099, + "learning_rate": 1.9770415092730982e-05, + "loss": 0.9488, + "step": 3079 + }, + { + "epoch": 1.252541683611224, + "grad_norm": 3.014990405536576, + "learning_rate": 1.977019918381896e-05, + "loss": 0.1044, + "step": 3080 + }, + { + "epoch": 1.25294835298902, + "grad_norm": 13.358631433619319, + "learning_rate": 1.9769983174610918e-05, + "loss": 0.4989, + "step": 3081 + }, + { + "epoch": 1.2533550223668157, + "grad_norm": 12.126542686614224, + "learning_rate": 1.9769767065109086e-05, + "loss": 0.3812, + "step": 3082 + }, + { + "epoch": 1.2537616917446117, + "grad_norm": 10.474793993419912, + "learning_rate": 1.976955085531568e-05, + "loss": 0.6542, + "step": 3083 + }, + { + "epoch": 1.2541683611224075, + "grad_norm": 19.077937325969014, + "learning_rate": 1.9769334545232913e-05, + "loss": 0.9126, + "step": 3084 + }, + { + "epoch": 1.2545750305002032, + "grad_norm": 1.7634319144975046, + "learning_rate": 1.9769118134863013e-05, + "loss": 0.0299, + "step": 3085 + }, + { + "epoch": 1.2549816998779992, + "grad_norm": 12.43883724457917, + "learning_rate": 1.9768901624208197e-05, + "loss": 0.4485, + "step": 3086 + }, + { + "epoch": 1.255388369255795, + "grad_norm": 1.9764228245594462, + "learning_rate": 1.976868501327069e-05, + "loss": 0.0364, + "step": 3087 + }, + { + "epoch": 1.255795038633591, + "grad_norm": 2.2453304760365915, + "learning_rate": 1.9768468302052716e-05, + "loss": 0.0349, + "step": 3088 + }, + { + "epoch": 1.2562017080113868, + "grad_norm": 1.482677353685164, + "learning_rate": 1.9768251490556497e-05, + "loss": 0.022, + "step": 3089 + }, + { + "epoch": 1.2566083773891825, + "grad_norm": 14.57703512354668, + "learning_rate": 1.9768034578784265e-05, + "loss": 0.9421, + "step": 3090 + }, + { + "epoch": 1.2570150467669785, + "grad_norm": 7.971201832795152, + "learning_rate": 1.9767817566738236e-05, + "loss": 0.1926, + "step": 3091 + }, + { + "epoch": 1.2574217161447743, + "grad_norm": 8.85575980192783, + 
"learning_rate": 1.9767600454420644e-05, + "loss": 0.1926, + "step": 3092 + }, + { + "epoch": 1.2578283855225703, + "grad_norm": 12.975159893389785, + "learning_rate": 1.976738324183372e-05, + "loss": 0.5482, + "step": 3093 + }, + { + "epoch": 1.258235054900366, + "grad_norm": 9.929553465272473, + "learning_rate": 1.976716592897969e-05, + "loss": 0.2924, + "step": 3094 + }, + { + "epoch": 1.2586417242781618, + "grad_norm": 7.142871211112061, + "learning_rate": 1.9766948515860785e-05, + "loss": 0.2126, + "step": 3095 + }, + { + "epoch": 1.2590483936559578, + "grad_norm": 8.689447040028815, + "learning_rate": 1.9766731002479238e-05, + "loss": 0.1981, + "step": 3096 + }, + { + "epoch": 1.2594550630337535, + "grad_norm": 7.93906074964331, + "learning_rate": 1.976651338883728e-05, + "loss": 0.1132, + "step": 3097 + }, + { + "epoch": 1.2598617324115495, + "grad_norm": 10.065545286745564, + "learning_rate": 1.9766295674937154e-05, + "loss": 0.2036, + "step": 3098 + }, + { + "epoch": 1.2602684017893453, + "grad_norm": 4.326744837309469, + "learning_rate": 1.9766077860781082e-05, + "loss": 0.0908, + "step": 3099 + }, + { + "epoch": 1.260675071167141, + "grad_norm": 7.673320091359741, + "learning_rate": 1.9765859946371305e-05, + "loss": 0.2984, + "step": 3100 + }, + { + "epoch": 1.261081740544937, + "grad_norm": 12.059620350526853, + "learning_rate": 1.9765641931710063e-05, + "loss": 0.5809, + "step": 3101 + }, + { + "epoch": 1.2614884099227328, + "grad_norm": 13.93002444251955, + "learning_rate": 1.9765423816799592e-05, + "loss": 0.525, + "step": 3102 + }, + { + "epoch": 1.2618950793005288, + "grad_norm": 4.501605083156297, + "learning_rate": 1.9765205601642128e-05, + "loss": 0.2145, + "step": 3103 + }, + { + "epoch": 1.2623017486783246, + "grad_norm": 17.045366697778608, + "learning_rate": 1.9764987286239913e-05, + "loss": 0.5809, + "step": 3104 + }, + { + "epoch": 1.2627084180561203, + "grad_norm": 12.014922533146569, + "learning_rate": 1.976476887059519e-05, + "loss": 0.5877, + "step": 3105 + }, + { + "epoch": 1.263115087433916, + "grad_norm": 8.57531921869883, + "learning_rate": 1.9764550354710202e-05, + "loss": 0.1885, + "step": 3106 + }, + { + "epoch": 1.263521756811712, + "grad_norm": 7.84822303210292, + "learning_rate": 1.976433173858719e-05, + "loss": 0.2505, + "step": 3107 + }, + { + "epoch": 1.2639284261895078, + "grad_norm": 7.6024379032892195, + "learning_rate": 1.97641130222284e-05, + "loss": 0.2505, + "step": 3108 + }, + { + "epoch": 1.2643350955673038, + "grad_norm": 10.56951888453978, + "learning_rate": 1.976389420563607e-05, + "loss": 0.3655, + "step": 3109 + }, + { + "epoch": 1.2647417649450996, + "grad_norm": 9.787235469021063, + "learning_rate": 1.9763675288812458e-05, + "loss": 0.2042, + "step": 3110 + }, + { + "epoch": 1.2651484343228954, + "grad_norm": 28.96286296077855, + "learning_rate": 1.9763456271759802e-05, + "loss": 0.5846, + "step": 3111 + }, + { + "epoch": 1.2655551037006914, + "grad_norm": 19.494093593922493, + "learning_rate": 1.9763237154480353e-05, + "loss": 1.1553, + "step": 3112 + }, + { + "epoch": 1.2659617730784871, + "grad_norm": 2.785535044421538, + "learning_rate": 1.9763017936976364e-05, + "loss": 0.0518, + "step": 3113 + }, + { + "epoch": 1.266368442456283, + "grad_norm": 1.8440204188337983, + "learning_rate": 1.9762798619250078e-05, + "loss": 0.0256, + "step": 3114 + }, + { + "epoch": 1.2667751118340789, + "grad_norm": 17.341955203761987, + "learning_rate": 1.976257920130375e-05, + "loss": 0.5238, + "step": 3115 + }, + { + "epoch": 1.2671817812118746, 
+ "grad_norm": 15.174049112064973, + "learning_rate": 1.9762359683139638e-05, + "loss": 0.2693, + "step": 3116 + }, + { + "epoch": 1.2675884505896706, + "grad_norm": 8.219915831418202, + "learning_rate": 1.976214006475999e-05, + "loss": 0.2902, + "step": 3117 + }, + { + "epoch": 1.2679951199674664, + "grad_norm": 8.628430732661537, + "learning_rate": 1.9761920346167057e-05, + "loss": 0.1955, + "step": 3118 + }, + { + "epoch": 1.2684017893452624, + "grad_norm": 4.181889172796168, + "learning_rate": 1.9761700527363103e-05, + "loss": 0.0878, + "step": 3119 + }, + { + "epoch": 1.2688084587230581, + "grad_norm": 8.445301908755257, + "learning_rate": 1.9761480608350378e-05, + "loss": 0.3669, + "step": 3120 + }, + { + "epoch": 1.269215128100854, + "grad_norm": 17.059945892364713, + "learning_rate": 1.9761260589131145e-05, + "loss": 0.751, + "step": 3121 + }, + { + "epoch": 1.26962179747865, + "grad_norm": 5.992046787771875, + "learning_rate": 1.9761040469707656e-05, + "loss": 0.1703, + "step": 3122 + }, + { + "epoch": 1.2700284668564457, + "grad_norm": 14.86496533052779, + "learning_rate": 1.976082025008217e-05, + "loss": 0.3866, + "step": 3123 + }, + { + "epoch": 1.2704351362342416, + "grad_norm": 0.22391131887817964, + "learning_rate": 1.976059993025696e-05, + "loss": 0.0031, + "step": 3124 + }, + { + "epoch": 1.2708418056120374, + "grad_norm": 4.267005529442631, + "learning_rate": 1.9760379510234274e-05, + "loss": 0.1854, + "step": 3125 + }, + { + "epoch": 1.2712484749898332, + "grad_norm": 8.380420322992638, + "learning_rate": 1.976015899001638e-05, + "loss": 0.3387, + "step": 3126 + }, + { + "epoch": 1.2716551443676292, + "grad_norm": 2.1671073739401727, + "learning_rate": 1.975993836960554e-05, + "loss": 0.0344, + "step": 3127 + }, + { + "epoch": 1.272061813745425, + "grad_norm": 10.49935871054114, + "learning_rate": 1.9759717649004027e-05, + "loss": 0.2952, + "step": 3128 + }, + { + "epoch": 1.272468483123221, + "grad_norm": 8.874491255302397, + "learning_rate": 1.9759496828214096e-05, + "loss": 0.2733, + "step": 3129 + }, + { + "epoch": 1.2728751525010167, + "grad_norm": 11.346237301872167, + "learning_rate": 1.975927590723802e-05, + "loss": 0.2689, + "step": 3130 + }, + { + "epoch": 1.2732818218788124, + "grad_norm": 8.564166568314063, + "learning_rate": 1.9759054886078065e-05, + "loss": 0.1519, + "step": 3131 + }, + { + "epoch": 1.2736884912566084, + "grad_norm": 0.38806249335909254, + "learning_rate": 1.9758833764736498e-05, + "loss": 0.0055, + "step": 3132 + }, + { + "epoch": 1.2740951606344042, + "grad_norm": 0.352690011724305, + "learning_rate": 1.9758612543215592e-05, + "loss": 0.004, + "step": 3133 + }, + { + "epoch": 1.2745018300122002, + "grad_norm": 4.25569172497689, + "learning_rate": 1.9758391221517617e-05, + "loss": 0.1408, + "step": 3134 + }, + { + "epoch": 1.274908499389996, + "grad_norm": 11.426347716545571, + "learning_rate": 1.9758169799644844e-05, + "loss": 0.3047, + "step": 3135 + }, + { + "epoch": 1.2753151687677917, + "grad_norm": 8.203956237341231, + "learning_rate": 1.9757948277599545e-05, + "loss": 0.2738, + "step": 3136 + }, + { + "epoch": 1.2757218381455877, + "grad_norm": 1.131392111219215, + "learning_rate": 1.9757726655384e-05, + "loss": 0.0142, + "step": 3137 + }, + { + "epoch": 1.2761285075233835, + "grad_norm": 15.27151578194257, + "learning_rate": 1.975750493300048e-05, + "loss": 0.525, + "step": 3138 + }, + { + "epoch": 1.2765351769011795, + "grad_norm": 6.054423508657131, + "learning_rate": 1.9757283110451258e-05, + "loss": 0.2442, + "step": 3139 + }, 
+ { + "epoch": 1.2769418462789752, + "grad_norm": 9.466122808132171, + "learning_rate": 1.9757061187738616e-05, + "loss": 0.2329, + "step": 3140 + }, + { + "epoch": 1.277348515656771, + "grad_norm": 12.115535443390824, + "learning_rate": 1.975683916486483e-05, + "loss": 0.5373, + "step": 3141 + }, + { + "epoch": 1.277755185034567, + "grad_norm": 2.385728648622126, + "learning_rate": 1.975661704183218e-05, + "loss": 0.0211, + "step": 3142 + }, + { + "epoch": 1.2781618544123627, + "grad_norm": 25.091364910153022, + "learning_rate": 1.975639481864294e-05, + "loss": 0.7802, + "step": 3143 + }, + { + "epoch": 1.2785685237901587, + "grad_norm": 11.947642562218306, + "learning_rate": 1.9756172495299405e-05, + "loss": 0.2168, + "step": 3144 + }, + { + "epoch": 1.2789751931679545, + "grad_norm": 9.411724622035814, + "learning_rate": 1.9755950071803845e-05, + "loss": 0.5799, + "step": 3145 + }, + { + "epoch": 1.2793818625457503, + "grad_norm": 6.717057232722267, + "learning_rate": 1.975572754815855e-05, + "loss": 0.1478, + "step": 3146 + }, + { + "epoch": 1.279788531923546, + "grad_norm": 16.84452209479042, + "learning_rate": 1.9755504924365797e-05, + "loss": 0.8417, + "step": 3147 + }, + { + "epoch": 1.280195201301342, + "grad_norm": 3.2568045280286317, + "learning_rate": 1.975528220042788e-05, + "loss": 0.073, + "step": 3148 + }, + { + "epoch": 1.280601870679138, + "grad_norm": 107.12314360427808, + "learning_rate": 1.975505937634708e-05, + "loss": 0.5409, + "step": 3149 + }, + { + "epoch": 1.2810085400569338, + "grad_norm": 16.22790740410557, + "learning_rate": 1.9754836452125687e-05, + "loss": 0.8209, + "step": 3150 + }, + { + "epoch": 1.2814152094347295, + "grad_norm": 8.034692937359514, + "learning_rate": 1.9754613427765988e-05, + "loss": 0.1642, + "step": 3151 + }, + { + "epoch": 1.2818218788125253, + "grad_norm": 13.79555205893699, + "learning_rate": 1.9754390303270275e-05, + "loss": 0.7291, + "step": 3152 + }, + { + "epoch": 1.2822285481903213, + "grad_norm": 1.0198805001021822, + "learning_rate": 1.975416707864083e-05, + "loss": 0.0117, + "step": 3153 + }, + { + "epoch": 1.282635217568117, + "grad_norm": 6.623236365895915, + "learning_rate": 1.9753943753879955e-05, + "loss": 0.1228, + "step": 3154 + }, + { + "epoch": 1.283041886945913, + "grad_norm": 6.374298249114321, + "learning_rate": 1.975372032898994e-05, + "loss": 0.096, + "step": 3155 + }, + { + "epoch": 1.2834485563237088, + "grad_norm": 9.582369382423005, + "learning_rate": 1.9753496803973074e-05, + "loss": 0.2884, + "step": 3156 + }, + { + "epoch": 1.2838552257015046, + "grad_norm": 11.203299583720067, + "learning_rate": 1.9753273178831656e-05, + "loss": 0.1857, + "step": 3157 + }, + { + "epoch": 1.2842618950793006, + "grad_norm": 2.9753885155841515, + "learning_rate": 1.975304945356798e-05, + "loss": 0.0377, + "step": 3158 + }, + { + "epoch": 1.2846685644570963, + "grad_norm": 1.1909259200295017, + "learning_rate": 1.9752825628184343e-05, + "loss": 0.0076, + "step": 3159 + }, + { + "epoch": 1.2850752338348923, + "grad_norm": 1.3837493417490316, + "learning_rate": 1.975260170268304e-05, + "loss": 0.0192, + "step": 3160 + }, + { + "epoch": 1.285481903212688, + "grad_norm": 1.9250808910144093, + "learning_rate": 1.9752377677066375e-05, + "loss": 0.0274, + "step": 3161 + }, + { + "epoch": 1.2858885725904838, + "grad_norm": 10.125284604499518, + "learning_rate": 1.975215355133664e-05, + "loss": 0.4227, + "step": 3162 + }, + { + "epoch": 1.2862952419682798, + "grad_norm": 9.886857161896325, + "learning_rate": 1.9751929325496146e-05, + 
"loss": 0.4028, + "step": 3163 + }, + { + "epoch": 1.2867019113460756, + "grad_norm": 10.789695432178169, + "learning_rate": 1.9751704999547186e-05, + "loss": 0.2634, + "step": 3164 + }, + { + "epoch": 1.2871085807238716, + "grad_norm": 16.61872280257784, + "learning_rate": 1.9751480573492067e-05, + "loss": 0.707, + "step": 3165 + }, + { + "epoch": 1.2875152501016673, + "grad_norm": 2.642654328505693, + "learning_rate": 1.9751256047333097e-05, + "loss": 0.0232, + "step": 3166 + }, + { + "epoch": 1.287921919479463, + "grad_norm": 14.638289045051186, + "learning_rate": 1.975103142107257e-05, + "loss": 0.6532, + "step": 3167 + }, + { + "epoch": 1.288328588857259, + "grad_norm": 24.234128146433083, + "learning_rate": 1.9750806694712802e-05, + "loss": 1.7973, + "step": 3168 + }, + { + "epoch": 1.2887352582350549, + "grad_norm": 0.8408111536659942, + "learning_rate": 1.975058186825609e-05, + "loss": 0.0153, + "step": 3169 + }, + { + "epoch": 1.2891419276128508, + "grad_norm": 18.327458363838137, + "learning_rate": 1.9750356941704752e-05, + "loss": 0.4258, + "step": 3170 + }, + { + "epoch": 1.2895485969906466, + "grad_norm": 19.05396210416603, + "learning_rate": 1.9750131915061095e-05, + "loss": 0.7097, + "step": 3171 + }, + { + "epoch": 1.2899552663684424, + "grad_norm": 10.533339594592315, + "learning_rate": 1.9749906788327423e-05, + "loss": 0.2967, + "step": 3172 + }, + { + "epoch": 1.2903619357462384, + "grad_norm": 14.451140800783069, + "learning_rate": 1.974968156150605e-05, + "loss": 0.6001, + "step": 3173 + }, + { + "epoch": 1.2907686051240341, + "grad_norm": 15.644878245031506, + "learning_rate": 1.974945623459929e-05, + "loss": 0.7738, + "step": 3174 + }, + { + "epoch": 1.2911752745018301, + "grad_norm": 12.342930661242972, + "learning_rate": 1.9749230807609455e-05, + "loss": 0.232, + "step": 3175 + }, + { + "epoch": 1.2915819438796259, + "grad_norm": 23.07496994414134, + "learning_rate": 1.974900528053886e-05, + "loss": 0.904, + "step": 3176 + }, + { + "epoch": 1.2919886132574216, + "grad_norm": 17.740857674264944, + "learning_rate": 1.974877965338982e-05, + "loss": 1.0315, + "step": 3177 + }, + { + "epoch": 1.2923952826352176, + "grad_norm": 7.434614610934621, + "learning_rate": 1.974855392616465e-05, + "loss": 0.1663, + "step": 3178 + }, + { + "epoch": 1.2928019520130134, + "grad_norm": 10.724163098716096, + "learning_rate": 1.9748328098865667e-05, + "loss": 0.3259, + "step": 3179 + }, + { + "epoch": 1.2932086213908094, + "grad_norm": 8.054316371584306, + "learning_rate": 1.9748102171495187e-05, + "loss": 0.2447, + "step": 3180 + }, + { + "epoch": 1.2936152907686052, + "grad_norm": 11.895661555514843, + "learning_rate": 1.974787614405554e-05, + "loss": 0.7943, + "step": 3181 + }, + { + "epoch": 1.294021960146401, + "grad_norm": 8.70302964082077, + "learning_rate": 1.974765001654903e-05, + "loss": 0.566, + "step": 3182 + }, + { + "epoch": 1.294428629524197, + "grad_norm": 14.327889307646114, + "learning_rate": 1.974742378897799e-05, + "loss": 0.1654, + "step": 3183 + }, + { + "epoch": 1.2948352989019927, + "grad_norm": 8.369055510191615, + "learning_rate": 1.974719746134474e-05, + "loss": 0.1861, + "step": 3184 + }, + { + "epoch": 1.2952419682797887, + "grad_norm": 13.930836760115339, + "learning_rate": 1.9746971033651602e-05, + "loss": 0.5489, + "step": 3185 + }, + { + "epoch": 1.2956486376575844, + "grad_norm": 10.334049293187627, + "learning_rate": 1.9746744505900896e-05, + "loss": 0.2534, + "step": 3186 + }, + { + "epoch": 1.2960553070353802, + "grad_norm": 8.140351405678366, + 
"learning_rate": 1.9746517878094958e-05, + "loss": 0.304, + "step": 3187 + }, + { + "epoch": 1.296461976413176, + "grad_norm": 2.896181082810832, + "learning_rate": 1.9746291150236106e-05, + "loss": 0.2281, + "step": 3188 + }, + { + "epoch": 1.296868645790972, + "grad_norm": 18.512310137713254, + "learning_rate": 1.9746064322326674e-05, + "loss": 1.1081, + "step": 3189 + }, + { + "epoch": 1.297275315168768, + "grad_norm": 9.75221036400884, + "learning_rate": 1.974583739436898e-05, + "loss": 0.4237, + "step": 3190 + }, + { + "epoch": 1.2976819845465637, + "grad_norm": 2.9711399705912354, + "learning_rate": 1.9745610366365365e-05, + "loss": 0.0988, + "step": 3191 + }, + { + "epoch": 1.2980886539243595, + "grad_norm": 7.410290497767562, + "learning_rate": 1.9745383238318153e-05, + "loss": 0.1074, + "step": 3192 + }, + { + "epoch": 1.2984953233021552, + "grad_norm": 6.950305352355604, + "learning_rate": 1.974515601022968e-05, + "loss": 0.1322, + "step": 3193 + }, + { + "epoch": 1.2989019926799512, + "grad_norm": 3.458803121638674, + "learning_rate": 1.974492868210227e-05, + "loss": 0.0621, + "step": 3194 + }, + { + "epoch": 1.299308662057747, + "grad_norm": 9.392731773449308, + "learning_rate": 1.974470125393827e-05, + "loss": 0.2417, + "step": 3195 + }, + { + "epoch": 1.299715331435543, + "grad_norm": 3.2035796025772285, + "learning_rate": 1.974447372574e-05, + "loss": 0.0451, + "step": 3196 + }, + { + "epoch": 1.3001220008133387, + "grad_norm": 29.595627013220454, + "learning_rate": 1.974424609750981e-05, + "loss": 0.7763, + "step": 3197 + }, + { + "epoch": 1.3005286701911345, + "grad_norm": 13.069277031606198, + "learning_rate": 1.9744018369250026e-05, + "loss": 0.4701, + "step": 3198 + }, + { + "epoch": 1.3009353395689305, + "grad_norm": 13.608961290669335, + "learning_rate": 1.9743790540962987e-05, + "loss": 0.5119, + "step": 3199 + }, + { + "epoch": 1.3013420089467262, + "grad_norm": 8.791256302268202, + "learning_rate": 1.9743562612651038e-05, + "loss": 0.5066, + "step": 3200 + }, + { + "epoch": 1.3017486783245222, + "grad_norm": 13.018179595856958, + "learning_rate": 1.974333458431651e-05, + "loss": 0.4728, + "step": 3201 + }, + { + "epoch": 1.302155347702318, + "grad_norm": 14.275793432154739, + "learning_rate": 1.9743106455961753e-05, + "loss": 0.549, + "step": 3202 + }, + { + "epoch": 1.3025620170801138, + "grad_norm": 7.0726587431341885, + "learning_rate": 1.9742878227589107e-05, + "loss": 0.2852, + "step": 3203 + }, + { + "epoch": 1.3029686864579098, + "grad_norm": 7.095565535096404, + "learning_rate": 1.9742649899200907e-05, + "loss": 0.3187, + "step": 3204 + }, + { + "epoch": 1.3033753558357055, + "grad_norm": 16.22492602398553, + "learning_rate": 1.9742421470799506e-05, + "loss": 0.5869, + "step": 3205 + }, + { + "epoch": 1.3037820252135015, + "grad_norm": 14.505895310256351, + "learning_rate": 1.9742192942387244e-05, + "loss": 0.3725, + "step": 3206 + }, + { + "epoch": 1.3041886945912973, + "grad_norm": 4.962346474664175, + "learning_rate": 1.974196431396647e-05, + "loss": 0.1607, + "step": 3207 + }, + { + "epoch": 1.304595363969093, + "grad_norm": 3.110988450344131, + "learning_rate": 1.9741735585539524e-05, + "loss": 0.047, + "step": 3208 + }, + { + "epoch": 1.305002033346889, + "grad_norm": 11.108352171455701, + "learning_rate": 1.9741506757108765e-05, + "loss": 0.3481, + "step": 3209 + }, + { + "epoch": 1.3054087027246848, + "grad_norm": 1.9751399639399747, + "learning_rate": 1.9741277828676538e-05, + "loss": 0.0542, + "step": 3210 + }, + { + "epoch": 1.3058153721024808, + 
"grad_norm": 2.378675853103126, + "learning_rate": 1.9741048800245183e-05, + "loss": 0.0312, + "step": 3211 + }, + { + "epoch": 1.3062220414802765, + "grad_norm": 3.634770901592633, + "learning_rate": 1.9740819671817065e-05, + "loss": 0.0959, + "step": 3212 + }, + { + "epoch": 1.3066287108580723, + "grad_norm": 23.056497385984947, + "learning_rate": 1.9740590443394532e-05, + "loss": 1.2949, + "step": 3213 + }, + { + "epoch": 1.3070353802358683, + "grad_norm": 27.26337192839624, + "learning_rate": 1.9740361114979933e-05, + "loss": 0.5349, + "step": 3214 + }, + { + "epoch": 1.307442049613664, + "grad_norm": 3.8527161015570863, + "learning_rate": 1.9740131686575626e-05, + "loss": 0.043, + "step": 3215 + }, + { + "epoch": 1.30784871899146, + "grad_norm": 12.36313745961557, + "learning_rate": 1.9739902158183965e-05, + "loss": 0.5291, + "step": 3216 + }, + { + "epoch": 1.3082553883692558, + "grad_norm": 9.01600628411553, + "learning_rate": 1.9739672529807304e-05, + "loss": 0.2279, + "step": 3217 + }, + { + "epoch": 1.3086620577470516, + "grad_norm": 12.308961757762493, + "learning_rate": 1.9739442801448007e-05, + "loss": 0.2254, + "step": 3218 + }, + { + "epoch": 1.3090687271248476, + "grad_norm": 9.644733697260127, + "learning_rate": 1.9739212973108426e-05, + "loss": 0.3421, + "step": 3219 + }, + { + "epoch": 1.3094753965026433, + "grad_norm": 3.1770422755281054, + "learning_rate": 1.973898304479092e-05, + "loss": 0.0599, + "step": 3220 + }, + { + "epoch": 1.3098820658804393, + "grad_norm": 20.38093142572123, + "learning_rate": 1.9738753016497853e-05, + "loss": 0.1544, + "step": 3221 + }, + { + "epoch": 1.310288735258235, + "grad_norm": 23.703947571630774, + "learning_rate": 1.9738522888231587e-05, + "loss": 0.3469, + "step": 3222 + }, + { + "epoch": 1.3106954046360308, + "grad_norm": 4.743738769918483, + "learning_rate": 1.973829265999448e-05, + "loss": 0.1903, + "step": 3223 + }, + { + "epoch": 1.3111020740138268, + "grad_norm": 21.5828340902138, + "learning_rate": 1.97380623317889e-05, + "loss": 0.7451, + "step": 3224 + }, + { + "epoch": 1.3115087433916226, + "grad_norm": 23.401010865031953, + "learning_rate": 1.9737831903617205e-05, + "loss": 0.91, + "step": 3225 + }, + { + "epoch": 1.3119154127694186, + "grad_norm": 0.28767646158446775, + "learning_rate": 1.973760137548177e-05, + "loss": 0.0036, + "step": 3226 + }, + { + "epoch": 1.3123220821472144, + "grad_norm": 0.27884303389218823, + "learning_rate": 1.973737074738495e-05, + "loss": 0.0028, + "step": 3227 + }, + { + "epoch": 1.3127287515250101, + "grad_norm": 11.02024289591497, + "learning_rate": 1.9737140019329123e-05, + "loss": 0.3398, + "step": 3228 + }, + { + "epoch": 1.3131354209028059, + "grad_norm": 9.375234304172457, + "learning_rate": 1.973690919131665e-05, + "loss": 0.2559, + "step": 3229 + }, + { + "epoch": 1.3135420902806019, + "grad_norm": 11.043149053142503, + "learning_rate": 1.9736678263349906e-05, + "loss": 0.5247, + "step": 3230 + }, + { + "epoch": 1.3139487596583979, + "grad_norm": 41.27284976120871, + "learning_rate": 1.9736447235431257e-05, + "loss": 0.8913, + "step": 3231 + }, + { + "epoch": 1.3143554290361936, + "grad_norm": 15.515741295691702, + "learning_rate": 1.973621610756308e-05, + "loss": 0.8003, + "step": 3232 + }, + { + "epoch": 1.3147620984139894, + "grad_norm": 12.555547322915647, + "learning_rate": 1.973598487974774e-05, + "loss": 0.5241, + "step": 3233 + }, + { + "epoch": 1.3151687677917852, + "grad_norm": 3.039830089146818, + "learning_rate": 1.973575355198762e-05, + "loss": 0.0438, + "step": 3234 + 
}, + { + "epoch": 1.3155754371695811, + "grad_norm": 8.571439871124658, + "learning_rate": 1.9735522124285088e-05, + "loss": 0.3223, + "step": 3235 + }, + { + "epoch": 1.315982106547377, + "grad_norm": 16.015456640824024, + "learning_rate": 1.973529059664252e-05, + "loss": 0.5419, + "step": 3236 + }, + { + "epoch": 1.316388775925173, + "grad_norm": 7.966241150598879, + "learning_rate": 1.9735058969062296e-05, + "loss": 0.3229, + "step": 3237 + }, + { + "epoch": 1.3167954453029687, + "grad_norm": 2.0757667025643634, + "learning_rate": 1.9734827241546793e-05, + "loss": 0.0291, + "step": 3238 + }, + { + "epoch": 1.3172021146807644, + "grad_norm": 6.065891209616799, + "learning_rate": 1.973459541409839e-05, + "loss": 0.061, + "step": 3239 + }, + { + "epoch": 1.3176087840585604, + "grad_norm": 36.9498231118321, + "learning_rate": 1.973436348671946e-05, + "loss": 0.2491, + "step": 3240 + }, + { + "epoch": 1.3180154534363562, + "grad_norm": 19.2776568185416, + "learning_rate": 1.9734131459412394e-05, + "loss": 0.8426, + "step": 3241 + }, + { + "epoch": 1.3184221228141522, + "grad_norm": 19.724519698625393, + "learning_rate": 1.973389933217957e-05, + "loss": 0.527, + "step": 3242 + }, + { + "epoch": 1.318828792191948, + "grad_norm": 12.091843314660087, + "learning_rate": 1.973366710502337e-05, + "loss": 0.6723, + "step": 3243 + }, + { + "epoch": 1.3192354615697437, + "grad_norm": 8.147094735078854, + "learning_rate": 1.9733434777946178e-05, + "loss": 0.1384, + "step": 3244 + }, + { + "epoch": 1.3196421309475397, + "grad_norm": 17.37149701359157, + "learning_rate": 1.9733202350950377e-05, + "loss": 0.5791, + "step": 3245 + }, + { + "epoch": 1.3200488003253354, + "grad_norm": 5.405221826571641, + "learning_rate": 1.973296982403836e-05, + "loss": 0.0897, + "step": 3246 + }, + { + "epoch": 1.3204554697031314, + "grad_norm": 10.521885727568616, + "learning_rate": 1.9732737197212502e-05, + "loss": 0.321, + "step": 3247 + }, + { + "epoch": 1.3208621390809272, + "grad_norm": 13.499691566723287, + "learning_rate": 1.9732504470475203e-05, + "loss": 0.3834, + "step": 3248 + }, + { + "epoch": 1.321268808458723, + "grad_norm": 2.366801649064574, + "learning_rate": 1.973227164382885e-05, + "loss": 0.051, + "step": 3249 + }, + { + "epoch": 1.321675477836519, + "grad_norm": 8.931290578583136, + "learning_rate": 1.9732038717275825e-05, + "loss": 0.5624, + "step": 3250 + }, + { + "epoch": 1.3220821472143147, + "grad_norm": 11.564033457600766, + "learning_rate": 1.9731805690818527e-05, + "loss": 0.5472, + "step": 3251 + }, + { + "epoch": 1.3224888165921107, + "grad_norm": 10.259771193454348, + "learning_rate": 1.9731572564459344e-05, + "loss": 0.1255, + "step": 3252 + }, + { + "epoch": 1.3228954859699065, + "grad_norm": 46.75343416663918, + "learning_rate": 1.9731339338200674e-05, + "loss": 1.5653, + "step": 3253 + }, + { + "epoch": 1.3233021553477022, + "grad_norm": 7.77974852566974, + "learning_rate": 1.9731106012044905e-05, + "loss": 0.1432, + "step": 3254 + }, + { + "epoch": 1.3237088247254982, + "grad_norm": 8.440589611943153, + "learning_rate": 1.973087258599444e-05, + "loss": 0.3266, + "step": 3255 + }, + { + "epoch": 1.324115494103294, + "grad_norm": 9.2444068439965, + "learning_rate": 1.9730639060051665e-05, + "loss": 0.2353, + "step": 3256 + }, + { + "epoch": 1.32452216348109, + "grad_norm": 26.157495375863615, + "learning_rate": 1.9730405434218986e-05, + "loss": 2.2637, + "step": 3257 + }, + { + "epoch": 1.3249288328588857, + "grad_norm": 22.677435118057847, + "learning_rate": 1.9730171708498795e-05, + 
"loss": 2.0109, + "step": 3258 + }, + { + "epoch": 1.3253355022366815, + "grad_norm": 23.774485684371374, + "learning_rate": 1.9729937882893497e-05, + "loss": 1.8158, + "step": 3259 + }, + { + "epoch": 1.3257421716144775, + "grad_norm": 7.244767403671954, + "learning_rate": 1.9729703957405487e-05, + "loss": 0.2592, + "step": 3260 + }, + { + "epoch": 1.3261488409922733, + "grad_norm": 12.634166391373851, + "learning_rate": 1.9729469932037172e-05, + "loss": 0.2373, + "step": 3261 + }, + { + "epoch": 1.3265555103700692, + "grad_norm": 9.847927392454027, + "learning_rate": 1.972923580679095e-05, + "loss": 0.2552, + "step": 3262 + }, + { + "epoch": 1.326962179747865, + "grad_norm": 7.526711265841876, + "learning_rate": 1.9729001581669224e-05, + "loss": 0.1503, + "step": 3263 + }, + { + "epoch": 1.3273688491256608, + "grad_norm": 5.016242140833432, + "learning_rate": 1.9728767256674405e-05, + "loss": 0.0967, + "step": 3264 + }, + { + "epoch": 1.3277755185034568, + "grad_norm": 6.955476017714798, + "learning_rate": 1.9728532831808892e-05, + "loss": 0.1448, + "step": 3265 + }, + { + "epoch": 1.3281821878812525, + "grad_norm": 5.92501274856157, + "learning_rate": 1.972829830707509e-05, + "loss": 0.2095, + "step": 3266 + }, + { + "epoch": 1.3285888572590485, + "grad_norm": 6.806355713966058, + "learning_rate": 1.9728063682475412e-05, + "loss": 0.1546, + "step": 3267 + }, + { + "epoch": 1.3289955266368443, + "grad_norm": 9.144233699128783, + "learning_rate": 1.9727828958012266e-05, + "loss": 0.2642, + "step": 3268 + }, + { + "epoch": 1.32940219601464, + "grad_norm": 26.111099941818292, + "learning_rate": 1.9727594133688058e-05, + "loss": 0.8544, + "step": 3269 + }, + { + "epoch": 1.3298088653924358, + "grad_norm": 6.285394952104549, + "learning_rate": 1.97273592095052e-05, + "loss": 0.2148, + "step": 3270 + }, + { + "epoch": 1.3302155347702318, + "grad_norm": 14.693544232758397, + "learning_rate": 1.9727124185466106e-05, + "loss": 1.7208, + "step": 3271 + }, + { + "epoch": 1.3306222041480278, + "grad_norm": 11.728976993554529, + "learning_rate": 1.9726889061573185e-05, + "loss": 0.3015, + "step": 3272 + }, + { + "epoch": 1.3310288735258236, + "grad_norm": 17.07408194180576, + "learning_rate": 1.972665383782885e-05, + "loss": 0.2526, + "step": 3273 + }, + { + "epoch": 1.3314355429036193, + "grad_norm": 7.541917240109893, + "learning_rate": 1.9726418514235522e-05, + "loss": 0.1767, + "step": 3274 + }, + { + "epoch": 1.331842212281415, + "grad_norm": 11.176031013212524, + "learning_rate": 1.972618309079561e-05, + "loss": 0.2991, + "step": 3275 + }, + { + "epoch": 1.332248881659211, + "grad_norm": 14.69175289346567, + "learning_rate": 1.9725947567511535e-05, + "loss": 0.1741, + "step": 3276 + }, + { + "epoch": 1.3326555510370068, + "grad_norm": 15.659011338426009, + "learning_rate": 1.9725711944385713e-05, + "loss": 0.5475, + "step": 3277 + }, + { + "epoch": 1.3330622204148028, + "grad_norm": 17.447040384312604, + "learning_rate": 1.9725476221420562e-05, + "loss": 0.3885, + "step": 3278 + }, + { + "epoch": 1.3334688897925986, + "grad_norm": 10.923234086315293, + "learning_rate": 1.9725240398618504e-05, + "loss": 0.2876, + "step": 3279 + }, + { + "epoch": 1.3338755591703944, + "grad_norm": 25.83170065559397, + "learning_rate": 1.9725004475981958e-05, + "loss": 0.9076, + "step": 3280 + }, + { + "epoch": 1.3342822285481903, + "grad_norm": 8.88442879453646, + "learning_rate": 1.9724768453513346e-05, + "loss": 0.2615, + "step": 3281 + }, + { + "epoch": 1.334688897925986, + "grad_norm": 17.71446397404777, + 
"learning_rate": 1.9724532331215095e-05, + "loss": 0.8506, + "step": 3282 + }, + { + "epoch": 1.335095567303782, + "grad_norm": 12.342836355548044, + "learning_rate": 1.9724296109089623e-05, + "loss": 0.7248, + "step": 3283 + }, + { + "epoch": 1.3355022366815779, + "grad_norm": 6.471675830996487, + "learning_rate": 1.9724059787139357e-05, + "loss": 0.1772, + "step": 3284 + }, + { + "epoch": 1.3359089060593736, + "grad_norm": 11.054322925864358, + "learning_rate": 1.9723823365366725e-05, + "loss": 0.2771, + "step": 3285 + }, + { + "epoch": 1.3363155754371696, + "grad_norm": 14.337762457471301, + "learning_rate": 1.972358684377415e-05, + "loss": 0.8845, + "step": 3286 + }, + { + "epoch": 1.3367222448149654, + "grad_norm": 14.310951802060663, + "learning_rate": 1.9723350222364063e-05, + "loss": 0.4616, + "step": 3287 + }, + { + "epoch": 1.3371289141927614, + "grad_norm": 4.852663383553416, + "learning_rate": 1.972311350113889e-05, + "loss": 0.1155, + "step": 3288 + }, + { + "epoch": 1.3375355835705571, + "grad_norm": 11.30820108326576, + "learning_rate": 1.9722876680101068e-05, + "loss": 0.2264, + "step": 3289 + }, + { + "epoch": 1.337942252948353, + "grad_norm": 16.21015142970855, + "learning_rate": 1.9722639759253022e-05, + "loss": 0.4795, + "step": 3290 + }, + { + "epoch": 1.3383489223261489, + "grad_norm": 17.995089591786584, + "learning_rate": 1.9722402738597183e-05, + "loss": 0.5184, + "step": 3291 + }, + { + "epoch": 1.3387555917039446, + "grad_norm": 12.513088363674422, + "learning_rate": 1.9722165618135988e-05, + "loss": 0.9092, + "step": 3292 + }, + { + "epoch": 1.3391622610817406, + "grad_norm": 20.376966697418144, + "learning_rate": 1.9721928397871872e-05, + "loss": 0.5375, + "step": 3293 + }, + { + "epoch": 1.3395689304595364, + "grad_norm": 14.865666950476676, + "learning_rate": 1.9721691077807267e-05, + "loss": 0.7125, + "step": 3294 + }, + { + "epoch": 1.3399755998373322, + "grad_norm": 3.906810184961807, + "learning_rate": 1.9721453657944608e-05, + "loss": 0.0915, + "step": 3295 + }, + { + "epoch": 1.3403822692151282, + "grad_norm": 12.267016326753241, + "learning_rate": 1.972121613828634e-05, + "loss": 0.4883, + "step": 3296 + }, + { + "epoch": 1.340788938592924, + "grad_norm": 6.816737042560084, + "learning_rate": 1.972097851883489e-05, + "loss": 0.2622, + "step": 3297 + }, + { + "epoch": 1.34119560797072, + "grad_norm": 3.87644293158215, + "learning_rate": 1.9720740799592704e-05, + "loss": 0.1125, + "step": 3298 + }, + { + "epoch": 1.3416022773485157, + "grad_norm": 18.73004256460719, + "learning_rate": 1.9720502980562224e-05, + "loss": 1.0481, + "step": 3299 + }, + { + "epoch": 1.3420089467263114, + "grad_norm": 12.72144164839788, + "learning_rate": 1.9720265061745885e-05, + "loss": 0.6592, + "step": 3300 + }, + { + "epoch": 1.3424156161041074, + "grad_norm": 5.404450303234163, + "learning_rate": 1.9720027043146135e-05, + "loss": 0.0635, + "step": 3301 + }, + { + "epoch": 1.3428222854819032, + "grad_norm": 7.324900993011534, + "learning_rate": 1.9719788924765416e-05, + "loss": 0.1339, + "step": 3302 + }, + { + "epoch": 1.3432289548596992, + "grad_norm": 23.938093652877498, + "learning_rate": 1.971955070660617e-05, + "loss": 0.4315, + "step": 3303 + }, + { + "epoch": 1.343635624237495, + "grad_norm": 9.124667143221396, + "learning_rate": 1.9719312388670845e-05, + "loss": 0.213, + "step": 3304 + }, + { + "epoch": 1.3440422936152907, + "grad_norm": 15.319167191478286, + "learning_rate": 1.971907397096189e-05, + "loss": 0.3398, + "step": 3305 + }, + { + "epoch": 
1.3444489629930867, + "grad_norm": 10.72445707884629, + "learning_rate": 1.9718835453481745e-05, + "loss": 0.2936, + "step": 3306 + }, + { + "epoch": 1.3448556323708825, + "grad_norm": 10.883286618627377, + "learning_rate": 1.9718596836232864e-05, + "loss": 0.472, + "step": 3307 + }, + { + "epoch": 1.3452623017486784, + "grad_norm": 5.4832962533649265, + "learning_rate": 1.9718358119217694e-05, + "loss": 0.1215, + "step": 3308 + }, + { + "epoch": 1.3456689711264742, + "grad_norm": 6.068997820307636, + "learning_rate": 1.971811930243869e-05, + "loss": 0.1001, + "step": 3309 + }, + { + "epoch": 1.34607564050427, + "grad_norm": 4.806576286051711, + "learning_rate": 1.97178803858983e-05, + "loss": 0.0832, + "step": 3310 + }, + { + "epoch": 1.3464823098820657, + "grad_norm": 6.677272464553672, + "learning_rate": 1.9717641369598972e-05, + "loss": 0.2518, + "step": 3311 + }, + { + "epoch": 1.3468889792598617, + "grad_norm": 15.349317878564783, + "learning_rate": 1.971740225354317e-05, + "loss": 0.5427, + "step": 3312 + }, + { + "epoch": 1.3472956486376577, + "grad_norm": 9.497474067803724, + "learning_rate": 1.971716303773334e-05, + "loss": 0.1291, + "step": 3313 + }, + { + "epoch": 1.3477023180154535, + "grad_norm": 9.553902384713796, + "learning_rate": 1.971692372217194e-05, + "loss": 0.3845, + "step": 3314 + }, + { + "epoch": 1.3481089873932492, + "grad_norm": 8.811155893735787, + "learning_rate": 1.9716684306861433e-05, + "loss": 0.1458, + "step": 3315 + }, + { + "epoch": 1.348515656771045, + "grad_norm": 11.74211296722859, + "learning_rate": 1.9716444791804265e-05, + "loss": 0.1216, + "step": 3316 + }, + { + "epoch": 1.348922326148841, + "grad_norm": 41.3439902031373, + "learning_rate": 1.9716205177002906e-05, + "loss": 1.3446, + "step": 3317 + }, + { + "epoch": 1.3493289955266368, + "grad_norm": 1.5144321009421842, + "learning_rate": 1.9715965462459806e-05, + "loss": 0.0122, + "step": 3318 + }, + { + "epoch": 1.3497356649044328, + "grad_norm": 12.719389006369527, + "learning_rate": 1.9715725648177435e-05, + "loss": 0.5192, + "step": 3319 + }, + { + "epoch": 1.3501423342822285, + "grad_norm": 0.9017311687302404, + "learning_rate": 1.9715485734158248e-05, + "loss": 0.0131, + "step": 3320 + }, + { + "epoch": 1.3505490036600243, + "grad_norm": 17.9131951602496, + "learning_rate": 1.9715245720404714e-05, + "loss": 0.3902, + "step": 3321 + }, + { + "epoch": 1.3509556730378203, + "grad_norm": 4.807601173280448, + "learning_rate": 1.9715005606919287e-05, + "loss": 0.0652, + "step": 3322 + }, + { + "epoch": 1.351362342415616, + "grad_norm": 2.7624428168872974, + "learning_rate": 1.971476539370444e-05, + "loss": 0.0518, + "step": 3323 + }, + { + "epoch": 1.351769011793412, + "grad_norm": 10.701279775376996, + "learning_rate": 1.9714525080762637e-05, + "loss": 0.4081, + "step": 3324 + }, + { + "epoch": 1.3521756811712078, + "grad_norm": 1.2222255673163744, + "learning_rate": 1.9714284668096348e-05, + "loss": 0.0164, + "step": 3325 + }, + { + "epoch": 1.3525823505490036, + "grad_norm": 1.6930229865512858, + "learning_rate": 1.9714044155708036e-05, + "loss": 0.0266, + "step": 3326 + }, + { + "epoch": 1.3529890199267995, + "grad_norm": 20.149584502718273, + "learning_rate": 1.971380354360017e-05, + "loss": 0.8167, + "step": 3327 + }, + { + "epoch": 1.3533956893045953, + "grad_norm": 1.627691809901239, + "learning_rate": 1.971356283177522e-05, + "loss": 0.0267, + "step": 3328 + }, + { + "epoch": 1.3538023586823913, + "grad_norm": 19.454005458259875, + "learning_rate": 1.9713322020235665e-05, + "loss": 
0.9528, + "step": 3329 + }, + { + "epoch": 1.354209028060187, + "grad_norm": 1.884871170308761, + "learning_rate": 1.9713081108983966e-05, + "loss": 0.0355, + "step": 3330 + }, + { + "epoch": 1.3546156974379828, + "grad_norm": 4.882193501289914, + "learning_rate": 1.9712840098022605e-05, + "loss": 0.1473, + "step": 3331 + }, + { + "epoch": 1.3550223668157788, + "grad_norm": 16.599190902955915, + "learning_rate": 1.9712598987354048e-05, + "loss": 0.8303, + "step": 3332 + }, + { + "epoch": 1.3554290361935746, + "grad_norm": 8.02032752686728, + "learning_rate": 1.9712357776980775e-05, + "loss": 0.1818, + "step": 3333 + }, + { + "epoch": 1.3558357055713706, + "grad_norm": 3.276640322758548, + "learning_rate": 1.9712116466905263e-05, + "loss": 0.096, + "step": 3334 + }, + { + "epoch": 1.3562423749491663, + "grad_norm": 9.819873693414676, + "learning_rate": 1.9711875057129985e-05, + "loss": 0.1977, + "step": 3335 + }, + { + "epoch": 1.356649044326962, + "grad_norm": 5.00687739213794, + "learning_rate": 1.9711633547657426e-05, + "loss": 0.1303, + "step": 3336 + }, + { + "epoch": 1.357055713704758, + "grad_norm": 22.800849791419363, + "learning_rate": 1.9711391938490057e-05, + "loss": 0.5669, + "step": 3337 + }, + { + "epoch": 1.3574623830825538, + "grad_norm": 12.355846433402704, + "learning_rate": 1.9711150229630363e-05, + "loss": 0.5364, + "step": 3338 + }, + { + "epoch": 1.3578690524603498, + "grad_norm": 59.48145831337065, + "learning_rate": 1.9710908421080823e-05, + "loss": 0.5973, + "step": 3339 + }, + { + "epoch": 1.3582757218381456, + "grad_norm": 15.305663009825576, + "learning_rate": 1.9710666512843922e-05, + "loss": 0.2027, + "step": 3340 + }, + { + "epoch": 1.3586823912159414, + "grad_norm": 45.58245794815909, + "learning_rate": 1.971042450492214e-05, + "loss": 1.2934, + "step": 3341 + }, + { + "epoch": 1.3590890605937374, + "grad_norm": 27.618601904426917, + "learning_rate": 1.971018239731797e-05, + "loss": 1.2572, + "step": 3342 + }, + { + "epoch": 1.3594957299715331, + "grad_norm": 28.318515665865, + "learning_rate": 1.9709940190033883e-05, + "loss": 0.8855, + "step": 3343 + }, + { + "epoch": 1.359902399349329, + "grad_norm": 17.406806170187703, + "learning_rate": 1.9709697883072377e-05, + "loss": 1.0463, + "step": 3344 + }, + { + "epoch": 1.3603090687271249, + "grad_norm": 5.226120843218684, + "learning_rate": 1.9709455476435936e-05, + "loss": 0.0686, + "step": 3345 + }, + { + "epoch": 1.3607157381049206, + "grad_norm": 13.956296829421014, + "learning_rate": 1.9709212970127045e-05, + "loss": 0.393, + "step": 3346 + }, + { + "epoch": 1.3611224074827166, + "grad_norm": 13.89608255691408, + "learning_rate": 1.9708970364148197e-05, + "loss": 0.3838, + "step": 3347 + }, + { + "epoch": 1.3615290768605124, + "grad_norm": 11.608047489754226, + "learning_rate": 1.9708727658501882e-05, + "loss": 0.6713, + "step": 3348 + }, + { + "epoch": 1.3619357462383084, + "grad_norm": 10.835382181912314, + "learning_rate": 1.970848485319059e-05, + "loss": 0.1576, + "step": 3349 + }, + { + "epoch": 1.3623424156161041, + "grad_norm": 10.066379794774983, + "learning_rate": 1.970824194821682e-05, + "loss": 0.3038, + "step": 3350 + }, + { + "epoch": 1.3627490849939, + "grad_norm": 12.699979052376179, + "learning_rate": 1.9707998943583058e-05, + "loss": 0.5504, + "step": 3351 + }, + { + "epoch": 1.363155754371696, + "grad_norm": 10.835152103891692, + "learning_rate": 1.97077558392918e-05, + "loss": 0.2947, + "step": 3352 + }, + { + "epoch": 1.3635624237494917, + "grad_norm": 9.544913275998557, + 
"learning_rate": 1.970751263534554e-05, + "loss": 0.4545, + "step": 3353 + }, + { + "epoch": 1.3639690931272876, + "grad_norm": 14.243016458108775, + "learning_rate": 1.970726933174678e-05, + "loss": 0.7304, + "step": 3354 + }, + { + "epoch": 1.3643757625050834, + "grad_norm": 7.0844264142495685, + "learning_rate": 1.9707025928498013e-05, + "loss": 0.1926, + "step": 3355 + }, + { + "epoch": 1.3647824318828792, + "grad_norm": 17.82240685169231, + "learning_rate": 1.9706782425601743e-05, + "loss": 0.5873, + "step": 3356 + }, + { + "epoch": 1.365189101260675, + "grad_norm": 13.017392961196801, + "learning_rate": 1.970653882306046e-05, + "loss": 0.6662, + "step": 3357 + }, + { + "epoch": 1.365595770638471, + "grad_norm": 12.552391849745476, + "learning_rate": 1.9706295120876678e-05, + "loss": 0.3306, + "step": 3358 + }, + { + "epoch": 1.3660024400162667, + "grad_norm": 6.647199645916804, + "learning_rate": 1.9706051319052886e-05, + "loss": 0.2078, + "step": 3359 + }, + { + "epoch": 1.3664091093940627, + "grad_norm": 6.021461219043981, + "learning_rate": 1.9705807417591596e-05, + "loss": 0.1079, + "step": 3360 + }, + { + "epoch": 1.3668157787718584, + "grad_norm": 8.574270211973134, + "learning_rate": 1.9705563416495304e-05, + "loss": 0.3373, + "step": 3361 + }, + { + "epoch": 1.3672224481496542, + "grad_norm": 7.296129264902366, + "learning_rate": 1.970531931576652e-05, + "loss": 0.3854, + "step": 3362 + }, + { + "epoch": 1.3676291175274502, + "grad_norm": 17.87465729668356, + "learning_rate": 1.970507511540775e-05, + "loss": 0.132, + "step": 3363 + }, + { + "epoch": 1.368035786905246, + "grad_norm": 12.186894068227828, + "learning_rate": 1.97048308154215e-05, + "loss": 0.6119, + "step": 3364 + }, + { + "epoch": 1.368442456283042, + "grad_norm": 6.770462158953771, + "learning_rate": 1.970458641581028e-05, + "loss": 0.1414, + "step": 3365 + }, + { + "epoch": 1.3688491256608377, + "grad_norm": 8.525295472897664, + "learning_rate": 1.970434191657659e-05, + "loss": 0.4244, + "step": 3366 + }, + { + "epoch": 1.3692557950386335, + "grad_norm": 7.632811226065002, + "learning_rate": 1.970409731772295e-05, + "loss": 0.1904, + "step": 3367 + }, + { + "epoch": 1.3696624644164295, + "grad_norm": 35.26697126581435, + "learning_rate": 1.9703852619251864e-05, + "loss": 0.1894, + "step": 3368 + }, + { + "epoch": 1.3700691337942252, + "grad_norm": 10.144137254223999, + "learning_rate": 1.970360782116585e-05, + "loss": 0.2389, + "step": 3369 + }, + { + "epoch": 1.3704758031720212, + "grad_norm": 5.0225089538126975, + "learning_rate": 1.9703362923467416e-05, + "loss": 0.0847, + "step": 3370 + }, + { + "epoch": 1.370882472549817, + "grad_norm": 28.7011336480477, + "learning_rate": 1.970311792615908e-05, + "loss": 0.5241, + "step": 3371 + }, + { + "epoch": 1.3712891419276128, + "grad_norm": 14.7005913661507, + "learning_rate": 1.9702872829243352e-05, + "loss": 0.9147, + "step": 3372 + }, + { + "epoch": 1.3716958113054087, + "grad_norm": 11.915905361856499, + "learning_rate": 1.9702627632722757e-05, + "loss": 0.4908, + "step": 3373 + }, + { + "epoch": 1.3721024806832045, + "grad_norm": 4.2538206145866075, + "learning_rate": 1.97023823365998e-05, + "loss": 0.1118, + "step": 3374 + }, + { + "epoch": 1.3725091500610005, + "grad_norm": 1.9920061387639785, + "learning_rate": 1.9702136940877006e-05, + "loss": 0.0413, + "step": 3375 + }, + { + "epoch": 1.3729158194387963, + "grad_norm": 2.753971467900864, + "learning_rate": 1.9701891445556895e-05, + "loss": 0.0535, + "step": 3376 + }, + { + "epoch": 1.373322488816592, + 
"grad_norm": 24.59636962337153, + "learning_rate": 1.9701645850641986e-05, + "loss": 0.6155, + "step": 3377 + }, + { + "epoch": 1.373729158194388, + "grad_norm": 13.345117110041441, + "learning_rate": 1.9701400156134797e-05, + "loss": 0.184, + "step": 3378 + }, + { + "epoch": 1.3741358275721838, + "grad_norm": 24.61700209363476, + "learning_rate": 1.9701154362037855e-05, + "loss": 1.4362, + "step": 3379 + }, + { + "epoch": 1.3745424969499798, + "grad_norm": 12.069406735129013, + "learning_rate": 1.970090846835368e-05, + "loss": 0.3595, + "step": 3380 + }, + { + "epoch": 1.3749491663277755, + "grad_norm": 9.14932719736133, + "learning_rate": 1.9700662475084797e-05, + "loss": 0.3878, + "step": 3381 + }, + { + "epoch": 1.3753558357055713, + "grad_norm": 9.705071105383997, + "learning_rate": 1.9700416382233733e-05, + "loss": 0.2544, + "step": 3382 + }, + { + "epoch": 1.3757625050833673, + "grad_norm": 12.758206701108831, + "learning_rate": 1.9700170189803014e-05, + "loss": 0.2644, + "step": 3383 + }, + { + "epoch": 1.376169174461163, + "grad_norm": 13.240103114493607, + "learning_rate": 1.9699923897795165e-05, + "loss": 0.523, + "step": 3384 + }, + { + "epoch": 1.376575843838959, + "grad_norm": 1.2954062409850422, + "learning_rate": 1.969967750621271e-05, + "loss": 0.0105, + "step": 3385 + }, + { + "epoch": 1.3769825132167548, + "grad_norm": 7.668466036438171, + "learning_rate": 1.969943101505819e-05, + "loss": 0.163, + "step": 3386 + }, + { + "epoch": 1.3773891825945506, + "grad_norm": 10.534392632547409, + "learning_rate": 1.9699184424334127e-05, + "loss": 0.2621, + "step": 3387 + }, + { + "epoch": 1.3777958519723466, + "grad_norm": 30.671280879438005, + "learning_rate": 1.969893773404305e-05, + "loss": 0.7861, + "step": 3388 + }, + { + "epoch": 1.3782025213501423, + "grad_norm": 11.737666318221184, + "learning_rate": 1.9698690944187505e-05, + "loss": 0.2778, + "step": 3389 + }, + { + "epoch": 1.3786091907279383, + "grad_norm": 6.1094532117881615, + "learning_rate": 1.969844405477001e-05, + "loss": 0.1767, + "step": 3390 + }, + { + "epoch": 1.379015860105734, + "grad_norm": 1.8470843640496797, + "learning_rate": 1.9698197065793107e-05, + "loss": 0.023, + "step": 3391 + }, + { + "epoch": 1.3794225294835298, + "grad_norm": 16.607784161636374, + "learning_rate": 1.969794997725933e-05, + "loss": 0.3083, + "step": 3392 + }, + { + "epoch": 1.3798291988613258, + "grad_norm": 0.8743879415590986, + "learning_rate": 1.9697702789171217e-05, + "loss": 0.0125, + "step": 3393 + }, + { + "epoch": 1.3802358682391216, + "grad_norm": 16.50096821633784, + "learning_rate": 1.96974555015313e-05, + "loss": 0.4977, + "step": 3394 + }, + { + "epoch": 1.3806425376169176, + "grad_norm": 3.2282340427450027, + "learning_rate": 1.9697208114342126e-05, + "loss": 0.0469, + "step": 3395 + }, + { + "epoch": 1.3810492069947133, + "grad_norm": 19.002560313868237, + "learning_rate": 1.969696062760623e-05, + "loss": 0.733, + "step": 3396 + }, + { + "epoch": 1.381455876372509, + "grad_norm": 2.3505207009825346, + "learning_rate": 1.969671304132615e-05, + "loss": 0.0213, + "step": 3397 + }, + { + "epoch": 1.3818625457503049, + "grad_norm": 6.29412723038642, + "learning_rate": 1.9696465355504432e-05, + "loss": 0.1129, + "step": 3398 + }, + { + "epoch": 1.3822692151281009, + "grad_norm": 4.891508984422087, + "learning_rate": 1.9696217570143616e-05, + "loss": 0.1154, + "step": 3399 + }, + { + "epoch": 1.3826758845058966, + "grad_norm": 6.016118266810501, + "learning_rate": 1.9695969685246247e-05, + "loss": 0.1096, + "step": 3400 + 
}, + { + "epoch": 1.3830825538836926, + "grad_norm": 15.960374042650093, + "learning_rate": 1.969572170081487e-05, + "loss": 0.6626, + "step": 3401 + }, + { + "epoch": 1.3834892232614884, + "grad_norm": 4.596638814689929, + "learning_rate": 1.969547361685203e-05, + "loss": 0.0728, + "step": 3402 + }, + { + "epoch": 1.3838958926392841, + "grad_norm": 15.04572525445932, + "learning_rate": 1.9695225433360273e-05, + "loss": 0.8095, + "step": 3403 + }, + { + "epoch": 1.3843025620170801, + "grad_norm": 17.97840510712015, + "learning_rate": 1.969497715034215e-05, + "loss": 0.4027, + "step": 3404 + }, + { + "epoch": 1.384709231394876, + "grad_norm": 2.70168082830703, + "learning_rate": 1.96947287678002e-05, + "loss": 0.0434, + "step": 3405 + }, + { + "epoch": 1.3851159007726719, + "grad_norm": 2.3252133863804865, + "learning_rate": 1.9694480285736984e-05, + "loss": 0.0324, + "step": 3406 + }, + { + "epoch": 1.3855225701504676, + "grad_norm": 7.355976614101487, + "learning_rate": 1.9694231704155054e-05, + "loss": 0.2047, + "step": 3407 + }, + { + "epoch": 1.3859292395282634, + "grad_norm": 11.659911073091065, + "learning_rate": 1.969398302305695e-05, + "loss": 0.3611, + "step": 3408 + }, + { + "epoch": 1.3863359089060594, + "grad_norm": 23.117457284840526, + "learning_rate": 1.9693734242445235e-05, + "loss": 2.4054, + "step": 3409 + }, + { + "epoch": 1.3867425782838552, + "grad_norm": 14.64094767732075, + "learning_rate": 1.969348536232246e-05, + "loss": 0.2842, + "step": 3410 + }, + { + "epoch": 1.3871492476616512, + "grad_norm": 10.77558042923643, + "learning_rate": 1.9693236382691174e-05, + "loss": 0.3291, + "step": 3411 + }, + { + "epoch": 1.387555917039447, + "grad_norm": 19.094704547940207, + "learning_rate": 1.9692987303553942e-05, + "loss": 0.1978, + "step": 3412 + }, + { + "epoch": 1.3879625864172427, + "grad_norm": 18.92234377827657, + "learning_rate": 1.9692738124913317e-05, + "loss": 0.8657, + "step": 3413 + }, + { + "epoch": 1.3883692557950387, + "grad_norm": 12.25395070506609, + "learning_rate": 1.9692488846771853e-05, + "loss": 0.305, + "step": 3414 + }, + { + "epoch": 1.3887759251728344, + "grad_norm": 16.544868030534783, + "learning_rate": 1.9692239469132116e-05, + "loss": 0.8245, + "step": 3415 + }, + { + "epoch": 1.3891825945506304, + "grad_norm": 14.364807176850718, + "learning_rate": 1.9691989991996663e-05, + "loss": 0.5153, + "step": 3416 + }, + { + "epoch": 1.3895892639284262, + "grad_norm": 12.545278142947833, + "learning_rate": 1.9691740415368053e-05, + "loss": 0.398, + "step": 3417 + }, + { + "epoch": 1.389995933306222, + "grad_norm": 7.3514018365214255, + "learning_rate": 1.9691490739248854e-05, + "loss": 0.3389, + "step": 3418 + }, + { + "epoch": 1.390402602684018, + "grad_norm": 10.427632462303963, + "learning_rate": 1.969124096364162e-05, + "loss": 0.262, + "step": 3419 + }, + { + "epoch": 1.3908092720618137, + "grad_norm": 17.911060078319682, + "learning_rate": 1.9690991088548923e-05, + "loss": 1.24, + "step": 3420 + }, + { + "epoch": 1.3912159414396097, + "grad_norm": 9.538120883394498, + "learning_rate": 1.9690741113973324e-05, + "loss": 0.1823, + "step": 3421 + }, + { + "epoch": 1.3916226108174055, + "grad_norm": 14.10914356508155, + "learning_rate": 1.969049103991739e-05, + "loss": 0.3764, + "step": 3422 + }, + { + "epoch": 1.3920292801952012, + "grad_norm": 3.6465326093407047, + "learning_rate": 1.969024086638369e-05, + "loss": 0.1633, + "step": 3423 + }, + { + "epoch": 1.3924359495729972, + "grad_norm": 8.861825569957738, + "learning_rate": 
1.9689990593374788e-05, + "loss": 0.1397, + "step": 3424 + }, + { + "epoch": 1.392842618950793, + "grad_norm": 11.92285321383512, + "learning_rate": 1.968974022089326e-05, + "loss": 0.5221, + "step": 3425 + }, + { + "epoch": 1.393249288328589, + "grad_norm": 22.4667322588501, + "learning_rate": 1.9689489748941665e-05, + "loss": 0.9959, + "step": 3426 + }, + { + "epoch": 1.3936559577063847, + "grad_norm": 12.076229402570746, + "learning_rate": 1.9689239177522586e-05, + "loss": 0.0993, + "step": 3427 + }, + { + "epoch": 1.3940626270841805, + "grad_norm": 10.75529055851647, + "learning_rate": 1.968898850663859e-05, + "loss": 0.3067, + "step": 3428 + }, + { + "epoch": 1.3944692964619765, + "grad_norm": 15.4108257781621, + "learning_rate": 1.968873773629225e-05, + "loss": 0.5531, + "step": 3429 + }, + { + "epoch": 1.3948759658397722, + "grad_norm": 5.453091609939368, + "learning_rate": 1.968848686648614e-05, + "loss": 0.0599, + "step": 3430 + }, + { + "epoch": 1.3952826352175682, + "grad_norm": 18.981600234045207, + "learning_rate": 1.968823589722284e-05, + "loss": 1.0782, + "step": 3431 + }, + { + "epoch": 1.395689304595364, + "grad_norm": 10.817922073989683, + "learning_rate": 1.968798482850492e-05, + "loss": 0.2888, + "step": 3432 + }, + { + "epoch": 1.3960959739731598, + "grad_norm": 3.722339638494909, + "learning_rate": 1.9687733660334958e-05, + "loss": 0.0534, + "step": 3433 + }, + { + "epoch": 1.3965026433509558, + "grad_norm": 28.047050453742333, + "learning_rate": 1.9687482392715535e-05, + "loss": 0.9695, + "step": 3434 + }, + { + "epoch": 1.3969093127287515, + "grad_norm": 3.7387416459600824, + "learning_rate": 1.968723102564923e-05, + "loss": 0.0617, + "step": 3435 + }, + { + "epoch": 1.3973159821065475, + "grad_norm": 15.830858465131328, + "learning_rate": 1.9686979559138624e-05, + "loss": 0.9736, + "step": 3436 + }, + { + "epoch": 1.3977226514843433, + "grad_norm": 10.930145446312624, + "learning_rate": 1.9686727993186297e-05, + "loss": 0.1597, + "step": 3437 + }, + { + "epoch": 1.398129320862139, + "grad_norm": 4.868553556157944, + "learning_rate": 1.968647632779483e-05, + "loss": 0.0782, + "step": 3438 + }, + { + "epoch": 1.3985359902399348, + "grad_norm": 15.599801728861003, + "learning_rate": 1.968622456296681e-05, + "loss": 0.9377, + "step": 3439 + }, + { + "epoch": 1.3989426596177308, + "grad_norm": 18.900205934329158, + "learning_rate": 1.968597269870482e-05, + "loss": 0.8942, + "step": 3440 + }, + { + "epoch": 1.3993493289955268, + "grad_norm": 8.719068021404931, + "learning_rate": 1.968572073501145e-05, + "loss": 0.3611, + "step": 3441 + }, + { + "epoch": 1.3997559983733225, + "grad_norm": 20.338487316519732, + "learning_rate": 1.9685468671889274e-05, + "loss": 1.2117, + "step": 3442 + }, + { + "epoch": 1.4001626677511183, + "grad_norm": 9.328263204887529, + "learning_rate": 1.968521650934089e-05, + "loss": 0.4293, + "step": 3443 + }, + { + "epoch": 1.400569337128914, + "grad_norm": 14.865125005157063, + "learning_rate": 1.9684964247368885e-05, + "loss": 0.3374, + "step": 3444 + }, + { + "epoch": 1.40097600650671, + "grad_norm": 20.92588283662988, + "learning_rate": 1.9684711885975848e-05, + "loss": 0.5496, + "step": 3445 + }, + { + "epoch": 1.4013826758845058, + "grad_norm": 14.089302498949722, + "learning_rate": 1.968445942516437e-05, + "loss": 0.6451, + "step": 3446 + }, + { + "epoch": 1.4017893452623018, + "grad_norm": 7.405071497448515, + "learning_rate": 1.9684206864937042e-05, + "loss": 0.3471, + "step": 3447 + }, + { + "epoch": 1.4021960146400976, + "grad_norm": 
21.365274817849283, + "learning_rate": 1.9683954205296454e-05, + "loss": 1.8675, + "step": 3448 + }, + { + "epoch": 1.4026026840178933, + "grad_norm": 15.600862982234634, + "learning_rate": 1.9683701446245202e-05, + "loss": 0.9098, + "step": 3449 + }, + { + "epoch": 1.4030093533956893, + "grad_norm": 4.904407321582975, + "learning_rate": 1.9683448587785886e-05, + "loss": 0.0946, + "step": 3450 + }, + { + "epoch": 1.403416022773485, + "grad_norm": 8.970459743929855, + "learning_rate": 1.9683195629921094e-05, + "loss": 0.2289, + "step": 3451 + }, + { + "epoch": 1.403822692151281, + "grad_norm": 0.5449516052293052, + "learning_rate": 1.968294257265342e-05, + "loss": 0.0109, + "step": 3452 + }, + { + "epoch": 1.4042293615290768, + "grad_norm": 7.845136231214165, + "learning_rate": 1.9682689415985474e-05, + "loss": 0.1649, + "step": 3453 + }, + { + "epoch": 1.4046360309068726, + "grad_norm": 11.250918512796394, + "learning_rate": 1.9682436159919846e-05, + "loss": 0.5765, + "step": 3454 + }, + { + "epoch": 1.4050427002846686, + "grad_norm": 27.967943144493546, + "learning_rate": 1.9682182804459137e-05, + "loss": 0.1656, + "step": 3455 + }, + { + "epoch": 1.4054493696624644, + "grad_norm": 0.7098587068607981, + "learning_rate": 1.9681929349605952e-05, + "loss": 0.0122, + "step": 3456 + }, + { + "epoch": 1.4058560390402604, + "grad_norm": 10.5984810058462, + "learning_rate": 1.9681675795362884e-05, + "loss": 0.3657, + "step": 3457 + }, + { + "epoch": 1.4062627084180561, + "grad_norm": 12.081607416503166, + "learning_rate": 1.9681422141732544e-05, + "loss": 0.3884, + "step": 3458 + }, + { + "epoch": 1.4066693777958519, + "grad_norm": 13.257755301492773, + "learning_rate": 1.9681168388717534e-05, + "loss": 0.3473, + "step": 3459 + }, + { + "epoch": 1.4070760471736479, + "grad_norm": 12.532686973035768, + "learning_rate": 1.9680914536320457e-05, + "loss": 0.4866, + "step": 3460 + }, + { + "epoch": 1.4074827165514436, + "grad_norm": 17.644530425537237, + "learning_rate": 1.9680660584543922e-05, + "loss": 1.0369, + "step": 3461 + }, + { + "epoch": 1.4078893859292396, + "grad_norm": 97.57896572755686, + "learning_rate": 1.968040653339053e-05, + "loss": 1.076, + "step": 3462 + }, + { + "epoch": 1.4082960553070354, + "grad_norm": 7.4862054705742045, + "learning_rate": 1.9680152382862894e-05, + "loss": 0.3177, + "step": 3463 + }, + { + "epoch": 1.4087027246848312, + "grad_norm": 10.247994236886349, + "learning_rate": 1.9679898132963623e-05, + "loss": 0.521, + "step": 3464 + }, + { + "epoch": 1.4091093940626271, + "grad_norm": 12.780616348299723, + "learning_rate": 1.9679643783695325e-05, + "loss": 0.8433, + "step": 3465 + }, + { + "epoch": 1.409516063440423, + "grad_norm": 3.7552709145750023, + "learning_rate": 1.9679389335060613e-05, + "loss": 0.0604, + "step": 3466 + }, + { + "epoch": 1.409922732818219, + "grad_norm": 6.434932736594949, + "learning_rate": 1.96791347870621e-05, + "loss": 0.181, + "step": 3467 + }, + { + "epoch": 1.4103294021960147, + "grad_norm": 6.882524895017433, + "learning_rate": 1.9678880139702392e-05, + "loss": 0.166, + "step": 3468 + }, + { + "epoch": 1.4107360715738104, + "grad_norm": 7.870744158125587, + "learning_rate": 1.967862539298411e-05, + "loss": 0.1982, + "step": 3469 + }, + { + "epoch": 1.4111427409516064, + "grad_norm": 6.345348274284139, + "learning_rate": 1.9678370546909868e-05, + "loss": 0.2347, + "step": 3470 + }, + { + "epoch": 1.4115494103294022, + "grad_norm": 11.225235992787068, + "learning_rate": 1.9678115601482282e-05, + "loss": 0.2507, + "step": 3471 + }, + { 
+ "epoch": 1.4119560797071982, + "grad_norm": 25.208777482905454, + "learning_rate": 1.9677860556703965e-05, + "loss": 0.323, + "step": 3472 + }, + { + "epoch": 1.412362749084994, + "grad_norm": 9.44894734843842, + "learning_rate": 1.967760541257754e-05, + "loss": 0.2367, + "step": 3473 + }, + { + "epoch": 1.4127694184627897, + "grad_norm": 14.733452505810343, + "learning_rate": 1.9677350169105627e-05, + "loss": 0.7791, + "step": 3474 + }, + { + "epoch": 1.4131760878405857, + "grad_norm": 14.76715843012708, + "learning_rate": 1.9677094826290844e-05, + "loss": 0.5327, + "step": 3475 + }, + { + "epoch": 1.4135827572183814, + "grad_norm": 12.198761301750253, + "learning_rate": 1.9676839384135812e-05, + "loss": 0.4185, + "step": 3476 + }, + { + "epoch": 1.4139894265961774, + "grad_norm": 11.180898522463345, + "learning_rate": 1.9676583842643152e-05, + "loss": 0.3097, + "step": 3477 + }, + { + "epoch": 1.4143960959739732, + "grad_norm": 15.927487092756108, + "learning_rate": 1.967632820181549e-05, + "loss": 0.557, + "step": 3478 + }, + { + "epoch": 1.414802765351769, + "grad_norm": 11.420711904823388, + "learning_rate": 1.9676072461655446e-05, + "loss": 0.4004, + "step": 3479 + }, + { + "epoch": 1.4152094347295647, + "grad_norm": 2.789818539454379, + "learning_rate": 1.9675816622165652e-05, + "loss": 0.0386, + "step": 3480 + }, + { + "epoch": 1.4156161041073607, + "grad_norm": 34.29457606646319, + "learning_rate": 1.967556068334873e-05, + "loss": 1.6077, + "step": 3481 + }, + { + "epoch": 1.4160227734851567, + "grad_norm": 13.05226510371793, + "learning_rate": 1.9675304645207306e-05, + "loss": 0.4176, + "step": 3482 + }, + { + "epoch": 1.4164294428629525, + "grad_norm": 29.22960298442077, + "learning_rate": 1.9675048507744012e-05, + "loss": 0.9587, + "step": 3483 + }, + { + "epoch": 1.4168361122407482, + "grad_norm": 3.189956353450483, + "learning_rate": 1.9674792270961474e-05, + "loss": 0.0421, + "step": 3484 + }, + { + "epoch": 1.417242781618544, + "grad_norm": 5.185279081343552, + "learning_rate": 1.9674535934862327e-05, + "loss": 0.0764, + "step": 3485 + }, + { + "epoch": 1.41764945099634, + "grad_norm": 8.641454893333862, + "learning_rate": 1.9674279499449194e-05, + "loss": 0.3683, + "step": 3486 + }, + { + "epoch": 1.4180561203741358, + "grad_norm": 10.327403903215476, + "learning_rate": 1.9674022964724718e-05, + "loss": 0.4815, + "step": 3487 + }, + { + "epoch": 1.4184627897519317, + "grad_norm": 10.02634139729697, + "learning_rate": 1.9673766330691527e-05, + "loss": 0.2626, + "step": 3488 + }, + { + "epoch": 1.4188694591297275, + "grad_norm": 3.682056354620918, + "learning_rate": 1.9673509597352258e-05, + "loss": 0.0571, + "step": 3489 + }, + { + "epoch": 1.4192761285075233, + "grad_norm": 33.232854663094486, + "learning_rate": 1.967325276470954e-05, + "loss": 0.8517, + "step": 3490 + }, + { + "epoch": 1.4196827978853193, + "grad_norm": 10.616485416318126, + "learning_rate": 1.9672995832766015e-05, + "loss": 0.2124, + "step": 3491 + }, + { + "epoch": 1.420089467263115, + "grad_norm": 12.449140843062963, + "learning_rate": 1.967273880152432e-05, + "loss": 0.2278, + "step": 3492 + }, + { + "epoch": 1.420496136640911, + "grad_norm": 17.539519560886394, + "learning_rate": 1.9672481670987093e-05, + "loss": 0.7761, + "step": 3493 + }, + { + "epoch": 1.4209028060187068, + "grad_norm": 12.476896857590587, + "learning_rate": 1.9672224441156975e-05, + "loss": 0.5591, + "step": 3494 + }, + { + "epoch": 1.4213094753965025, + "grad_norm": 3.5423971536465757, + "learning_rate": 
1.9671967112036605e-05, + "loss": 0.0655, + "step": 3495 + }, + { + "epoch": 1.4217161447742985, + "grad_norm": 19.97150864303357, + "learning_rate": 1.9671709683628624e-05, + "loss": 0.2243, + "step": 3496 + }, + { + "epoch": 1.4221228141520943, + "grad_norm": 9.25156184088394, + "learning_rate": 1.9671452155935674e-05, + "loss": 0.3415, + "step": 3497 + }, + { + "epoch": 1.4225294835298903, + "grad_norm": 2.226820993162864, + "learning_rate": 1.96711945289604e-05, + "loss": 0.0411, + "step": 3498 + }, + { + "epoch": 1.422936152907686, + "grad_norm": 8.20907888585257, + "learning_rate": 1.9670936802705447e-05, + "loss": 0.1557, + "step": 3499 + }, + { + "epoch": 1.4233428222854818, + "grad_norm": 18.01620712026314, + "learning_rate": 1.9670678977173462e-05, + "loss": 0.6378, + "step": 3500 + }, + { + "epoch": 1.4237494916632778, + "grad_norm": 14.10931576540015, + "learning_rate": 1.967042105236709e-05, + "loss": 0.7792, + "step": 3501 + }, + { + "epoch": 1.4241561610410736, + "grad_norm": 4.238012963007597, + "learning_rate": 1.9670163028288978e-05, + "loss": 0.0462, + "step": 3502 + }, + { + "epoch": 1.4245628304188696, + "grad_norm": 12.631913244308002, + "learning_rate": 1.9669904904941778e-05, + "loss": 0.4273, + "step": 3503 + }, + { + "epoch": 1.4249694997966653, + "grad_norm": 15.20566561739342, + "learning_rate": 1.9669646682328134e-05, + "loss": 0.6556, + "step": 3504 + }, + { + "epoch": 1.425376169174461, + "grad_norm": 8.805479603857927, + "learning_rate": 1.9669388360450702e-05, + "loss": 0.3515, + "step": 3505 + }, + { + "epoch": 1.425782838552257, + "grad_norm": 8.158401868751934, + "learning_rate": 1.966912993931213e-05, + "loss": 0.2502, + "step": 3506 + }, + { + "epoch": 1.4261895079300528, + "grad_norm": 7.6593165600866415, + "learning_rate": 1.9668871418915074e-05, + "loss": 0.1622, + "step": 3507 + }, + { + "epoch": 1.4265961773078488, + "grad_norm": 4.518468168346309, + "learning_rate": 1.966861279926219e-05, + "loss": 0.196, + "step": 3508 + }, + { + "epoch": 1.4270028466856446, + "grad_norm": 4.631314692382456, + "learning_rate": 1.9668354080356126e-05, + "loss": 0.068, + "step": 3509 + }, + { + "epoch": 1.4274095160634404, + "grad_norm": 7.208928379117441, + "learning_rate": 1.966809526219954e-05, + "loss": 0.1754, + "step": 3510 + }, + { + "epoch": 1.4278161854412363, + "grad_norm": 22.330044962834098, + "learning_rate": 1.9667836344795092e-05, + "loss": 0.9415, + "step": 3511 + }, + { + "epoch": 1.428222854819032, + "grad_norm": 17.896985055234712, + "learning_rate": 1.966757732814544e-05, + "loss": 0.661, + "step": 3512 + }, + { + "epoch": 1.428629524196828, + "grad_norm": 42.42121037687938, + "learning_rate": 1.966731821225324e-05, + "loss": 0.3051, + "step": 3513 + }, + { + "epoch": 1.4290361935746239, + "grad_norm": 3.633003266322307, + "learning_rate": 1.966705899712115e-05, + "loss": 0.0559, + "step": 3514 + }, + { + "epoch": 1.4294428629524196, + "grad_norm": 13.225110708043287, + "learning_rate": 1.966679968275184e-05, + "loss": 0.688, + "step": 3515 + }, + { + "epoch": 1.4298495323302156, + "grad_norm": 12.462969221411447, + "learning_rate": 1.966654026914796e-05, + "loss": 0.323, + "step": 3516 + }, + { + "epoch": 1.4302562017080114, + "grad_norm": 7.828610411824697, + "learning_rate": 1.9666280756312185e-05, + "loss": 0.2408, + "step": 3517 + }, + { + "epoch": 1.4306628710858074, + "grad_norm": 4.134581139785692, + "learning_rate": 1.966602114424717e-05, + "loss": 0.0758, + "step": 3518 + }, + { + "epoch": 1.4310695404636031, + "grad_norm": 
13.602357959994935, + "learning_rate": 1.9665761432955582e-05, + "loss": 1.201, + "step": 3519 + }, + { + "epoch": 1.431476209841399, + "grad_norm": 7.785642298174467, + "learning_rate": 1.9665501622440087e-05, + "loss": 0.2192, + "step": 3520 + }, + { + "epoch": 1.4318828792191947, + "grad_norm": 7.2635078298981295, + "learning_rate": 1.9665241712703356e-05, + "loss": 0.1152, + "step": 3521 + }, + { + "epoch": 1.4322895485969906, + "grad_norm": 12.58285008451158, + "learning_rate": 1.9664981703748055e-05, + "loss": 0.3814, + "step": 3522 + }, + { + "epoch": 1.4326962179747866, + "grad_norm": 4.011274039443395, + "learning_rate": 1.966472159557685e-05, + "loss": 0.0454, + "step": 3523 + }, + { + "epoch": 1.4331028873525824, + "grad_norm": 5.020206929428199, + "learning_rate": 1.9664461388192416e-05, + "loss": 0.102, + "step": 3524 + }, + { + "epoch": 1.4335095567303782, + "grad_norm": 6.327334299991869, + "learning_rate": 1.966420108159742e-05, + "loss": 0.3439, + "step": 3525 + }, + { + "epoch": 1.433916226108174, + "grad_norm": 10.622861510279868, + "learning_rate": 1.9663940675794537e-05, + "loss": 0.6941, + "step": 3526 + }, + { + "epoch": 1.43432289548597, + "grad_norm": 4.541271054405765, + "learning_rate": 1.966368017078644e-05, + "loss": 0.0676, + "step": 3527 + }, + { + "epoch": 1.4347295648637657, + "grad_norm": 6.171519840371758, + "learning_rate": 1.9663419566575803e-05, + "loss": 0.2748, + "step": 3528 + }, + { + "epoch": 1.4351362342415617, + "grad_norm": 14.727591182691642, + "learning_rate": 1.9663158863165296e-05, + "loss": 0.6098, + "step": 3529 + }, + { + "epoch": 1.4355429036193574, + "grad_norm": 10.417390063340324, + "learning_rate": 1.9662898060557606e-05, + "loss": 0.2573, + "step": 3530 + }, + { + "epoch": 1.4359495729971532, + "grad_norm": 98.34714660500359, + "learning_rate": 1.96626371587554e-05, + "loss": 0.6437, + "step": 3531 + }, + { + "epoch": 1.4363562423749492, + "grad_norm": 4.3682654221772514, + "learning_rate": 1.966237615776136e-05, + "loss": 0.1074, + "step": 3532 + }, + { + "epoch": 1.436762911752745, + "grad_norm": 15.479115670062612, + "learning_rate": 1.966211505757817e-05, + "loss": 0.9293, + "step": 3533 + }, + { + "epoch": 1.437169581130541, + "grad_norm": 12.644126951473122, + "learning_rate": 1.9661853858208504e-05, + "loss": 0.4778, + "step": 3534 + }, + { + "epoch": 1.4375762505083367, + "grad_norm": 18.905204707157264, + "learning_rate": 1.9661592559655044e-05, + "loss": 0.423, + "step": 3535 + }, + { + "epoch": 1.4379829198861325, + "grad_norm": 6.117177158337989, + "learning_rate": 1.9661331161920474e-05, + "loss": 0.0738, + "step": 3536 + }, + { + "epoch": 1.4383895892639285, + "grad_norm": 10.366030026374206, + "learning_rate": 1.966106966500748e-05, + "loss": 0.2114, + "step": 3537 + }, + { + "epoch": 1.4387962586417242, + "grad_norm": 0.4407246082822749, + "learning_rate": 1.966080806891874e-05, + "loss": 0.0079, + "step": 3538 + }, + { + "epoch": 1.4392029280195202, + "grad_norm": 17.716226710936247, + "learning_rate": 1.9660546373656945e-05, + "loss": 0.4354, + "step": 3539 + }, + { + "epoch": 1.439609597397316, + "grad_norm": 12.661196625398297, + "learning_rate": 1.966028457922478e-05, + "loss": 0.6541, + "step": 3540 + }, + { + "epoch": 1.4400162667751117, + "grad_norm": 7.747356591738031, + "learning_rate": 1.966002268562493e-05, + "loss": 0.1975, + "step": 3541 + }, + { + "epoch": 1.4404229361529077, + "grad_norm": 15.804678305973503, + "learning_rate": 1.9659760692860084e-05, + "loss": 0.8039, + "step": 3542 + }, + { + 
"epoch": 1.4408296055307035, + "grad_norm": 15.231416971403231, + "learning_rate": 1.9659498600932937e-05, + "loss": 0.7168, + "step": 3543 + }, + { + "epoch": 1.4412362749084995, + "grad_norm": 11.651170225541307, + "learning_rate": 1.9659236409846173e-05, + "loss": 0.5737, + "step": 3544 + }, + { + "epoch": 1.4416429442862952, + "grad_norm": 4.671988820161474, + "learning_rate": 1.9658974119602487e-05, + "loss": 0.1531, + "step": 3545 + }, + { + "epoch": 1.442049613664091, + "grad_norm": 10.55682418008177, + "learning_rate": 1.9658711730204568e-05, + "loss": 0.3415, + "step": 3546 + }, + { + "epoch": 1.442456283041887, + "grad_norm": 11.26058249740373, + "learning_rate": 1.9658449241655114e-05, + "loss": 0.5377, + "step": 3547 + }, + { + "epoch": 1.4428629524196828, + "grad_norm": 4.772156153131741, + "learning_rate": 1.9658186653956818e-05, + "loss": 0.0942, + "step": 3548 + }, + { + "epoch": 1.4432696217974788, + "grad_norm": 11.745878338939704, + "learning_rate": 1.9657923967112373e-05, + "loss": 0.2102, + "step": 3549 + }, + { + "epoch": 1.4436762911752745, + "grad_norm": 14.288247251209446, + "learning_rate": 1.965766118112448e-05, + "loss": 0.7034, + "step": 3550 + }, + { + "epoch": 1.4440829605530703, + "grad_norm": 15.348000299657535, + "learning_rate": 1.9657398295995837e-05, + "loss": 0.5794, + "step": 3551 + }, + { + "epoch": 1.4444896299308663, + "grad_norm": 14.079824677915536, + "learning_rate": 1.9657135311729135e-05, + "loss": 0.3809, + "step": 3552 + }, + { + "epoch": 1.444896299308662, + "grad_norm": 8.228992677979976, + "learning_rate": 1.965687222832708e-05, + "loss": 0.3746, + "step": 3553 + }, + { + "epoch": 1.445302968686458, + "grad_norm": 21.47629150790854, + "learning_rate": 1.965660904579237e-05, + "loss": 0.4547, + "step": 3554 + }, + { + "epoch": 1.4457096380642538, + "grad_norm": 26.34839099654165, + "learning_rate": 1.965634576412771e-05, + "loss": 1.2884, + "step": 3555 + }, + { + "epoch": 1.4461163074420496, + "grad_norm": 7.550865772890974, + "learning_rate": 1.9656082383335803e-05, + "loss": 0.1899, + "step": 3556 + }, + { + "epoch": 1.4465229768198455, + "grad_norm": 29.231070882775768, + "learning_rate": 1.9655818903419343e-05, + "loss": 1.1248, + "step": 3557 + }, + { + "epoch": 1.4469296461976413, + "grad_norm": 0.2686022227408525, + "learning_rate": 1.965555532438105e-05, + "loss": 0.0042, + "step": 3558 + }, + { + "epoch": 1.4473363155754373, + "grad_norm": 20.402445244212576, + "learning_rate": 1.9655291646223618e-05, + "loss": 0.6559, + "step": 3559 + }, + { + "epoch": 1.447742984953233, + "grad_norm": 4.8131112729469105, + "learning_rate": 1.965502786894976e-05, + "loss": 0.2888, + "step": 3560 + }, + { + "epoch": 1.4481496543310288, + "grad_norm": 10.700948464668421, + "learning_rate": 1.9654763992562182e-05, + "loss": 0.4388, + "step": 3561 + }, + { + "epoch": 1.4485563237088246, + "grad_norm": 6.96961579950359, + "learning_rate": 1.965450001706359e-05, + "loss": 0.3213, + "step": 3562 + }, + { + "epoch": 1.4489629930866206, + "grad_norm": 22.220063285403814, + "learning_rate": 1.9654235942456696e-05, + "loss": 1.3368, + "step": 3563 + }, + { + "epoch": 1.4493696624644166, + "grad_norm": 1.3595885258941502, + "learning_rate": 1.9653971768744212e-05, + "loss": 0.0214, + "step": 3564 + }, + { + "epoch": 1.4497763318422123, + "grad_norm": 19.69987052029875, + "learning_rate": 1.9653707495928853e-05, + "loss": 1.1123, + "step": 3565 + }, + { + "epoch": 1.450183001220008, + "grad_norm": 14.034771961500686, + "learning_rate": 
1.9653443124013323e-05, + "loss": 0.5512, + "step": 3566 + }, + { + "epoch": 1.4505896705978039, + "grad_norm": 12.728431987034021, + "learning_rate": 1.9653178653000342e-05, + "loss": 0.4979, + "step": 3567 + }, + { + "epoch": 1.4509963399755998, + "grad_norm": 71.37243195193095, + "learning_rate": 1.9652914082892627e-05, + "loss": 0.631, + "step": 3568 + }, + { + "epoch": 1.4514030093533956, + "grad_norm": 18.46705012162955, + "learning_rate": 1.9652649413692887e-05, + "loss": 1.2512, + "step": 3569 + }, + { + "epoch": 1.4518096787311916, + "grad_norm": 11.305051374115441, + "learning_rate": 1.9652384645403843e-05, + "loss": 0.434, + "step": 3570 + }, + { + "epoch": 1.4522163481089874, + "grad_norm": 13.957227370557495, + "learning_rate": 1.9652119778028212e-05, + "loss": 0.3838, + "step": 3571 + }, + { + "epoch": 1.4526230174867831, + "grad_norm": 4.8967595365809045, + "learning_rate": 1.965185481156872e-05, + "loss": 0.1083, + "step": 3572 + }, + { + "epoch": 1.4530296868645791, + "grad_norm": 20.383868081509146, + "learning_rate": 1.9651589746028073e-05, + "loss": 0.2129, + "step": 3573 + }, + { + "epoch": 1.4534363562423749, + "grad_norm": 13.930466416725492, + "learning_rate": 1.9651324581409e-05, + "loss": 0.4834, + "step": 3574 + }, + { + "epoch": 1.4538430256201709, + "grad_norm": 2.3870698898181533, + "learning_rate": 1.965105931771423e-05, + "loss": 0.046, + "step": 3575 + }, + { + "epoch": 1.4542496949979666, + "grad_norm": 10.288902965053238, + "learning_rate": 1.9650793954946472e-05, + "loss": 0.266, + "step": 3576 + }, + { + "epoch": 1.4546563643757624, + "grad_norm": 9.896425773867714, + "learning_rate": 1.9650528493108458e-05, + "loss": 0.7945, + "step": 3577 + }, + { + "epoch": 1.4550630337535584, + "grad_norm": 13.385715105978102, + "learning_rate": 1.9650262932202913e-05, + "loss": 0.3752, + "step": 3578 + }, + { + "epoch": 1.4554697031313542, + "grad_norm": 36.89770125971716, + "learning_rate": 1.9649997272232562e-05, + "loss": 0.379, + "step": 3579 + }, + { + "epoch": 1.4558763725091501, + "grad_norm": 14.381745704097904, + "learning_rate": 1.9649731513200133e-05, + "loss": 0.5297, + "step": 3580 + }, + { + "epoch": 1.456283041886946, + "grad_norm": 6.95624139800417, + "learning_rate": 1.9649465655108353e-05, + "loss": 0.1859, + "step": 3581 + }, + { + "epoch": 1.4566897112647417, + "grad_norm": 4.645911481664897, + "learning_rate": 1.964919969795995e-05, + "loss": 0.1141, + "step": 3582 + }, + { + "epoch": 1.4570963806425377, + "grad_norm": 17.287812662262375, + "learning_rate": 1.9648933641757658e-05, + "loss": 0.4744, + "step": 3583 + }, + { + "epoch": 1.4575030500203334, + "grad_norm": 9.655783948137277, + "learning_rate": 1.964866748650421e-05, + "loss": 0.4022, + "step": 3584 + }, + { + "epoch": 1.4579097193981294, + "grad_norm": 16.288157106412502, + "learning_rate": 1.9648401232202325e-05, + "loss": 0.2459, + "step": 3585 + }, + { + "epoch": 1.4583163887759252, + "grad_norm": 17.198784602265523, + "learning_rate": 1.9648134878854747e-05, + "loss": 0.3919, + "step": 3586 + }, + { + "epoch": 1.458723058153721, + "grad_norm": 12.03508052288277, + "learning_rate": 1.9647868426464214e-05, + "loss": 0.4376, + "step": 3587 + }, + { + "epoch": 1.459129727531517, + "grad_norm": 5.981855161541288, + "learning_rate": 1.9647601875033452e-05, + "loss": 0.1083, + "step": 3588 + }, + { + "epoch": 1.4595363969093127, + "grad_norm": 8.708580493116978, + "learning_rate": 1.9647335224565202e-05, + "loss": 0.1806, + "step": 3589 + }, + { + "epoch": 1.4599430662871087, + 
"grad_norm": 2.2674691312189514, + "learning_rate": 1.9647068475062202e-05, + "loss": 0.0308, + "step": 3590 + }, + { + "epoch": 1.4603497356649044, + "grad_norm": 9.582635145765542, + "learning_rate": 1.964680162652719e-05, + "loss": 0.1958, + "step": 3591 + }, + { + "epoch": 1.4607564050427002, + "grad_norm": 19.801358810372914, + "learning_rate": 1.96465346789629e-05, + "loss": 0.5459, + "step": 3592 + }, + { + "epoch": 1.4611630744204962, + "grad_norm": 7.3212038227107366, + "learning_rate": 1.964626763237208e-05, + "loss": 0.1465, + "step": 3593 + }, + { + "epoch": 1.461569743798292, + "grad_norm": 7.009843837005462, + "learning_rate": 1.9646000486757466e-05, + "loss": 0.2834, + "step": 3594 + }, + { + "epoch": 1.461976413176088, + "grad_norm": 17.077571549308285, + "learning_rate": 1.96457332421218e-05, + "loss": 0.6616, + "step": 3595 + }, + { + "epoch": 1.4623830825538837, + "grad_norm": 6.8685747290811285, + "learning_rate": 1.964546589846783e-05, + "loss": 0.2224, + "step": 3596 + }, + { + "epoch": 1.4627897519316795, + "grad_norm": 35.11140924934436, + "learning_rate": 1.96451984557983e-05, + "loss": 0.1683, + "step": 3597 + }, + { + "epoch": 1.4631964213094755, + "grad_norm": 9.421264844562334, + "learning_rate": 1.9644930914115955e-05, + "loss": 0.4738, + "step": 3598 + }, + { + "epoch": 1.4636030906872712, + "grad_norm": 23.332050735411848, + "learning_rate": 1.9644663273423537e-05, + "loss": 0.3556, + "step": 3599 + }, + { + "epoch": 1.4640097600650672, + "grad_norm": 7.614840743640679, + "learning_rate": 1.9644395533723796e-05, + "loss": 0.362, + "step": 3600 + }, + { + "epoch": 1.464416429442863, + "grad_norm": 9.76807604376659, + "learning_rate": 1.964412769501948e-05, + "loss": 0.1907, + "step": 3601 + }, + { + "epoch": 1.4648230988206588, + "grad_norm": 9.60304071603229, + "learning_rate": 1.9643859757313342e-05, + "loss": 0.2557, + "step": 3602 + }, + { + "epoch": 1.4652297681984545, + "grad_norm": 7.833053849651771, + "learning_rate": 1.9643591720608128e-05, + "loss": 0.3836, + "step": 3603 + }, + { + "epoch": 1.4656364375762505, + "grad_norm": 6.587634740701024, + "learning_rate": 1.9643323584906593e-05, + "loss": 0.1334, + "step": 3604 + }, + { + "epoch": 1.4660431069540465, + "grad_norm": 8.544549110340382, + "learning_rate": 1.9643055350211483e-05, + "loss": 0.1846, + "step": 3605 + }, + { + "epoch": 1.4664497763318423, + "grad_norm": 11.171492217290341, + "learning_rate": 1.9642787016525564e-05, + "loss": 0.4073, + "step": 3606 + }, + { + "epoch": 1.466856445709638, + "grad_norm": 13.026817855807828, + "learning_rate": 1.9642518583851577e-05, + "loss": 0.4517, + "step": 3607 + }, + { + "epoch": 1.4672631150874338, + "grad_norm": 5.049283458290324, + "learning_rate": 1.964225005219229e-05, + "loss": 0.1024, + "step": 3608 + }, + { + "epoch": 1.4676697844652298, + "grad_norm": 13.239285414719657, + "learning_rate": 1.964198142155045e-05, + "loss": 0.8056, + "step": 3609 + }, + { + "epoch": 1.4680764538430255, + "grad_norm": 10.192967866723723, + "learning_rate": 1.964171269192882e-05, + "loss": 0.5496, + "step": 3610 + }, + { + "epoch": 1.4684831232208215, + "grad_norm": 6.519859812191193, + "learning_rate": 1.964144386333015e-05, + "loss": 0.1344, + "step": 3611 + }, + { + "epoch": 1.4688897925986173, + "grad_norm": 2.961592813578748, + "learning_rate": 1.9641174935757212e-05, + "loss": 0.0327, + "step": 3612 + }, + { + "epoch": 1.469296461976413, + "grad_norm": 2.9581747193909487, + "learning_rate": 1.9640905909212762e-05, + "loss": 0.0571, + "step": 3613 + }, + 
{ + "epoch": 1.469703131354209, + "grad_norm": 10.81328450820187, + "learning_rate": 1.9640636783699557e-05, + "loss": 0.2004, + "step": 3614 + }, + { + "epoch": 1.4701098007320048, + "grad_norm": 9.20282339446113, + "learning_rate": 1.9640367559220366e-05, + "loss": 0.3005, + "step": 3615 + }, + { + "epoch": 1.4705164701098008, + "grad_norm": 9.74956937730741, + "learning_rate": 1.964009823577795e-05, + "loss": 0.1604, + "step": 3616 + }, + { + "epoch": 1.4709231394875966, + "grad_norm": 6.645511404018286, + "learning_rate": 1.963982881337507e-05, + "loss": 0.3142, + "step": 3617 + }, + { + "epoch": 1.4713298088653923, + "grad_norm": 30.409956528749536, + "learning_rate": 1.9639559292014502e-05, + "loss": 1.4133, + "step": 3618 + }, + { + "epoch": 1.4717364782431883, + "grad_norm": 11.00911589424301, + "learning_rate": 1.9639289671699e-05, + "loss": 0.2462, + "step": 3619 + }, + { + "epoch": 1.472143147620984, + "grad_norm": 15.274794534364291, + "learning_rate": 1.9639019952431342e-05, + "loss": 0.8098, + "step": 3620 + }, + { + "epoch": 1.47254981699878, + "grad_norm": 18.11898857719869, + "learning_rate": 1.9638750134214294e-05, + "loss": 0.4834, + "step": 3621 + }, + { + "epoch": 1.4729564863765758, + "grad_norm": 30.949717484788604, + "learning_rate": 1.963848021705062e-05, + "loss": 0.1177, + "step": 3622 + }, + { + "epoch": 1.4733631557543716, + "grad_norm": 14.070004673756587, + "learning_rate": 1.96382102009431e-05, + "loss": 0.5889, + "step": 3623 + }, + { + "epoch": 1.4737698251321676, + "grad_norm": 4.783953961404828, + "learning_rate": 1.96379400858945e-05, + "loss": 0.2557, + "step": 3624 + }, + { + "epoch": 1.4741764945099634, + "grad_norm": 33.859026395475716, + "learning_rate": 1.9637669871907593e-05, + "loss": 1.3103, + "step": 3625 + }, + { + "epoch": 1.4745831638877593, + "grad_norm": 16.78554586798816, + "learning_rate": 1.9637399558985157e-05, + "loss": 0.558, + "step": 3626 + }, + { + "epoch": 1.474989833265555, + "grad_norm": 13.420311239161494, + "learning_rate": 1.9637129147129963e-05, + "loss": 0.4604, + "step": 3627 + }, + { + "epoch": 1.4753965026433509, + "grad_norm": 25.95800882741812, + "learning_rate": 1.9636858636344788e-05, + "loss": 1.5037, + "step": 3628 + }, + { + "epoch": 1.4758031720211469, + "grad_norm": 0.902851050627074, + "learning_rate": 1.9636588026632408e-05, + "loss": 0.0127, + "step": 3629 + }, + { + "epoch": 1.4762098413989426, + "grad_norm": 12.650191813005534, + "learning_rate": 1.9636317317995603e-05, + "loss": 0.4713, + "step": 3630 + }, + { + "epoch": 1.4766165107767386, + "grad_norm": 10.806247772726913, + "learning_rate": 1.963604651043715e-05, + "loss": 0.5222, + "step": 3631 + }, + { + "epoch": 1.4770231801545344, + "grad_norm": 13.776450736877582, + "learning_rate": 1.963577560395983e-05, + "loss": 0.4145, + "step": 3632 + }, + { + "epoch": 1.4774298495323301, + "grad_norm": 11.874330736424433, + "learning_rate": 1.9635504598566423e-05, + "loss": 0.3205, + "step": 3633 + }, + { + "epoch": 1.4778365189101261, + "grad_norm": 9.043547007484538, + "learning_rate": 1.9635233494259715e-05, + "loss": 0.238, + "step": 3634 + }, + { + "epoch": 1.478243188287922, + "grad_norm": 14.723239997426582, + "learning_rate": 1.9634962291042483e-05, + "loss": 0.3516, + "step": 3635 + }, + { + "epoch": 1.4786498576657179, + "grad_norm": 6.389931927360664, + "learning_rate": 1.9634690988917515e-05, + "loss": 0.0935, + "step": 3636 + }, + { + "epoch": 1.4790565270435136, + "grad_norm": 18.469968804373494, + "learning_rate": 1.9634419587887596e-05, + 
"loss": 0.4151, + "step": 3637 + }, + { + "epoch": 1.4794631964213094, + "grad_norm": 19.38081886102647, + "learning_rate": 1.963414808795551e-05, + "loss": 1.1179, + "step": 3638 + }, + { + "epoch": 1.4798698657991054, + "grad_norm": 12.90588623653018, + "learning_rate": 1.9633876489124044e-05, + "loss": 0.8977, + "step": 3639 + }, + { + "epoch": 1.4802765351769012, + "grad_norm": 23.775390917612633, + "learning_rate": 1.9633604791395987e-05, + "loss": 1.4559, + "step": 3640 + }, + { + "epoch": 1.4806832045546972, + "grad_norm": 14.599090594448624, + "learning_rate": 1.963333299477413e-05, + "loss": 0.6457, + "step": 3641 + }, + { + "epoch": 1.481089873932493, + "grad_norm": 14.035783411023349, + "learning_rate": 1.963306109926126e-05, + "loss": 0.5236, + "step": 3642 + }, + { + "epoch": 1.4814965433102887, + "grad_norm": 16.704186702846933, + "learning_rate": 1.963278910486017e-05, + "loss": 0.679, + "step": 3643 + }, + { + "epoch": 1.4819032126880847, + "grad_norm": 11.081218020861774, + "learning_rate": 1.9632517011573654e-05, + "loss": 0.4088, + "step": 3644 + }, + { + "epoch": 1.4823098820658804, + "grad_norm": 10.759709226567278, + "learning_rate": 1.9632244819404503e-05, + "loss": 0.5147, + "step": 3645 + }, + { + "epoch": 1.4827165514436764, + "grad_norm": 11.39114246203606, + "learning_rate": 1.963197252835551e-05, + "loss": 0.545, + "step": 3646 + }, + { + "epoch": 1.4831232208214722, + "grad_norm": 10.529115444820048, + "learning_rate": 1.9631700138429467e-05, + "loss": 0.619, + "step": 3647 + }, + { + "epoch": 1.483529890199268, + "grad_norm": 9.524283231037888, + "learning_rate": 1.963142764962918e-05, + "loss": 0.4054, + "step": 3648 + }, + { + "epoch": 1.4839365595770637, + "grad_norm": 8.05850449104451, + "learning_rate": 1.9631155061957437e-05, + "loss": 0.112, + "step": 3649 + }, + { + "epoch": 1.4843432289548597, + "grad_norm": 6.162183926839055, + "learning_rate": 1.9630882375417043e-05, + "loss": 0.2334, + "step": 3650 + }, + { + "epoch": 1.4847498983326555, + "grad_norm": 2.8383238123618337, + "learning_rate": 1.9630609590010794e-05, + "loss": 0.0476, + "step": 3651 + }, + { + "epoch": 1.4851565677104515, + "grad_norm": 6.8618111172168925, + "learning_rate": 1.963033670574149e-05, + "loss": 0.1976, + "step": 3652 + }, + { + "epoch": 1.4855632370882472, + "grad_norm": 6.60764665148012, + "learning_rate": 1.963006372261193e-05, + "loss": 0.149, + "step": 3653 + }, + { + "epoch": 1.485969906466043, + "grad_norm": 6.554868355220557, + "learning_rate": 1.9629790640624925e-05, + "loss": 0.1362, + "step": 3654 + }, + { + "epoch": 1.486376575843839, + "grad_norm": 6.6733980988173, + "learning_rate": 1.962951745978327e-05, + "loss": 0.1232, + "step": 3655 + }, + { + "epoch": 1.4867832452216347, + "grad_norm": 10.212745412508026, + "learning_rate": 1.962924418008977e-05, + "loss": 0.2276, + "step": 3656 + }, + { + "epoch": 1.4871899145994307, + "grad_norm": 14.445266827618909, + "learning_rate": 1.9628970801547233e-05, + "loss": 0.9323, + "step": 3657 + }, + { + "epoch": 1.4875965839772265, + "grad_norm": 8.643682599447876, + "learning_rate": 1.9628697324158464e-05, + "loss": 0.324, + "step": 3658 + }, + { + "epoch": 1.4880032533550223, + "grad_norm": 18.858979976771035, + "learning_rate": 1.9628423747926274e-05, + "loss": 0.7158, + "step": 3659 + }, + { + "epoch": 1.4884099227328182, + "grad_norm": 10.781558658450608, + "learning_rate": 1.9628150072853466e-05, + "loss": 0.4413, + "step": 3660 + }, + { + "epoch": 1.488816592110614, + "grad_norm": 8.075678036500618, + 
"learning_rate": 1.962787629894285e-05, + "loss": 0.1701, + "step": 3661 + }, + { + "epoch": 1.48922326148841, + "grad_norm": 14.9406200640913, + "learning_rate": 1.9627602426197242e-05, + "loss": 0.3265, + "step": 3662 + }, + { + "epoch": 1.4896299308662058, + "grad_norm": 7.88303847495284, + "learning_rate": 1.9627328454619446e-05, + "loss": 0.1923, + "step": 3663 + }, + { + "epoch": 1.4900366002440015, + "grad_norm": 11.091496304433045, + "learning_rate": 1.962705438421228e-05, + "loss": 0.2603, + "step": 3664 + }, + { + "epoch": 1.4904432696217975, + "grad_norm": 5.144298712763396, + "learning_rate": 1.9626780214978556e-05, + "loss": 0.0865, + "step": 3665 + }, + { + "epoch": 1.4908499389995933, + "grad_norm": 10.804726764779312, + "learning_rate": 1.962650594692109e-05, + "loss": 0.2436, + "step": 3666 + }, + { + "epoch": 1.4912566083773893, + "grad_norm": 11.684069561141618, + "learning_rate": 1.962623158004269e-05, + "loss": 0.3874, + "step": 3667 + }, + { + "epoch": 1.491663277755185, + "grad_norm": 1.155148787307514, + "learning_rate": 1.9625957114346182e-05, + "loss": 0.0227, + "step": 3668 + }, + { + "epoch": 1.4920699471329808, + "grad_norm": 11.697762911277556, + "learning_rate": 1.962568254983438e-05, + "loss": 0.7804, + "step": 3669 + }, + { + "epoch": 1.4924766165107768, + "grad_norm": 8.148829031047695, + "learning_rate": 1.9625407886510103e-05, + "loss": 0.1474, + "step": 3670 + }, + { + "epoch": 1.4928832858885726, + "grad_norm": 9.745137080606046, + "learning_rate": 1.9625133124376166e-05, + "loss": 0.224, + "step": 3671 + }, + { + "epoch": 1.4932899552663685, + "grad_norm": 8.929111062892016, + "learning_rate": 1.9624858263435393e-05, + "loss": 0.2001, + "step": 3672 + }, + { + "epoch": 1.4936966246441643, + "grad_norm": 11.990498716811812, + "learning_rate": 1.962458330369061e-05, + "loss": 0.4376, + "step": 3673 + }, + { + "epoch": 1.49410329402196, + "grad_norm": 15.439164011188863, + "learning_rate": 1.9624308245144633e-05, + "loss": 0.9717, + "step": 3674 + }, + { + "epoch": 1.494509963399756, + "grad_norm": 21.43562658916415, + "learning_rate": 1.962403308780029e-05, + "loss": 0.3475, + "step": 3675 + }, + { + "epoch": 1.4949166327775518, + "grad_norm": 11.567837004804353, + "learning_rate": 1.96237578316604e-05, + "loss": 0.2413, + "step": 3676 + }, + { + "epoch": 1.4953233021553478, + "grad_norm": 9.684562917458228, + "learning_rate": 1.962348247672779e-05, + "loss": 0.3657, + "step": 3677 + }, + { + "epoch": 1.4957299715331436, + "grad_norm": 16.49965338541587, + "learning_rate": 1.9623207023005294e-05, + "loss": 0.7231, + "step": 3678 + }, + { + "epoch": 1.4961366409109393, + "grad_norm": 13.384565022042558, + "learning_rate": 1.9622931470495733e-05, + "loss": 0.5016, + "step": 3679 + }, + { + "epoch": 1.4965433102887353, + "grad_norm": 13.747130215880087, + "learning_rate": 1.962265581920194e-05, + "loss": 0.3613, + "step": 3680 + }, + { + "epoch": 1.496949979666531, + "grad_norm": 14.816279269570225, + "learning_rate": 1.9622380069126737e-05, + "loss": 0.6413, + "step": 3681 + }, + { + "epoch": 1.497356649044327, + "grad_norm": 5.182858698298561, + "learning_rate": 1.9622104220272965e-05, + "loss": 0.1141, + "step": 3682 + }, + { + "epoch": 1.4977633184221228, + "grad_norm": 13.426583488387577, + "learning_rate": 1.9621828272643452e-05, + "loss": 0.5893, + "step": 3683 + }, + { + "epoch": 1.4981699877999186, + "grad_norm": 21.78925622346675, + "learning_rate": 1.9621552226241024e-05, + "loss": 1.1718, + "step": 3684 + }, + { + "epoch": 1.4985766571777146, + 
"grad_norm": 11.248407327522026, + "learning_rate": 1.962127608106852e-05, + "loss": 0.3464, + "step": 3685 + }, + { + "epoch": 1.4989833265555104, + "grad_norm": 17.68421818325081, + "learning_rate": 1.962099983712878e-05, + "loss": 0.492, + "step": 3686 + }, + { + "epoch": 1.4993899959333064, + "grad_norm": 7.277532308530619, + "learning_rate": 1.9620723494424627e-05, + "loss": 0.176, + "step": 3687 + }, + { + "epoch": 1.4997966653111021, + "grad_norm": 9.277143017805766, + "learning_rate": 1.962044705295891e-05, + "loss": 0.255, + "step": 3688 + }, + { + "epoch": 1.5002033346888979, + "grad_norm": 10.864918466700882, + "learning_rate": 1.962017051273446e-05, + "loss": 0.4028, + "step": 3689 + }, + { + "epoch": 1.5006100040666936, + "grad_norm": 9.577016911419326, + "learning_rate": 1.961989387375412e-05, + "loss": 0.3114, + "step": 3690 + }, + { + "epoch": 1.5010166734444896, + "grad_norm": 7.117381419663248, + "learning_rate": 1.961961713602073e-05, + "loss": 0.125, + "step": 3691 + }, + { + "epoch": 1.5014233428222856, + "grad_norm": 9.395240333174408, + "learning_rate": 1.961934029953712e-05, + "loss": 0.302, + "step": 3692 + }, + { + "epoch": 1.5018300122000814, + "grad_norm": 7.420677564156291, + "learning_rate": 1.961906336430615e-05, + "loss": 0.5037, + "step": 3693 + }, + { + "epoch": 1.5022366815778772, + "grad_norm": 6.864218648566655, + "learning_rate": 1.9618786330330646e-05, + "loss": 0.147, + "step": 3694 + }, + { + "epoch": 1.502643350955673, + "grad_norm": 8.766882769423887, + "learning_rate": 1.9618509197613464e-05, + "loss": 0.183, + "step": 3695 + }, + { + "epoch": 1.503050020333469, + "grad_norm": 5.523341379413048, + "learning_rate": 1.9618231966157445e-05, + "loss": 0.1623, + "step": 3696 + }, + { + "epoch": 1.503456689711265, + "grad_norm": 10.598175559166528, + "learning_rate": 1.961795463596543e-05, + "loss": 0.2329, + "step": 3697 + }, + { + "epoch": 1.5038633590890607, + "grad_norm": 8.662386850901065, + "learning_rate": 1.9617677207040274e-05, + "loss": 0.3033, + "step": 3698 + }, + { + "epoch": 1.5042700284668564, + "grad_norm": 15.683864923700515, + "learning_rate": 1.9617399679384815e-05, + "loss": 1.0509, + "step": 3699 + }, + { + "epoch": 1.5046766978446522, + "grad_norm": 11.815545097214335, + "learning_rate": 1.9617122053001917e-05, + "loss": 0.372, + "step": 3700 + }, + { + "epoch": 1.5050833672224482, + "grad_norm": 11.712763306043597, + "learning_rate": 1.9616844327894415e-05, + "loss": 0.3474, + "step": 3701 + }, + { + "epoch": 1.5054900366002442, + "grad_norm": 27.90660827327707, + "learning_rate": 1.9616566504065165e-05, + "loss": 1.0467, + "step": 3702 + }, + { + "epoch": 1.50589670597804, + "grad_norm": 8.037007702651344, + "learning_rate": 1.9616288581517024e-05, + "loss": 0.2278, + "step": 3703 + }, + { + "epoch": 1.5063033753558357, + "grad_norm": 9.51558271217228, + "learning_rate": 1.9616010560252837e-05, + "loss": 0.3317, + "step": 3704 + }, + { + "epoch": 1.5067100447336315, + "grad_norm": 13.239763091827516, + "learning_rate": 1.9615732440275464e-05, + "loss": 0.4725, + "step": 3705 + }, + { + "epoch": 1.5071167141114274, + "grad_norm": 14.737114944270402, + "learning_rate": 1.961545422158776e-05, + "loss": 0.6207, + "step": 3706 + }, + { + "epoch": 1.5075233834892232, + "grad_norm": 7.797203812615873, + "learning_rate": 1.9615175904192573e-05, + "loss": 0.4261, + "step": 3707 + }, + { + "epoch": 1.5079300528670192, + "grad_norm": 3.4911693200989564, + "learning_rate": 1.961489748809277e-05, + "loss": 0.0697, + "step": 3708 + }, + { + 
"epoch": 1.508336722244815, + "grad_norm": 6.023954212009039, + "learning_rate": 1.9614618973291207e-05, + "loss": 0.0676, + "step": 3709 + }, + { + "epoch": 1.5087433916226107, + "grad_norm": 19.912122595662456, + "learning_rate": 1.9614340359790738e-05, + "loss": 0.4942, + "step": 3710 + }, + { + "epoch": 1.5091500610004067, + "grad_norm": 5.615927420223838, + "learning_rate": 1.9614061647594228e-05, + "loss": 0.1378, + "step": 3711 + }, + { + "epoch": 1.5095567303782025, + "grad_norm": 7.919586871525617, + "learning_rate": 1.961378283670454e-05, + "loss": 0.3266, + "step": 3712 + }, + { + "epoch": 1.5099633997559985, + "grad_norm": 0.33344006415708294, + "learning_rate": 1.9613503927124526e-05, + "loss": 0.0044, + "step": 3713 + }, + { + "epoch": 1.5103700691337942, + "grad_norm": 4.559504885908413, + "learning_rate": 1.9613224918857058e-05, + "loss": 0.1902, + "step": 3714 + }, + { + "epoch": 1.51077673851159, + "grad_norm": 13.780422344785993, + "learning_rate": 1.9612945811905e-05, + "loss": 0.623, + "step": 3715 + }, + { + "epoch": 1.5111834078893858, + "grad_norm": 4.369143681094201, + "learning_rate": 1.9612666606271215e-05, + "loss": 0.059, + "step": 3716 + }, + { + "epoch": 1.5115900772671818, + "grad_norm": 14.745427458912141, + "learning_rate": 1.9612387301958565e-05, + "loss": 0.4521, + "step": 3717 + }, + { + "epoch": 1.5119967466449777, + "grad_norm": 17.336658684117342, + "learning_rate": 1.961210789896993e-05, + "loss": 0.915, + "step": 3718 + }, + { + "epoch": 1.5124034160227735, + "grad_norm": 42.76063356828525, + "learning_rate": 1.961182839730816e-05, + "loss": 0.4556, + "step": 3719 + }, + { + "epoch": 1.5128100854005693, + "grad_norm": 20.933192191959037, + "learning_rate": 1.961154879697614e-05, + "loss": 0.9216, + "step": 3720 + }, + { + "epoch": 1.513216754778365, + "grad_norm": 11.764545426809702, + "learning_rate": 1.9611269097976733e-05, + "loss": 0.272, + "step": 3721 + }, + { + "epoch": 1.513623424156161, + "grad_norm": 13.853583545473274, + "learning_rate": 1.9610989300312813e-05, + "loss": 0.4657, + "step": 3722 + }, + { + "epoch": 1.514030093533957, + "grad_norm": 4.890282400669649, + "learning_rate": 1.9610709403987248e-05, + "loss": 0.0602, + "step": 3723 + }, + { + "epoch": 1.5144367629117528, + "grad_norm": 5.406856330079639, + "learning_rate": 1.9610429409002917e-05, + "loss": 0.3866, + "step": 3724 + }, + { + "epoch": 1.5148434322895485, + "grad_norm": 0.35369417459387026, + "learning_rate": 1.9610149315362687e-05, + "loss": 0.0054, + "step": 3725 + }, + { + "epoch": 1.5152501016673443, + "grad_norm": 21.751254953293188, + "learning_rate": 1.960986912306944e-05, + "loss": 0.4757, + "step": 3726 + }, + { + "epoch": 1.5156567710451403, + "grad_norm": 14.991738276388352, + "learning_rate": 1.9609588832126048e-05, + "loss": 0.6194, + "step": 3727 + }, + { + "epoch": 1.5160634404229363, + "grad_norm": 10.069676811150407, + "learning_rate": 1.9609308442535393e-05, + "loss": 0.2208, + "step": 3728 + }, + { + "epoch": 1.516470109800732, + "grad_norm": 10.831989990631945, + "learning_rate": 1.9609027954300348e-05, + "loss": 0.4979, + "step": 3729 + }, + { + "epoch": 1.5168767791785278, + "grad_norm": 8.884284877669076, + "learning_rate": 1.9608747367423796e-05, + "loss": 0.3062, + "step": 3730 + }, + { + "epoch": 1.5172834485563236, + "grad_norm": 3.354894704941168, + "learning_rate": 1.960846668190862e-05, + "loss": 0.0294, + "step": 3731 + }, + { + "epoch": 1.5176901179341196, + "grad_norm": 5.5751446313453785, + "learning_rate": 1.9608185897757694e-05, + 
"loss": 0.1311, + "step": 3732 + }, + { + "epoch": 1.5180967873119156, + "grad_norm": 9.04316261090916, + "learning_rate": 1.9607905014973902e-05, + "loss": 0.3227, + "step": 3733 + }, + { + "epoch": 1.5185034566897113, + "grad_norm": 2.6907874422062066, + "learning_rate": 1.9607624033560134e-05, + "loss": 0.0435, + "step": 3734 + }, + { + "epoch": 1.518910126067507, + "grad_norm": 5.092783156110921, + "learning_rate": 1.9607342953519266e-05, + "loss": 0.102, + "step": 3735 + }, + { + "epoch": 1.5193167954453028, + "grad_norm": 7.201148345692586, + "learning_rate": 1.960706177485419e-05, + "loss": 0.4112, + "step": 3736 + }, + { + "epoch": 1.5197234648230988, + "grad_norm": 22.282687016462745, + "learning_rate": 1.960678049756779e-05, + "loss": 0.8535, + "step": 3737 + }, + { + "epoch": 1.5201301342008948, + "grad_norm": 13.618998293527477, + "learning_rate": 1.960649912166295e-05, + "loss": 0.7218, + "step": 3738 + }, + { + "epoch": 1.5205368035786906, + "grad_norm": 14.172112392074393, + "learning_rate": 1.9606217647142566e-05, + "loss": 0.3777, + "step": 3739 + }, + { + "epoch": 1.5209434729564864, + "grad_norm": 28.864371167522503, + "learning_rate": 1.960593607400952e-05, + "loss": 0.3317, + "step": 3740 + }, + { + "epoch": 1.5213501423342821, + "grad_norm": 0.10076484902535907, + "learning_rate": 1.9605654402266706e-05, + "loss": 0.0014, + "step": 3741 + }, + { + "epoch": 1.521756811712078, + "grad_norm": 8.670231058799839, + "learning_rate": 1.9605372631917015e-05, + "loss": 0.3106, + "step": 3742 + }, + { + "epoch": 1.522163481089874, + "grad_norm": 15.310723895711302, + "learning_rate": 1.960509076296334e-05, + "loss": 0.7193, + "step": 3743 + }, + { + "epoch": 1.5225701504676699, + "grad_norm": 0.24172592246425398, + "learning_rate": 1.9604808795408577e-05, + "loss": 0.0055, + "step": 3744 + }, + { + "epoch": 1.5229768198454656, + "grad_norm": 19.73550334511328, + "learning_rate": 1.9604526729255614e-05, + "loss": 1.0187, + "step": 3745 + }, + { + "epoch": 1.5233834892232614, + "grad_norm": 16.289920380754996, + "learning_rate": 1.9604244564507352e-05, + "loss": 0.8138, + "step": 3746 + }, + { + "epoch": 1.5237901586010574, + "grad_norm": 11.1379471877072, + "learning_rate": 1.9603962301166684e-05, + "loss": 0.5381, + "step": 3747 + }, + { + "epoch": 1.5241968279788531, + "grad_norm": 27.30799294685123, + "learning_rate": 1.960367993923651e-05, + "loss": 0.2073, + "step": 3748 + }, + { + "epoch": 1.5246034973566491, + "grad_norm": 16.527957363792265, + "learning_rate": 1.960339747871973e-05, + "loss": 1.1316, + "step": 3749 + }, + { + "epoch": 1.525010166734445, + "grad_norm": 9.345412705282769, + "learning_rate": 1.9603114919619243e-05, + "loss": 0.2543, + "step": 3750 + }, + { + "epoch": 1.5254168361122407, + "grad_norm": 15.563723967000916, + "learning_rate": 1.960283226193794e-05, + "loss": 0.5199, + "step": 3751 + }, + { + "epoch": 1.5258235054900366, + "grad_norm": 23.098344482614426, + "learning_rate": 1.960254950567874e-05, + "loss": 0.9006, + "step": 3752 + }, + { + "epoch": 1.5262301748678324, + "grad_norm": 14.583876740188837, + "learning_rate": 1.9602266650844532e-05, + "loss": 0.8389, + "step": 3753 + }, + { + "epoch": 1.5266368442456284, + "grad_norm": 13.666306293648974, + "learning_rate": 1.9601983697438225e-05, + "loss": 1.1282, + "step": 3754 + }, + { + "epoch": 1.5270435136234242, + "grad_norm": 11.004242712948843, + "learning_rate": 1.9601700645462724e-05, + "loss": 0.5156, + "step": 3755 + }, + { + "epoch": 1.52745018300122, + "grad_norm": 13.994332352487502, 
+ "learning_rate": 1.9601417494920933e-05, + "loss": 0.2458, + "step": 3756 + }, + { + "epoch": 1.5278568523790157, + "grad_norm": 8.327425483961925, + "learning_rate": 1.9601134245815755e-05, + "loss": 0.3015, + "step": 3757 + }, + { + "epoch": 1.5282635217568117, + "grad_norm": 12.028185399231017, + "learning_rate": 1.9600850898150105e-05, + "loss": 0.2742, + "step": 3758 + }, + { + "epoch": 1.5286701911346077, + "grad_norm": 13.296705581998326, + "learning_rate": 1.960056745192689e-05, + "loss": 0.4961, + "step": 3759 + }, + { + "epoch": 1.5290768605124034, + "grad_norm": 7.494181371312562, + "learning_rate": 1.9600283907149016e-05, + "loss": 0.3557, + "step": 3760 + }, + { + "epoch": 1.5294835298901992, + "grad_norm": 8.492860357773266, + "learning_rate": 1.9600000263819398e-05, + "loss": 0.4096, + "step": 3761 + }, + { + "epoch": 1.529890199267995, + "grad_norm": 6.841875652072536, + "learning_rate": 1.9599716521940945e-05, + "loss": 0.2137, + "step": 3762 + }, + { + "epoch": 1.530296868645791, + "grad_norm": 11.396398495057994, + "learning_rate": 1.959943268151657e-05, + "loss": 0.3886, + "step": 3763 + }, + { + "epoch": 1.530703538023587, + "grad_norm": 22.910999576237618, + "learning_rate": 1.9599148742549185e-05, + "loss": 1.2874, + "step": 3764 + }, + { + "epoch": 1.5311102074013827, + "grad_norm": 15.319282106086218, + "learning_rate": 1.959886470504171e-05, + "loss": 0.6905, + "step": 3765 + }, + { + "epoch": 1.5315168767791785, + "grad_norm": 15.324941572503645, + "learning_rate": 1.9598580568997057e-05, + "loss": 0.4564, + "step": 3766 + }, + { + "epoch": 1.5319235461569742, + "grad_norm": 5.428938685784274, + "learning_rate": 1.9598296334418143e-05, + "loss": 0.1646, + "step": 3767 + }, + { + "epoch": 1.5323302155347702, + "grad_norm": 17.767206166474978, + "learning_rate": 1.9598012001307887e-05, + "loss": 0.4765, + "step": 3768 + }, + { + "epoch": 1.5327368849125662, + "grad_norm": 13.520587814138452, + "learning_rate": 1.9597727569669208e-05, + "loss": 0.5342, + "step": 3769 + }, + { + "epoch": 1.533143554290362, + "grad_norm": 11.897146808861242, + "learning_rate": 1.9597443039505024e-05, + "loss": 0.2791, + "step": 3770 + }, + { + "epoch": 1.5335502236681577, + "grad_norm": 14.483744698480747, + "learning_rate": 1.9597158410818258e-05, + "loss": 0.6697, + "step": 3771 + }, + { + "epoch": 1.5339568930459535, + "grad_norm": 9.545121052374101, + "learning_rate": 1.9596873683611826e-05, + "loss": 0.2365, + "step": 3772 + }, + { + "epoch": 1.5343635624237495, + "grad_norm": 13.909741489691346, + "learning_rate": 1.9596588857888662e-05, + "loss": 0.3795, + "step": 3773 + }, + { + "epoch": 1.5347702318015455, + "grad_norm": 6.623108575525822, + "learning_rate": 1.959630393365168e-05, + "loss": 0.1606, + "step": 3774 + }, + { + "epoch": 1.5351769011793412, + "grad_norm": 10.592380886729396, + "learning_rate": 1.959601891090381e-05, + "loss": 0.3248, + "step": 3775 + }, + { + "epoch": 1.535583570557137, + "grad_norm": 7.814125956687987, + "learning_rate": 1.9595733789647976e-05, + "loss": 0.1933, + "step": 3776 + }, + { + "epoch": 1.5359902399349328, + "grad_norm": 15.155913740476123, + "learning_rate": 1.9595448569887104e-05, + "loss": 0.3225, + "step": 3777 + }, + { + "epoch": 1.5363969093127288, + "grad_norm": 23.240077573870987, + "learning_rate": 1.9595163251624124e-05, + "loss": 1.5225, + "step": 3778 + }, + { + "epoch": 1.5368035786905248, + "grad_norm": 21.26905412536164, + "learning_rate": 1.9594877834861966e-05, + "loss": 0.6751, + "step": 3779 + }, + { + "epoch": 
1.5372102480683205, + "grad_norm": 8.81980800037823, + "learning_rate": 1.9594592319603554e-05, + "loss": 0.4432, + "step": 3780 + }, + { + "epoch": 1.5376169174461163, + "grad_norm": 8.647723517855015, + "learning_rate": 1.959430670585183e-05, + "loss": 0.3846, + "step": 3781 + }, + { + "epoch": 1.538023586823912, + "grad_norm": 4.292003420065371, + "learning_rate": 1.959402099360971e-05, + "loss": 0.0818, + "step": 3782 + }, + { + "epoch": 1.538430256201708, + "grad_norm": 11.325334144738674, + "learning_rate": 1.9593735182880144e-05, + "loss": 0.3504, + "step": 3783 + }, + { + "epoch": 1.538836925579504, + "grad_norm": 9.947064299444827, + "learning_rate": 1.959344927366605e-05, + "loss": 0.4522, + "step": 3784 + }, + { + "epoch": 1.5392435949572998, + "grad_norm": 5.376182636580343, + "learning_rate": 1.9593163265970378e-05, + "loss": 0.123, + "step": 3785 + }, + { + "epoch": 1.5396502643350956, + "grad_norm": 2.8054608749621974, + "learning_rate": 1.9592877159796052e-05, + "loss": 0.0445, + "step": 3786 + }, + { + "epoch": 1.5400569337128913, + "grad_norm": 5.377981929429414, + "learning_rate": 1.9592590955146016e-05, + "loss": 0.0664, + "step": 3787 + }, + { + "epoch": 1.5404636030906873, + "grad_norm": 7.943769870829527, + "learning_rate": 1.9592304652023208e-05, + "loss": 0.1796, + "step": 3788 + }, + { + "epoch": 1.540870272468483, + "grad_norm": 2.439839550849401, + "learning_rate": 1.959201825043056e-05, + "loss": 0.0398, + "step": 3789 + }, + { + "epoch": 1.541276941846279, + "grad_norm": 11.709443869256518, + "learning_rate": 1.959173175037102e-05, + "loss": 0.3864, + "step": 3790 + }, + { + "epoch": 1.5416836112240748, + "grad_norm": 14.161498023950017, + "learning_rate": 1.9591445151847525e-05, + "loss": 0.5846, + "step": 3791 + }, + { + "epoch": 1.5420902806018706, + "grad_norm": 14.079199592793854, + "learning_rate": 1.9591158454863023e-05, + "loss": 0.7067, + "step": 3792 + }, + { + "epoch": 1.5424969499796666, + "grad_norm": 12.478089132447062, + "learning_rate": 1.9590871659420448e-05, + "loss": 0.4268, + "step": 3793 + }, + { + "epoch": 1.5429036193574623, + "grad_norm": 7.781788404671553, + "learning_rate": 1.9590584765522748e-05, + "loss": 0.1294, + "step": 3794 + }, + { + "epoch": 1.5433102887352583, + "grad_norm": 4.444321622565611, + "learning_rate": 1.959029777317287e-05, + "loss": 0.136, + "step": 3795 + }, + { + "epoch": 1.543716958113054, + "grad_norm": 11.114258018083913, + "learning_rate": 1.959001068237376e-05, + "loss": 0.5196, + "step": 3796 + }, + { + "epoch": 1.5441236274908499, + "grad_norm": 17.765346246705274, + "learning_rate": 1.958972349312836e-05, + "loss": 0.6643, + "step": 3797 + }, + { + "epoch": 1.5445302968686456, + "grad_norm": 11.960016715052914, + "learning_rate": 1.9589436205439626e-05, + "loss": 0.2937, + "step": 3798 + }, + { + "epoch": 1.5449369662464416, + "grad_norm": 8.860166735063098, + "learning_rate": 1.95891488193105e-05, + "loss": 0.1127, + "step": 3799 + }, + { + "epoch": 1.5453436356242376, + "grad_norm": 6.558493988492025, + "learning_rate": 1.9588861334743936e-05, + "loss": 0.2249, + "step": 3800 + }, + { + "epoch": 1.5457503050020334, + "grad_norm": 5.600517946427563, + "learning_rate": 1.9588573751742886e-05, + "loss": 0.2139, + "step": 3801 + }, + { + "epoch": 1.5461569743798291, + "grad_norm": 46.6475019250932, + "learning_rate": 1.95882860703103e-05, + "loss": 0.7354, + "step": 3802 + }, + { + "epoch": 1.546563643757625, + "grad_norm": 13.598810164152894, + "learning_rate": 1.958799829044913e-05, + "loss": 0.6297, + 
"step": 3803 + }, + { + "epoch": 1.5469703131354209, + "grad_norm": 14.644238301719813, + "learning_rate": 1.9587710412162332e-05, + "loss": 0.3912, + "step": 3804 + }, + { + "epoch": 1.5473769825132169, + "grad_norm": 12.832044198263363, + "learning_rate": 1.9587422435452864e-05, + "loss": 0.7342, + "step": 3805 + }, + { + "epoch": 1.5477836518910126, + "grad_norm": 4.299391427924235, + "learning_rate": 1.9587134360323676e-05, + "loss": 0.0952, + "step": 3806 + }, + { + "epoch": 1.5481903212688084, + "grad_norm": 11.491163234882377, + "learning_rate": 1.9586846186777734e-05, + "loss": 0.3099, + "step": 3807 + }, + { + "epoch": 1.5485969906466042, + "grad_norm": 8.550834979844627, + "learning_rate": 1.9586557914817988e-05, + "loss": 0.5342, + "step": 3808 + }, + { + "epoch": 1.5490036600244002, + "grad_norm": 10.656504845394792, + "learning_rate": 1.95862695444474e-05, + "loss": 0.5492, + "step": 3809 + }, + { + "epoch": 1.5494103294021961, + "grad_norm": 5.9606467635798035, + "learning_rate": 1.9585981075668926e-05, + "loss": 0.1083, + "step": 3810 + }, + { + "epoch": 1.549816998779992, + "grad_norm": 7.656593017198765, + "learning_rate": 1.958569250848554e-05, + "loss": 0.1943, + "step": 3811 + }, + { + "epoch": 1.5502236681577877, + "grad_norm": 0.26366110400424525, + "learning_rate": 1.958540384290019e-05, + "loss": 0.0033, + "step": 3812 + }, + { + "epoch": 1.5506303375355834, + "grad_norm": 24.96857880055027, + "learning_rate": 1.9585115078915845e-05, + "loss": 0.7626, + "step": 3813 + }, + { + "epoch": 1.5510370069133794, + "grad_norm": 17.334741863194303, + "learning_rate": 1.9584826216535476e-05, + "loss": 0.2886, + "step": 3814 + }, + { + "epoch": 1.5514436762911754, + "grad_norm": 1.994007093982204, + "learning_rate": 1.958453725576204e-05, + "loss": 0.0696, + "step": 3815 + }, + { + "epoch": 1.5518503456689712, + "grad_norm": 11.720971994488258, + "learning_rate": 1.9584248196598498e-05, + "loss": 0.3815, + "step": 3816 + }, + { + "epoch": 1.552257015046767, + "grad_norm": 13.104536392031607, + "learning_rate": 1.958395903904783e-05, + "loss": 0.4118, + "step": 3817 + }, + { + "epoch": 1.5526636844245627, + "grad_norm": 3.632269330013189, + "learning_rate": 1.9583669783113e-05, + "loss": 0.0684, + "step": 3818 + }, + { + "epoch": 1.5530703538023587, + "grad_norm": 5.880736299642369, + "learning_rate": 1.9583380428796973e-05, + "loss": 0.1252, + "step": 3819 + }, + { + "epoch": 1.5534770231801547, + "grad_norm": 2.850539201878605, + "learning_rate": 1.9583090976102728e-05, + "loss": 0.0461, + "step": 3820 + }, + { + "epoch": 1.5538836925579504, + "grad_norm": 11.827423341498287, + "learning_rate": 1.9582801425033226e-05, + "loss": 0.3104, + "step": 3821 + }, + { + "epoch": 1.5542903619357462, + "grad_norm": 12.205681250347387, + "learning_rate": 1.9582511775591445e-05, + "loss": 0.5617, + "step": 3822 + }, + { + "epoch": 1.554697031313542, + "grad_norm": 8.411142813839266, + "learning_rate": 1.958222202778036e-05, + "loss": 0.2956, + "step": 3823 + }, + { + "epoch": 1.555103700691338, + "grad_norm": 10.286209746999488, + "learning_rate": 1.9581932181602943e-05, + "loss": 0.5916, + "step": 3824 + }, + { + "epoch": 1.555510370069134, + "grad_norm": 0.6692921748397931, + "learning_rate": 1.9581642237062168e-05, + "loss": 0.009, + "step": 3825 + }, + { + "epoch": 1.5559170394469297, + "grad_norm": 9.628102570420069, + "learning_rate": 1.9581352194161013e-05, + "loss": 0.2908, + "step": 3826 + }, + { + "epoch": 1.5563237088247255, + "grad_norm": 8.767466331732868, + "learning_rate": 
1.9581062052902456e-05, + "loss": 0.5077, + "step": 3827 + }, + { + "epoch": 1.5567303782025212, + "grad_norm": 1.361483614346996, + "learning_rate": 1.958077181328948e-05, + "loss": 0.0239, + "step": 3828 + }, + { + "epoch": 1.5571370475803172, + "grad_norm": 15.529573225312312, + "learning_rate": 1.9580481475325052e-05, + "loss": 0.9303, + "step": 3829 + }, + { + "epoch": 1.557543716958113, + "grad_norm": 12.856662814614596, + "learning_rate": 1.9580191039012164e-05, + "loss": 0.4144, + "step": 3830 + }, + { + "epoch": 1.557950386335909, + "grad_norm": 18.09843400666833, + "learning_rate": 1.9579900504353793e-05, + "loss": 0.4652, + "step": 3831 + }, + { + "epoch": 1.5583570557137048, + "grad_norm": 9.8840344213358, + "learning_rate": 1.957960987135292e-05, + "loss": 0.2757, + "step": 3832 + }, + { + "epoch": 1.5587637250915005, + "grad_norm": 12.473016512369318, + "learning_rate": 1.9579319140012533e-05, + "loss": 0.3153, + "step": 3833 + }, + { + "epoch": 1.5591703944692965, + "grad_norm": 4.931620481139828, + "learning_rate": 1.957902831033561e-05, + "loss": 0.1234, + "step": 3834 + }, + { + "epoch": 1.5595770638470923, + "grad_norm": 13.722036227120627, + "learning_rate": 1.9578737382325143e-05, + "loss": 0.9286, + "step": 3835 + }, + { + "epoch": 1.5599837332248883, + "grad_norm": 4.136898572464121, + "learning_rate": 1.9578446355984117e-05, + "loss": 0.0596, + "step": 3836 + }, + { + "epoch": 1.560390402602684, + "grad_norm": 1.0240915228866745, + "learning_rate": 1.9578155231315518e-05, + "loss": 0.0169, + "step": 3837 + }, + { + "epoch": 1.5607970719804798, + "grad_norm": 6.7205576898859665, + "learning_rate": 1.9577864008322335e-05, + "loss": 0.0812, + "step": 3838 + }, + { + "epoch": 1.5612037413582758, + "grad_norm": 14.13612750211806, + "learning_rate": 1.9577572687007556e-05, + "loss": 1.0576, + "step": 3839 + }, + { + "epoch": 1.5616104107360715, + "grad_norm": 7.463301647810893, + "learning_rate": 1.957728126737417e-05, + "loss": 0.2777, + "step": 3840 + }, + { + "epoch": 1.5620170801138675, + "grad_norm": 1.6585168962212382, + "learning_rate": 1.9576989749425176e-05, + "loss": 0.0218, + "step": 3841 + }, + { + "epoch": 1.5624237494916633, + "grad_norm": 7.711341715001623, + "learning_rate": 1.9576698133163564e-05, + "loss": 0.1493, + "step": 3842 + }, + { + "epoch": 1.562830418869459, + "grad_norm": 18.737310547026773, + "learning_rate": 1.9576406418592323e-05, + "loss": 0.3205, + "step": 3843 + }, + { + "epoch": 1.5632370882472548, + "grad_norm": 6.666282862351004, + "learning_rate": 1.9576114605714452e-05, + "loss": 0.5299, + "step": 3844 + }, + { + "epoch": 1.5636437576250508, + "grad_norm": 3.869526149817444, + "learning_rate": 1.957582269453294e-05, + "loss": 0.1808, + "step": 3845 + }, + { + "epoch": 1.5640504270028468, + "grad_norm": 5.169788234367447, + "learning_rate": 1.9575530685050793e-05, + "loss": 0.1115, + "step": 3846 + }, + { + "epoch": 1.5644570963806426, + "grad_norm": 3.1843228093724485, + "learning_rate": 1.9575238577271e-05, + "loss": 0.0473, + "step": 3847 + }, + { + "epoch": 1.5648637657584383, + "grad_norm": 4.66947600035183, + "learning_rate": 1.9574946371196574e-05, + "loss": 0.0945, + "step": 3848 + }, + { + "epoch": 1.565270435136234, + "grad_norm": 20.047270368669256, + "learning_rate": 1.9574654066830495e-05, + "loss": 0.2925, + "step": 3849 + }, + { + "epoch": 1.56567710451403, + "grad_norm": 7.294647851310984, + "learning_rate": 1.9574361664175776e-05, + "loss": 0.1278, + "step": 3850 + }, + { + "epoch": 1.566083773891826, + "grad_norm": 
15.311939271657995, + "learning_rate": 1.9574069163235417e-05, + "loss": 0.8619, + "step": 3851 + }, + { + "epoch": 1.5664904432696218, + "grad_norm": 14.447124038383865, + "learning_rate": 1.957377656401242e-05, + "loss": 0.4607, + "step": 3852 + }, + { + "epoch": 1.5668971126474176, + "grad_norm": 28.1280113912754, + "learning_rate": 1.9573483866509787e-05, + "loss": 0.6656, + "step": 3853 + }, + { + "epoch": 1.5673037820252134, + "grad_norm": 16.457768123929593, + "learning_rate": 1.957319107073052e-05, + "loss": 1.1166, + "step": 3854 + }, + { + "epoch": 1.5677104514030094, + "grad_norm": 8.093065407559957, + "learning_rate": 1.9572898176677634e-05, + "loss": 0.1157, + "step": 3855 + }, + { + "epoch": 1.5681171207808053, + "grad_norm": 15.117725392562658, + "learning_rate": 1.957260518435413e-05, + "loss": 0.3779, + "step": 3856 + }, + { + "epoch": 1.568523790158601, + "grad_norm": 5.632076156944658, + "learning_rate": 1.9572312093763012e-05, + "loss": 0.1967, + "step": 3857 + }, + { + "epoch": 1.5689304595363969, + "grad_norm": 16.01008352290043, + "learning_rate": 1.9572018904907298e-05, + "loss": 0.1352, + "step": 3858 + }, + { + "epoch": 1.5693371289141926, + "grad_norm": 6.073300004439462, + "learning_rate": 1.9571725617789988e-05, + "loss": 0.0783, + "step": 3859 + }, + { + "epoch": 1.5697437982919886, + "grad_norm": 10.957444989285863, + "learning_rate": 1.95714322324141e-05, + "loss": 0.25, + "step": 3860 + }, + { + "epoch": 1.5701504676697846, + "grad_norm": 16.04347600569182, + "learning_rate": 1.957113874878264e-05, + "loss": 0.4335, + "step": 3861 + }, + { + "epoch": 1.5705571370475804, + "grad_norm": 6.310044346125221, + "learning_rate": 1.9570845166898625e-05, + "loss": 0.1348, + "step": 3862 + }, + { + "epoch": 1.5709638064253761, + "grad_norm": 6.972586322081155, + "learning_rate": 1.957055148676507e-05, + "loss": 0.3268, + "step": 3863 + }, + { + "epoch": 1.571370475803172, + "grad_norm": 13.861541253462756, + "learning_rate": 1.9570257708384984e-05, + "loss": 0.5302, + "step": 3864 + }, + { + "epoch": 1.571777145180968, + "grad_norm": 10.437232245273785, + "learning_rate": 1.9569963831761386e-05, + "loss": 0.4853, + "step": 3865 + }, + { + "epoch": 1.5721838145587639, + "grad_norm": 5.141910724501373, + "learning_rate": 1.9569669856897295e-05, + "loss": 0.0718, + "step": 3866 + }, + { + "epoch": 1.5725904839365596, + "grad_norm": 17.95390453355007, + "learning_rate": 1.9569375783795725e-05, + "loss": 0.2491, + "step": 3867 + }, + { + "epoch": 1.5729971533143554, + "grad_norm": 9.657241120668857, + "learning_rate": 1.95690816124597e-05, + "loss": 0.2996, + "step": 3868 + }, + { + "epoch": 1.5734038226921512, + "grad_norm": 16.31835415598651, + "learning_rate": 1.956878734289223e-05, + "loss": 0.8364, + "step": 3869 + }, + { + "epoch": 1.5738104920699472, + "grad_norm": 12.557090587596809, + "learning_rate": 1.9568492975096345e-05, + "loss": 0.6843, + "step": 3870 + }, + { + "epoch": 1.574217161447743, + "grad_norm": 14.977787941260678, + "learning_rate": 1.9568198509075066e-05, + "loss": 0.5484, + "step": 3871 + }, + { + "epoch": 1.574623830825539, + "grad_norm": 0.880906109917266, + "learning_rate": 1.9567903944831413e-05, + "loss": 0.0127, + "step": 3872 + }, + { + "epoch": 1.5750305002033347, + "grad_norm": 13.092953498394872, + "learning_rate": 1.9567609282368408e-05, + "loss": 0.1886, + "step": 3873 + }, + { + "epoch": 1.5754371695811304, + "grad_norm": 35.77813957679526, + "learning_rate": 1.956731452168908e-05, + "loss": 0.2818, + "step": 3874 + }, + { + 
"epoch": 1.5758438389589264, + "grad_norm": 22.419488391271297, + "learning_rate": 1.956701966279645e-05, + "loss": 1.0338, + "step": 3875 + }, + { + "epoch": 1.5762505083367222, + "grad_norm": 8.880682370931636, + "learning_rate": 1.9566724705693553e-05, + "loss": 0.2907, + "step": 3876 + }, + { + "epoch": 1.5766571777145182, + "grad_norm": 1.8999673751188069, + "learning_rate": 1.956642965038341e-05, + "loss": 0.0276, + "step": 3877 + }, + { + "epoch": 1.577063847092314, + "grad_norm": 10.910393311294388, + "learning_rate": 1.9566134496869054e-05, + "loss": 0.611, + "step": 3878 + }, + { + "epoch": 1.5774705164701097, + "grad_norm": 4.364482988974345, + "learning_rate": 1.9565839245153512e-05, + "loss": 0.0726, + "step": 3879 + }, + { + "epoch": 1.5778771858479057, + "grad_norm": 13.03184832920569, + "learning_rate": 1.9565543895239813e-05, + "loss": 0.3943, + "step": 3880 + }, + { + "epoch": 1.5782838552257015, + "grad_norm": 11.382138303310484, + "learning_rate": 1.9565248447130995e-05, + "loss": 0.371, + "step": 3881 + }, + { + "epoch": 1.5786905246034975, + "grad_norm": 3.1403068134539316, + "learning_rate": 1.9564952900830085e-05, + "loss": 0.047, + "step": 3882 + }, + { + "epoch": 1.5790971939812932, + "grad_norm": 3.2617065943060295, + "learning_rate": 1.9564657256340124e-05, + "loss": 0.0446, + "step": 3883 + }, + { + "epoch": 1.579503863359089, + "grad_norm": 9.892446732207459, + "learning_rate": 1.956436151366414e-05, + "loss": 0.3179, + "step": 3884 + }, + { + "epoch": 1.5799105327368848, + "grad_norm": 1.5479789796515326, + "learning_rate": 1.9564065672805167e-05, + "loss": 0.0196, + "step": 3885 + }, + { + "epoch": 1.5803172021146807, + "grad_norm": 5.63700384544747, + "learning_rate": 1.956376973376625e-05, + "loss": 0.1201, + "step": 3886 + }, + { + "epoch": 1.5807238714924767, + "grad_norm": 18.081703795109032, + "learning_rate": 1.9563473696550425e-05, + "loss": 0.796, + "step": 3887 + }, + { + "epoch": 1.5811305408702725, + "grad_norm": 18.87075993058665, + "learning_rate": 1.9563177561160727e-05, + "loss": 0.6178, + "step": 3888 + }, + { + "epoch": 1.5815372102480683, + "grad_norm": 18.014063935313292, + "learning_rate": 1.9562881327600197e-05, + "loss": 0.7987, + "step": 3889 + }, + { + "epoch": 1.581943879625864, + "grad_norm": 14.416781816961246, + "learning_rate": 1.956258499587188e-05, + "loss": 0.6944, + "step": 3890 + }, + { + "epoch": 1.58235054900366, + "grad_norm": 25.53658740412811, + "learning_rate": 1.9562288565978814e-05, + "loss": 1.4841, + "step": 3891 + }, + { + "epoch": 1.582757218381456, + "grad_norm": 6.227451003590351, + "learning_rate": 1.956199203792405e-05, + "loss": 0.1134, + "step": 3892 + }, + { + "epoch": 1.5831638877592518, + "grad_norm": 14.562583147410692, + "learning_rate": 1.9561695411710616e-05, + "loss": 0.5946, + "step": 3893 + }, + { + "epoch": 1.5835705571370475, + "grad_norm": 12.128761502366253, + "learning_rate": 1.9561398687341565e-05, + "loss": 0.3814, + "step": 3894 + }, + { + "epoch": 1.5839772265148433, + "grad_norm": 5.116474740902134, + "learning_rate": 1.9561101864819952e-05, + "loss": 0.1557, + "step": 3895 + }, + { + "epoch": 1.5843838958926393, + "grad_norm": 11.36149569706382, + "learning_rate": 1.9560804944148814e-05, + "loss": 0.5716, + "step": 3896 + }, + { + "epoch": 1.5847905652704353, + "grad_norm": 10.649738998431921, + "learning_rate": 1.9560507925331202e-05, + "loss": 0.3656, + "step": 3897 + }, + { + "epoch": 1.585197234648231, + "grad_norm": 17.692083812835293, + "learning_rate": 1.9560210808370162e-05, + 
"loss": 0.674, + "step": 3898 + }, + { + "epoch": 1.5856039040260268, + "grad_norm": 24.06436302602264, + "learning_rate": 1.9559913593268745e-05, + "loss": 0.3079, + "step": 3899 + }, + { + "epoch": 1.5860105734038226, + "grad_norm": 6.098807638377324, + "learning_rate": 1.9559616280030006e-05, + "loss": 0.1998, + "step": 3900 + }, + { + "epoch": 1.5864172427816186, + "grad_norm": 1.0704720147921756, + "learning_rate": 1.9559318868656997e-05, + "loss": 0.017, + "step": 3901 + }, + { + "epoch": 1.5868239121594145, + "grad_norm": 13.684652569095588, + "learning_rate": 1.9559021359152765e-05, + "loss": 0.4004, + "step": 3902 + }, + { + "epoch": 1.5872305815372103, + "grad_norm": 10.055890763609007, + "learning_rate": 1.955872375152037e-05, + "loss": 0.3434, + "step": 3903 + }, + { + "epoch": 1.587637250915006, + "grad_norm": 1.653129020490722, + "learning_rate": 1.9558426045762863e-05, + "loss": 0.0223, + "step": 3904 + }, + { + "epoch": 1.5880439202928018, + "grad_norm": 8.406699557674115, + "learning_rate": 1.9558128241883304e-05, + "loss": 0.3943, + "step": 3905 + }, + { + "epoch": 1.5884505896705978, + "grad_norm": 12.934428295394126, + "learning_rate": 1.9557830339884746e-05, + "loss": 0.6462, + "step": 3906 + }, + { + "epoch": 1.5888572590483938, + "grad_norm": 8.090741077373531, + "learning_rate": 1.955753233977025e-05, + "loss": 0.4333, + "step": 3907 + }, + { + "epoch": 1.5892639284261896, + "grad_norm": 15.28277099169071, + "learning_rate": 1.9557234241542876e-05, + "loss": 0.4112, + "step": 3908 + }, + { + "epoch": 1.5896705978039853, + "grad_norm": 6.255993500658673, + "learning_rate": 1.9556936045205682e-05, + "loss": 0.1288, + "step": 3909 + }, + { + "epoch": 1.590077267181781, + "grad_norm": 10.59726457793909, + "learning_rate": 1.9556637750761727e-05, + "loss": 0.4487, + "step": 3910 + }, + { + "epoch": 1.590483936559577, + "grad_norm": 9.249490390513964, + "learning_rate": 1.9556339358214077e-05, + "loss": 0.4409, + "step": 3911 + }, + { + "epoch": 1.590890605937373, + "grad_norm": 13.189349881336799, + "learning_rate": 1.9556040867565795e-05, + "loss": 0.3601, + "step": 3912 + }, + { + "epoch": 1.5912972753151688, + "grad_norm": 11.074259639635907, + "learning_rate": 1.9555742278819944e-05, + "loss": 0.2775, + "step": 3913 + }, + { + "epoch": 1.5917039446929646, + "grad_norm": 15.66975491490337, + "learning_rate": 1.9555443591979587e-05, + "loss": 0.2882, + "step": 3914 + }, + { + "epoch": 1.5921106140707604, + "grad_norm": 10.059892740536355, + "learning_rate": 1.9555144807047794e-05, + "loss": 0.5729, + "step": 3915 + }, + { + "epoch": 1.5925172834485564, + "grad_norm": 9.272455490857977, + "learning_rate": 1.955484592402763e-05, + "loss": 0.1981, + "step": 3916 + }, + { + "epoch": 1.5929239528263521, + "grad_norm": 8.767965049739898, + "learning_rate": 1.9554546942922162e-05, + "loss": 0.2333, + "step": 3917 + }, + { + "epoch": 1.5933306222041481, + "grad_norm": 14.196773604493375, + "learning_rate": 1.9554247863734467e-05, + "loss": 0.9112, + "step": 3918 + }, + { + "epoch": 1.5937372915819439, + "grad_norm": 2.1680415346377853, + "learning_rate": 1.9553948686467604e-05, + "loss": 0.0359, + "step": 3919 + }, + { + "epoch": 1.5941439609597396, + "grad_norm": 14.753245028754646, + "learning_rate": 1.955364941112465e-05, + "loss": 0.6112, + "step": 3920 + }, + { + "epoch": 1.5945506303375356, + "grad_norm": 8.097282291944799, + "learning_rate": 1.9553350037708675e-05, + "loss": 0.2636, + "step": 3921 + }, + { + "epoch": 1.5949572997153314, + "grad_norm": 16.20836176827367, 
+ "learning_rate": 1.955305056622276e-05, + "loss": 0.9213, + "step": 3922 + }, + { + "epoch": 1.5953639690931274, + "grad_norm": 0.20402083841064694, + "learning_rate": 1.9552750996669967e-05, + "loss": 0.0023, + "step": 3923 + }, + { + "epoch": 1.5957706384709232, + "grad_norm": 0.9351081037766619, + "learning_rate": 1.955245132905338e-05, + "loss": 0.0145, + "step": 3924 + }, + { + "epoch": 1.596177307848719, + "grad_norm": 14.167542561801413, + "learning_rate": 1.955215156337607e-05, + "loss": 0.6143, + "step": 3925 + }, + { + "epoch": 1.5965839772265147, + "grad_norm": 13.301001764001573, + "learning_rate": 1.955185169964112e-05, + "loss": 0.6455, + "step": 3926 + }, + { + "epoch": 1.5969906466043107, + "grad_norm": 9.970123937749014, + "learning_rate": 1.9551551737851603e-05, + "loss": 0.1209, + "step": 3927 + }, + { + "epoch": 1.5973973159821067, + "grad_norm": 13.288921680074361, + "learning_rate": 1.95512516780106e-05, + "loss": 0.3458, + "step": 3928 + }, + { + "epoch": 1.5978039853599024, + "grad_norm": 0.23643004713791457, + "learning_rate": 1.955095152012119e-05, + "loss": 0.003, + "step": 3929 + }, + { + "epoch": 1.5982106547376982, + "grad_norm": 9.376730917244627, + "learning_rate": 1.9550651264186457e-05, + "loss": 0.2342, + "step": 3930 + }, + { + "epoch": 1.598617324115494, + "grad_norm": 17.729640326515344, + "learning_rate": 1.9550350910209484e-05, + "loss": 0.4275, + "step": 3931 + }, + { + "epoch": 1.59902399349329, + "grad_norm": 9.824835597233585, + "learning_rate": 1.955005045819335e-05, + "loss": 0.182, + "step": 3932 + }, + { + "epoch": 1.599430662871086, + "grad_norm": 8.476501939267164, + "learning_rate": 1.9549749908141144e-05, + "loss": 0.3651, + "step": 3933 + }, + { + "epoch": 1.5998373322488817, + "grad_norm": 13.364261477507876, + "learning_rate": 1.9549449260055947e-05, + "loss": 0.245, + "step": 3934 + }, + { + "epoch": 1.6002440016266775, + "grad_norm": 3.6920127186760303, + "learning_rate": 1.954914851394085e-05, + "loss": 0.0703, + "step": 3935 + }, + { + "epoch": 1.6006506710044732, + "grad_norm": 12.818995876426708, + "learning_rate": 1.9548847669798933e-05, + "loss": 0.6669, + "step": 3936 + }, + { + "epoch": 1.6010573403822692, + "grad_norm": 9.624836199768902, + "learning_rate": 1.9548546727633293e-05, + "loss": 0.4736, + "step": 3937 + }, + { + "epoch": 1.6014640097600652, + "grad_norm": 14.610938277536228, + "learning_rate": 1.954824568744701e-05, + "loss": 0.9632, + "step": 3938 + }, + { + "epoch": 1.601870679137861, + "grad_norm": 11.958978127715133, + "learning_rate": 1.9547944549243187e-05, + "loss": 0.4438, + "step": 3939 + }, + { + "epoch": 1.6022773485156567, + "grad_norm": 33.843139325917164, + "learning_rate": 1.9547643313024898e-05, + "loss": 0.9521, + "step": 3940 + }, + { + "epoch": 1.6026840178934525, + "grad_norm": 7.477246395335624, + "learning_rate": 1.9547341978795252e-05, + "loss": 0.1501, + "step": 3941 + }, + { + "epoch": 1.6030906872712485, + "grad_norm": 9.940741210981942, + "learning_rate": 1.9547040546557334e-05, + "loss": 0.4603, + "step": 3942 + }, + { + "epoch": 1.6034973566490445, + "grad_norm": 4.58093361779955, + "learning_rate": 1.954673901631424e-05, + "loss": 0.0268, + "step": 3943 + }, + { + "epoch": 1.6039040260268402, + "grad_norm": 9.574180382053365, + "learning_rate": 1.9546437388069067e-05, + "loss": 0.3132, + "step": 3944 + }, + { + "epoch": 1.604310695404636, + "grad_norm": 2.3072979483286713, + "learning_rate": 1.954613566182491e-05, + "loss": 0.0315, + "step": 3945 + }, + { + "epoch": 
1.6047173647824318, + "grad_norm": 38.165665582510194, + "learning_rate": 1.9545833837584863e-05, + "loss": 1.8833, + "step": 3946 + }, + { + "epoch": 1.6051240341602278, + "grad_norm": 23.578807325456776, + "learning_rate": 1.954553191535203e-05, + "loss": 0.2103, + "step": 3947 + }, + { + "epoch": 1.6055307035380237, + "grad_norm": 2.042596888350555, + "learning_rate": 1.9545229895129504e-05, + "loss": 0.0264, + "step": 3948 + }, + { + "epoch": 1.6059373729158195, + "grad_norm": 8.70660571262967, + "learning_rate": 1.9544927776920392e-05, + "loss": 0.2904, + "step": 3949 + }, + { + "epoch": 1.6063440422936153, + "grad_norm": 5.050339880268961, + "learning_rate": 1.954462556072779e-05, + "loss": 0.0879, + "step": 3950 + }, + { + "epoch": 1.606750711671411, + "grad_norm": 6.109329107726348, + "learning_rate": 1.9544323246554802e-05, + "loss": 0.1969, + "step": 3951 + }, + { + "epoch": 1.607157381049207, + "grad_norm": 13.088199936735938, + "learning_rate": 1.954402083440454e-05, + "loss": 0.6548, + "step": 3952 + }, + { + "epoch": 1.607564050427003, + "grad_norm": 19.412902041316997, + "learning_rate": 1.9543718324280092e-05, + "loss": 0.5014, + "step": 3953 + }, + { + "epoch": 1.6079707198047988, + "grad_norm": 0.2920748291775819, + "learning_rate": 1.9543415716184576e-05, + "loss": 0.0054, + "step": 3954 + }, + { + "epoch": 1.6083773891825945, + "grad_norm": 16.276339927280187, + "learning_rate": 1.9543113010121093e-05, + "loss": 0.2469, + "step": 3955 + }, + { + "epoch": 1.6087840585603903, + "grad_norm": 16.080866778218635, + "learning_rate": 1.954281020609275e-05, + "loss": 0.4034, + "step": 3956 + }, + { + "epoch": 1.6091907279381863, + "grad_norm": 9.289639674474452, + "learning_rate": 1.9542507304102664e-05, + "loss": 0.3467, + "step": 3957 + }, + { + "epoch": 1.609597397315982, + "grad_norm": 18.906103418310057, + "learning_rate": 1.9542204304153934e-05, + "loss": 0.9245, + "step": 3958 + }, + { + "epoch": 1.610004066693778, + "grad_norm": 6.177479184663237, + "learning_rate": 1.9541901206249676e-05, + "loss": 0.1452, + "step": 3959 + }, + { + "epoch": 1.6104107360715738, + "grad_norm": 19.044298488455674, + "learning_rate": 1.9541598010393e-05, + "loss": 1.3677, + "step": 3960 + }, + { + "epoch": 1.6108174054493696, + "grad_norm": 5.3941644830743805, + "learning_rate": 1.9541294716587016e-05, + "loss": 0.1115, + "step": 3961 + }, + { + "epoch": 1.6112240748271656, + "grad_norm": 11.927091095347842, + "learning_rate": 1.954099132483484e-05, + "loss": 0.3251, + "step": 3962 + }, + { + "epoch": 1.6116307442049613, + "grad_norm": 21.34522934439414, + "learning_rate": 1.9540687835139587e-05, + "loss": 1.2498, + "step": 3963 + }, + { + "epoch": 1.6120374135827573, + "grad_norm": 3.3068326163730095, + "learning_rate": 1.9540384247504372e-05, + "loss": 0.0582, + "step": 3964 + }, + { + "epoch": 1.612444082960553, + "grad_norm": 9.107436506535251, + "learning_rate": 1.954008056193231e-05, + "loss": 0.3452, + "step": 3965 + }, + { + "epoch": 1.6128507523383488, + "grad_norm": 5.110012021989247, + "learning_rate": 1.953977677842652e-05, + "loss": 0.093, + "step": 3966 + }, + { + "epoch": 1.6132574217161446, + "grad_norm": 3.1155563919081337, + "learning_rate": 1.953947289699012e-05, + "loss": 0.0448, + "step": 3967 + }, + { + "epoch": 1.6136640910939406, + "grad_norm": 8.63471035232102, + "learning_rate": 1.953916891762623e-05, + "loss": 0.4341, + "step": 3968 + }, + { + "epoch": 1.6140707604717366, + "grad_norm": 13.195697075280146, + "learning_rate": 1.953886484033797e-05, + "loss": 
0.5897, + "step": 3969 + }, + { + "epoch": 1.6144774298495324, + "grad_norm": 3.6909779057683965, + "learning_rate": 1.953856066512846e-05, + "loss": 0.0532, + "step": 3970 + }, + { + "epoch": 1.6148840992273281, + "grad_norm": 12.59577349869229, + "learning_rate": 1.9538256392000826e-05, + "loss": 0.6564, + "step": 3971 + }, + { + "epoch": 1.6152907686051239, + "grad_norm": 8.147445098609394, + "learning_rate": 1.9537952020958187e-05, + "loss": 0.2333, + "step": 3972 + }, + { + "epoch": 1.6156974379829199, + "grad_norm": 11.592998900995456, + "learning_rate": 1.9537647552003677e-05, + "loss": 0.4782, + "step": 3973 + }, + { + "epoch": 1.6161041073607159, + "grad_norm": 11.678855676278028, + "learning_rate": 1.9537342985140407e-05, + "loss": 0.4781, + "step": 3974 + }, + { + "epoch": 1.6165107767385116, + "grad_norm": 9.642032390460402, + "learning_rate": 1.9537038320371517e-05, + "loss": 0.2929, + "step": 3975 + }, + { + "epoch": 1.6169174461163074, + "grad_norm": 12.175253254317184, + "learning_rate": 1.9536733557700125e-05, + "loss": 0.5038, + "step": 3976 + }, + { + "epoch": 1.6173241154941032, + "grad_norm": 7.192394317312162, + "learning_rate": 1.9536428697129364e-05, + "loss": 0.1377, + "step": 3977 + }, + { + "epoch": 1.6177307848718991, + "grad_norm": 7.465555295971475, + "learning_rate": 1.9536123738662363e-05, + "loss": 0.1879, + "step": 3978 + }, + { + "epoch": 1.6181374542496951, + "grad_norm": 15.967588611069928, + "learning_rate": 1.9535818682302254e-05, + "loss": 0.4773, + "step": 3979 + }, + { + "epoch": 1.618544123627491, + "grad_norm": 56.87195957540594, + "learning_rate": 1.9535513528052162e-05, + "loss": 0.9815, + "step": 3980 + }, + { + "epoch": 1.6189507930052867, + "grad_norm": 5.819981362936667, + "learning_rate": 1.953520827591523e-05, + "loss": 0.1981, + "step": 3981 + }, + { + "epoch": 1.6193574623830824, + "grad_norm": 5.990697643865747, + "learning_rate": 1.9534902925894585e-05, + "loss": 0.1079, + "step": 3982 + }, + { + "epoch": 1.6197641317608784, + "grad_norm": 40.814857532750146, + "learning_rate": 1.953459747799336e-05, + "loss": 0.7616, + "step": 3983 + }, + { + "epoch": 1.6201708011386744, + "grad_norm": 17.760649728201383, + "learning_rate": 1.9534291932214696e-05, + "loss": 0.4043, + "step": 3984 + }, + { + "epoch": 1.6205774705164702, + "grad_norm": 6.7089964180252455, + "learning_rate": 1.9533986288561728e-05, + "loss": 0.2153, + "step": 3985 + }, + { + "epoch": 1.620984139894266, + "grad_norm": 10.99060466407416, + "learning_rate": 1.953368054703759e-05, + "loss": 0.5288, + "step": 3986 + }, + { + "epoch": 1.6213908092720617, + "grad_norm": 11.868188073284562, + "learning_rate": 1.953337470764542e-05, + "loss": 0.3351, + "step": 3987 + }, + { + "epoch": 1.6217974786498577, + "grad_norm": 15.700282869962681, + "learning_rate": 1.9533068770388365e-05, + "loss": 0.5796, + "step": 3988 + }, + { + "epoch": 1.6222041480276537, + "grad_norm": 1.3157929896568397, + "learning_rate": 1.9532762735269562e-05, + "loss": 0.0172, + "step": 3989 + }, + { + "epoch": 1.6226108174054494, + "grad_norm": 4.406992806562982, + "learning_rate": 1.9532456602292148e-05, + "loss": 0.0491, + "step": 3990 + }, + { + "epoch": 1.6230174867832452, + "grad_norm": 10.948779452871484, + "learning_rate": 1.953215037145927e-05, + "loss": 0.2897, + "step": 3991 + }, + { + "epoch": 1.623424156161041, + "grad_norm": 6.905850905434726, + "learning_rate": 1.953184404277407e-05, + "loss": 0.3485, + "step": 3992 + }, + { + "epoch": 1.623830825538837, + "grad_norm": 7.595924932178331, + 
"learning_rate": 1.9531537616239697e-05, + "loss": 0.157, + "step": 3993 + }, + { + "epoch": 1.624237494916633, + "grad_norm": 7.156157998337985, + "learning_rate": 1.9531231091859295e-05, + "loss": 0.2593, + "step": 3994 + }, + { + "epoch": 1.6246441642944287, + "grad_norm": 8.46524969416152, + "learning_rate": 1.9530924469636003e-05, + "loss": 0.2058, + "step": 3995 + }, + { + "epoch": 1.6250508336722245, + "grad_norm": 24.167550235489202, + "learning_rate": 1.953061774957298e-05, + "loss": 0.6473, + "step": 3996 + }, + { + "epoch": 1.6254575030500202, + "grad_norm": 9.269930514153618, + "learning_rate": 1.9530310931673364e-05, + "loss": 0.2486, + "step": 3997 + }, + { + "epoch": 1.6258641724278162, + "grad_norm": 6.058656317990689, + "learning_rate": 1.9530004015940315e-05, + "loss": 0.1381, + "step": 3998 + }, + { + "epoch": 1.626270841805612, + "grad_norm": 8.501511366343708, + "learning_rate": 1.9529697002376977e-05, + "loss": 0.3269, + "step": 3999 + }, + { + "epoch": 1.626677511183408, + "grad_norm": 5.640278674184428, + "learning_rate": 1.95293898909865e-05, + "loss": 0.1508, + "step": 4000 + }, + { + "epoch": 1.6270841805612037, + "grad_norm": 9.374419251921818, + "learning_rate": 1.9529082681772042e-05, + "loss": 0.2188, + "step": 4001 + }, + { + "epoch": 1.6274908499389995, + "grad_norm": 15.567037005311105, + "learning_rate": 1.952877537473675e-05, + "loss": 0.4602, + "step": 4002 + }, + { + "epoch": 1.6278975193167955, + "grad_norm": 22.89030549689751, + "learning_rate": 1.952846796988379e-05, + "loss": 1.2548, + "step": 4003 + }, + { + "epoch": 1.6283041886945913, + "grad_norm": 2.9303248608066874, + "learning_rate": 1.9528160467216307e-05, + "loss": 0.0769, + "step": 4004 + }, + { + "epoch": 1.6287108580723872, + "grad_norm": 1.0114669342713596, + "learning_rate": 1.9527852866737463e-05, + "loss": 0.0185, + "step": 4005 + }, + { + "epoch": 1.629117527450183, + "grad_norm": 5.952832072415213, + "learning_rate": 1.9527545168450413e-05, + "loss": 0.1847, + "step": 4006 + }, + { + "epoch": 1.6295241968279788, + "grad_norm": 12.59217359801177, + "learning_rate": 1.9527237372358316e-05, + "loss": 0.7859, + "step": 4007 + }, + { + "epoch": 1.6299308662057745, + "grad_norm": 19.439663794999223, + "learning_rate": 1.9526929478464335e-05, + "loss": 0.9592, + "step": 4008 + }, + { + "epoch": 1.6303375355835705, + "grad_norm": 12.819317615926318, + "learning_rate": 1.9526621486771625e-05, + "loss": 0.4846, + "step": 4009 + }, + { + "epoch": 1.6307442049613665, + "grad_norm": 15.29706448235748, + "learning_rate": 1.952631339728335e-05, + "loss": 0.8463, + "step": 4010 + }, + { + "epoch": 1.6311508743391623, + "grad_norm": 2.750146448803124, + "learning_rate": 1.952600521000268e-05, + "loss": 0.0495, + "step": 4011 + }, + { + "epoch": 1.631557543716958, + "grad_norm": 10.814047734898871, + "learning_rate": 1.9525696924932766e-05, + "loss": 0.2803, + "step": 4012 + }, + { + "epoch": 1.6319642130947538, + "grad_norm": 3.103888222205158, + "learning_rate": 1.9525388542076778e-05, + "loss": 0.0379, + "step": 4013 + }, + { + "epoch": 1.6323708824725498, + "grad_norm": 0.8230673350518692, + "learning_rate": 1.9525080061437887e-05, + "loss": 0.0101, + "step": 4014 + }, + { + "epoch": 1.6327775518503458, + "grad_norm": 1.717442077238307, + "learning_rate": 1.9524771483019255e-05, + "loss": 0.0253, + "step": 4015 + }, + { + "epoch": 1.6331842212281416, + "grad_norm": 8.231646090825288, + "learning_rate": 1.9524462806824048e-05, + "loss": 0.491, + "step": 4016 + }, + { + "epoch": 
1.6335908906059373, + "grad_norm": 6.009071072125224, + "learning_rate": 1.9524154032855435e-05, + "loss": 0.1007, + "step": 4017 + }, + { + "epoch": 1.633997559983733, + "grad_norm": 11.961868056112209, + "learning_rate": 1.9523845161116594e-05, + "loss": 0.4384, + "step": 4018 + }, + { + "epoch": 1.634404229361529, + "grad_norm": 11.700482593470776, + "learning_rate": 1.9523536191610683e-05, + "loss": 0.2974, + "step": 4019 + }, + { + "epoch": 1.634810898739325, + "grad_norm": 14.152887098587902, + "learning_rate": 1.952322712434088e-05, + "loss": 0.9616, + "step": 4020 + }, + { + "epoch": 1.6352175681171208, + "grad_norm": 5.061921345549899, + "learning_rate": 1.952291795931036e-05, + "loss": 0.1232, + "step": 4021 + }, + { + "epoch": 1.6356242374949166, + "grad_norm": 7.705424704541118, + "learning_rate": 1.9522608696522294e-05, + "loss": 0.362, + "step": 4022 + }, + { + "epoch": 1.6360309068727124, + "grad_norm": 2.32100624838907, + "learning_rate": 1.952229933597986e-05, + "loss": 0.0376, + "step": 4023 + }, + { + "epoch": 1.6364375762505083, + "grad_norm": 6.3512947806456435, + "learning_rate": 1.9521989877686226e-05, + "loss": 0.0987, + "step": 4024 + }, + { + "epoch": 1.6368442456283043, + "grad_norm": 2.159411401103713, + "learning_rate": 1.9521680321644575e-05, + "loss": 0.0293, + "step": 4025 + }, + { + "epoch": 1.6372509150061, + "grad_norm": 11.727091657064843, + "learning_rate": 1.9521370667858083e-05, + "loss": 0.7423, + "step": 4026 + }, + { + "epoch": 1.6376575843838959, + "grad_norm": 4.739887658678668, + "learning_rate": 1.952106091632993e-05, + "loss": 0.1023, + "step": 4027 + }, + { + "epoch": 1.6380642537616916, + "grad_norm": 26.059720217020057, + "learning_rate": 1.9520751067063295e-05, + "loss": 0.7414, + "step": 4028 + }, + { + "epoch": 1.6384709231394876, + "grad_norm": 18.083388640819813, + "learning_rate": 1.9520441120061356e-05, + "loss": 0.7892, + "step": 4029 + }, + { + "epoch": 1.6388775925172836, + "grad_norm": 10.797987776684847, + "learning_rate": 1.95201310753273e-05, + "loss": 0.3399, + "step": 4030 + }, + { + "epoch": 1.6392842618950794, + "grad_norm": 10.602250987015974, + "learning_rate": 1.9519820932864307e-05, + "loss": 0.2823, + "step": 4031 + }, + { + "epoch": 1.6396909312728751, + "grad_norm": 1.673876485854843, + "learning_rate": 1.951951069267556e-05, + "loss": 0.0257, + "step": 4032 + }, + { + "epoch": 1.640097600650671, + "grad_norm": 2.6796748400651866, + "learning_rate": 1.9519200354764244e-05, + "loss": 0.0453, + "step": 4033 + }, + { + "epoch": 1.6405042700284669, + "grad_norm": 14.813244088399763, + "learning_rate": 1.951888991913355e-05, + "loss": 0.4293, + "step": 4034 + }, + { + "epoch": 1.6409109394062629, + "grad_norm": 14.753629993693448, + "learning_rate": 1.9518579385786657e-05, + "loss": 0.5343, + "step": 4035 + }, + { + "epoch": 1.6413176087840586, + "grad_norm": 0.20025274937499285, + "learning_rate": 1.9518268754726754e-05, + "loss": 0.0036, + "step": 4036 + }, + { + "epoch": 1.6417242781618544, + "grad_norm": 9.142349305737136, + "learning_rate": 1.9517958025957037e-05, + "loss": 0.3078, + "step": 4037 + }, + { + "epoch": 1.6421309475396502, + "grad_norm": 11.9659600703861, + "learning_rate": 1.9517647199480683e-05, + "loss": 0.2258, + "step": 4038 + }, + { + "epoch": 1.6425376169174462, + "grad_norm": 4.250472053281031, + "learning_rate": 1.95173362753009e-05, + "loss": 0.0824, + "step": 4039 + }, + { + "epoch": 1.642944286295242, + "grad_norm": 14.697502959290016, + "learning_rate": 1.951702525342086e-05, + "loss": 
0.6651, + "step": 4040 + }, + { + "epoch": 1.643350955673038, + "grad_norm": 11.643920416616355, + "learning_rate": 1.9516714133843773e-05, + "loss": 0.1752, + "step": 4041 + }, + { + "epoch": 1.6437576250508337, + "grad_norm": 6.735288993719971, + "learning_rate": 1.9516402916572825e-05, + "loss": 0.1178, + "step": 4042 + }, + { + "epoch": 1.6441642944286294, + "grad_norm": 8.805246706188123, + "learning_rate": 1.951609160161121e-05, + "loss": 0.3264, + "step": 4043 + }, + { + "epoch": 1.6445709638064254, + "grad_norm": 7.925954251107443, + "learning_rate": 1.9515780188962123e-05, + "loss": 0.1744, + "step": 4044 + }, + { + "epoch": 1.6449776331842212, + "grad_norm": 1.4316422204271033, + "learning_rate": 1.9515468678628763e-05, + "loss": 0.0217, + "step": 4045 + }, + { + "epoch": 1.6453843025620172, + "grad_norm": 11.89124506246599, + "learning_rate": 1.9515157070614333e-05, + "loss": 0.3084, + "step": 4046 + }, + { + "epoch": 1.645790971939813, + "grad_norm": 5.146112523588328, + "learning_rate": 1.9514845364922026e-05, + "loss": 0.0722, + "step": 4047 + }, + { + "epoch": 1.6461976413176087, + "grad_norm": 11.240841434037298, + "learning_rate": 1.951453356155504e-05, + "loss": 0.5754, + "step": 4048 + }, + { + "epoch": 1.6466043106954045, + "grad_norm": 6.091535760488862, + "learning_rate": 1.9514221660516578e-05, + "loss": 0.1465, + "step": 4049 + }, + { + "epoch": 1.6470109800732005, + "grad_norm": 7.84352450128574, + "learning_rate": 1.9513909661809845e-05, + "loss": 0.1181, + "step": 4050 + }, + { + "epoch": 1.6474176494509964, + "grad_norm": 14.911054037007075, + "learning_rate": 1.9513597565438037e-05, + "loss": 0.4128, + "step": 4051 + }, + { + "epoch": 1.6478243188287922, + "grad_norm": 15.074949178374759, + "learning_rate": 1.9513285371404368e-05, + "loss": 0.723, + "step": 4052 + }, + { + "epoch": 1.648230988206588, + "grad_norm": 37.780913563264505, + "learning_rate": 1.951297307971203e-05, + "loss": 0.388, + "step": 4053 + }, + { + "epoch": 1.6486376575843837, + "grad_norm": 9.929535566287417, + "learning_rate": 1.9512660690364244e-05, + "loss": 0.2122, + "step": 4054 + }, + { + "epoch": 1.6490443269621797, + "grad_norm": 10.054095764493068, + "learning_rate": 1.95123482033642e-05, + "loss": 0.2472, + "step": 4055 + }, + { + "epoch": 1.6494509963399757, + "grad_norm": 13.16509621545307, + "learning_rate": 1.9512035618715116e-05, + "loss": 0.4732, + "step": 4056 + }, + { + "epoch": 1.6498576657177715, + "grad_norm": 6.249244095420301, + "learning_rate": 1.9511722936420204e-05, + "loss": 0.1769, + "step": 4057 + }, + { + "epoch": 1.6502643350955672, + "grad_norm": 16.520000204852362, + "learning_rate": 1.9511410156482664e-05, + "loss": 0.8886, + "step": 4058 + }, + { + "epoch": 1.650671004473363, + "grad_norm": 3.632667350432947, + "learning_rate": 1.9511097278905712e-05, + "loss": 0.0975, + "step": 4059 + }, + { + "epoch": 1.651077673851159, + "grad_norm": 11.121570839545283, + "learning_rate": 1.9510784303692563e-05, + "loss": 0.191, + "step": 4060 + }, + { + "epoch": 1.651484343228955, + "grad_norm": 12.678419132186773, + "learning_rate": 1.9510471230846422e-05, + "loss": 0.4002, + "step": 4061 + }, + { + "epoch": 1.6518910126067508, + "grad_norm": 5.04424551165553, + "learning_rate": 1.9510158060370508e-05, + "loss": 0.1481, + "step": 4062 + }, + { + "epoch": 1.6522976819845465, + "grad_norm": 9.103431268550594, + "learning_rate": 1.950984479226804e-05, + "loss": 0.2218, + "step": 4063 + }, + { + "epoch": 1.6527043513623423, + "grad_norm": 12.809611355473766, + 
"learning_rate": 1.9509531426542226e-05, + "loss": 0.3233, + "step": 4064 + }, + { + "epoch": 1.6531110207401383, + "grad_norm": 5.423418504932381, + "learning_rate": 1.9509217963196285e-05, + "loss": 0.0921, + "step": 4065 + }, + { + "epoch": 1.6535176901179343, + "grad_norm": 0.6405702658074613, + "learning_rate": 1.9508904402233436e-05, + "loss": 0.0062, + "step": 4066 + }, + { + "epoch": 1.65392435949573, + "grad_norm": 0.8331726748627941, + "learning_rate": 1.9508590743656897e-05, + "loss": 0.0108, + "step": 4067 + }, + { + "epoch": 1.6543310288735258, + "grad_norm": 10.842946308128642, + "learning_rate": 1.950827698746989e-05, + "loss": 0.2666, + "step": 4068 + }, + { + "epoch": 1.6547376982513216, + "grad_norm": 12.444711804186252, + "learning_rate": 1.9507963133675635e-05, + "loss": 0.6384, + "step": 4069 + }, + { + "epoch": 1.6551443676291175, + "grad_norm": 9.322381666241688, + "learning_rate": 1.950764918227735e-05, + "loss": 0.2715, + "step": 4070 + }, + { + "epoch": 1.6555510370069135, + "grad_norm": 8.392869845507706, + "learning_rate": 1.9507335133278264e-05, + "loss": 0.2563, + "step": 4071 + }, + { + "epoch": 1.6559577063847093, + "grad_norm": 5.168136851984709, + "learning_rate": 1.95070209866816e-05, + "loss": 0.0818, + "step": 4072 + }, + { + "epoch": 1.656364375762505, + "grad_norm": 4.061706147380262, + "learning_rate": 1.9506706742490573e-05, + "loss": 0.0574, + "step": 4073 + }, + { + "epoch": 1.6567710451403008, + "grad_norm": 1.5572863725773345, + "learning_rate": 1.9506392400708422e-05, + "loss": 0.0664, + "step": 4074 + }, + { + "epoch": 1.6571777145180968, + "grad_norm": 14.717673532894713, + "learning_rate": 1.9506077961338366e-05, + "loss": 0.3955, + "step": 4075 + }, + { + "epoch": 1.6575843838958928, + "grad_norm": 15.250659301793574, + "learning_rate": 1.950576342438364e-05, + "loss": 0.357, + "step": 4076 + }, + { + "epoch": 1.6579910532736886, + "grad_norm": 6.244218938097571, + "learning_rate": 1.9505448789847462e-05, + "loss": 0.3181, + "step": 4077 + }, + { + "epoch": 1.6583977226514843, + "grad_norm": 8.708120580415557, + "learning_rate": 1.9505134057733075e-05, + "loss": 0.1748, + "step": 4078 + }, + { + "epoch": 1.65880439202928, + "grad_norm": 9.377450019308087, + "learning_rate": 1.9504819228043697e-05, + "loss": 0.1898, + "step": 4079 + }, + { + "epoch": 1.659211061407076, + "grad_norm": 12.184475795304047, + "learning_rate": 1.9504504300782572e-05, + "loss": 0.2122, + "step": 4080 + }, + { + "epoch": 1.6596177307848718, + "grad_norm": 9.117418840711274, + "learning_rate": 1.950418927595292e-05, + "loss": 0.2964, + "step": 4081 + }, + { + "epoch": 1.6600244001626678, + "grad_norm": 5.765548781914312, + "learning_rate": 1.950387415355799e-05, + "loss": 0.2136, + "step": 4082 + }, + { + "epoch": 1.6604310695404636, + "grad_norm": 2.0390099783432856, + "learning_rate": 1.9503558933601e-05, + "loss": 0.0461, + "step": 4083 + }, + { + "epoch": 1.6608377389182594, + "grad_norm": 12.484308275816124, + "learning_rate": 1.95032436160852e-05, + "loss": 0.4134, + "step": 4084 + }, + { + "epoch": 1.6612444082960554, + "grad_norm": 5.910063244768756, + "learning_rate": 1.950292820101382e-05, + "loss": 0.1182, + "step": 4085 + }, + { + "epoch": 1.6616510776738511, + "grad_norm": 8.22382502279693, + "learning_rate": 1.9502612688390098e-05, + "loss": 0.1295, + "step": 4086 + }, + { + "epoch": 1.662057747051647, + "grad_norm": 9.198097993324147, + "learning_rate": 1.9502297078217275e-05, + "loss": 0.2751, + "step": 4087 + }, + { + "epoch": 1.6624644164294429, + 
"grad_norm": 10.439422733500841, + "learning_rate": 1.950198137049859e-05, + "loss": 0.2992, + "step": 4088 + }, + { + "epoch": 1.6628710858072386, + "grad_norm": 2.000747228354032, + "learning_rate": 1.9501665565237284e-05, + "loss": 0.0272, + "step": 4089 + }, + { + "epoch": 1.6632777551850344, + "grad_norm": 9.690437703627632, + "learning_rate": 1.95013496624366e-05, + "loss": 0.2021, + "step": 4090 + }, + { + "epoch": 1.6636844245628304, + "grad_norm": 0.2524533383592516, + "learning_rate": 1.950103366209978e-05, + "loss": 0.0041, + "step": 4091 + }, + { + "epoch": 1.6640910939406264, + "grad_norm": 16.52819821547879, + "learning_rate": 1.9500717564230064e-05, + "loss": 0.8922, + "step": 4092 + }, + { + "epoch": 1.6644977633184221, + "grad_norm": 5.975043572454385, + "learning_rate": 1.9500401368830705e-05, + "loss": 0.1018, + "step": 4093 + }, + { + "epoch": 1.664904432696218, + "grad_norm": 11.222853311142325, + "learning_rate": 1.9500085075904942e-05, + "loss": 0.2403, + "step": 4094 + }, + { + "epoch": 1.6653111020740137, + "grad_norm": 12.264870395196505, + "learning_rate": 1.9499768685456027e-05, + "loss": 0.5062, + "step": 4095 + }, + { + "epoch": 1.6657177714518097, + "grad_norm": 9.439672625080478, + "learning_rate": 1.9499452197487203e-05, + "loss": 0.2883, + "step": 4096 + }, + { + "epoch": 1.6661244408296056, + "grad_norm": 2.2057869815992612, + "learning_rate": 1.949913561200172e-05, + "loss": 0.0261, + "step": 4097 + }, + { + "epoch": 1.6665311102074014, + "grad_norm": 10.684774269770202, + "learning_rate": 1.9498818929002832e-05, + "loss": 0.3251, + "step": 4098 + }, + { + "epoch": 1.6669377795851972, + "grad_norm": 11.419680487995846, + "learning_rate": 1.9498502148493785e-05, + "loss": 0.8073, + "step": 4099 + }, + { + "epoch": 1.667344448962993, + "grad_norm": 3.680341366361295, + "learning_rate": 1.9498185270477835e-05, + "loss": 0.062, + "step": 4100 + }, + { + "epoch": 1.667751118340789, + "grad_norm": 3.7729820625482087, + "learning_rate": 1.9497868294958232e-05, + "loss": 0.1719, + "step": 4101 + }, + { + "epoch": 1.668157787718585, + "grad_norm": 20.120956410246826, + "learning_rate": 1.9497551221938228e-05, + "loss": 0.7533, + "step": 4102 + }, + { + "epoch": 1.6685644570963807, + "grad_norm": 0.8809237127586426, + "learning_rate": 1.9497234051421087e-05, + "loss": 0.0097, + "step": 4103 + }, + { + "epoch": 1.6689711264741764, + "grad_norm": 2.88733641222307, + "learning_rate": 1.9496916783410053e-05, + "loss": 0.0629, + "step": 4104 + }, + { + "epoch": 1.6693777958519722, + "grad_norm": 10.762440076078564, + "learning_rate": 1.949659941790839e-05, + "loss": 0.4405, + "step": 4105 + }, + { + "epoch": 1.6697844652297682, + "grad_norm": 11.322512061197216, + "learning_rate": 1.9496281954919354e-05, + "loss": 0.5999, + "step": 4106 + }, + { + "epoch": 1.6701911346075642, + "grad_norm": 9.53206900350968, + "learning_rate": 1.9495964394446204e-05, + "loss": 0.2956, + "step": 4107 + }, + { + "epoch": 1.67059780398536, + "grad_norm": 9.621792629176024, + "learning_rate": 1.9495646736492203e-05, + "loss": 0.1395, + "step": 4108 + }, + { + "epoch": 1.6710044733631557, + "grad_norm": 0.8955235572177548, + "learning_rate": 1.949532898106061e-05, + "loss": 0.0077, + "step": 4109 + }, + { + "epoch": 1.6714111427409515, + "grad_norm": 12.246980999141165, + "learning_rate": 1.9495011128154678e-05, + "loss": 0.3014, + "step": 4110 + }, + { + "epoch": 1.6718178121187475, + "grad_norm": 13.112984356276296, + "learning_rate": 1.9494693177777686e-05, + "loss": 0.4079, + "step": 
4111 + }, + { + "epoch": 1.6722244814965435, + "grad_norm": 23.127191468737358, + "learning_rate": 1.9494375129932888e-05, + "loss": 0.2424, + "step": 4112 + }, + { + "epoch": 1.6726311508743392, + "grad_norm": 6.46570762235631, + "learning_rate": 1.949405698462355e-05, + "loss": 0.225, + "step": 4113 + }, + { + "epoch": 1.673037820252135, + "grad_norm": 5.668649453819725, + "learning_rate": 1.9493738741852943e-05, + "loss": 0.1065, + "step": 4114 + }, + { + "epoch": 1.6734444896299308, + "grad_norm": 11.455331141676817, + "learning_rate": 1.9493420401624324e-05, + "loss": 0.1685, + "step": 4115 + }, + { + "epoch": 1.6738511590077267, + "grad_norm": 6.221662486435053, + "learning_rate": 1.9493101963940967e-05, + "loss": 0.0996, + "step": 4116 + }, + { + "epoch": 1.6742578283855227, + "grad_norm": 4.539326624643454, + "learning_rate": 1.9492783428806145e-05, + "loss": 0.0844, + "step": 4117 + }, + { + "epoch": 1.6746644977633185, + "grad_norm": 10.222488784402264, + "learning_rate": 1.949246479622312e-05, + "loss": 0.1816, + "step": 4118 + }, + { + "epoch": 1.6750711671411143, + "grad_norm": 10.583178942596666, + "learning_rate": 1.9492146066195166e-05, + "loss": 0.4449, + "step": 4119 + }, + { + "epoch": 1.67547783651891, + "grad_norm": 6.880814837547932, + "learning_rate": 1.9491827238725556e-05, + "loss": 0.1564, + "step": 4120 + }, + { + "epoch": 1.675884505896706, + "grad_norm": 10.894526288173992, + "learning_rate": 1.9491508313817562e-05, + "loss": 0.2332, + "step": 4121 + }, + { + "epoch": 1.6762911752745018, + "grad_norm": 1.7447693754806195, + "learning_rate": 1.949118929147446e-05, + "loss": 0.0277, + "step": 4122 + }, + { + "epoch": 1.6766978446522978, + "grad_norm": 1.2588411232368315, + "learning_rate": 1.949087017169952e-05, + "loss": 0.0126, + "step": 4123 + }, + { + "epoch": 1.6771045140300935, + "grad_norm": 19.5125077422218, + "learning_rate": 1.9490550954496023e-05, + "loss": 1.3271, + "step": 4124 + }, + { + "epoch": 1.6775111834078893, + "grad_norm": 10.965396100038962, + "learning_rate": 1.9490231639867245e-05, + "loss": 0.4038, + "step": 4125 + }, + { + "epoch": 1.6779178527856853, + "grad_norm": 14.746928350059475, + "learning_rate": 1.948991222781646e-05, + "loss": 0.9352, + "step": 4126 + }, + { + "epoch": 1.678324522163481, + "grad_norm": 21.718371553348483, + "learning_rate": 1.9489592718346954e-05, + "loss": 0.738, + "step": 4127 + }, + { + "epoch": 1.678731191541277, + "grad_norm": 16.80851792357857, + "learning_rate": 1.9489273111462e-05, + "loss": 0.5938, + "step": 4128 + }, + { + "epoch": 1.6791378609190728, + "grad_norm": 12.635549155386125, + "learning_rate": 1.9488953407164883e-05, + "loss": 0.4178, + "step": 4129 + }, + { + "epoch": 1.6795445302968686, + "grad_norm": 6.722116817145499, + "learning_rate": 1.948863360545888e-05, + "loss": 0.0917, + "step": 4130 + }, + { + "epoch": 1.6799511996746646, + "grad_norm": 6.371504626833329, + "learning_rate": 1.948831370634728e-05, + "loss": 0.1186, + "step": 4131 + }, + { + "epoch": 1.6803578690524603, + "grad_norm": 14.877631914347011, + "learning_rate": 1.9487993709833367e-05, + "loss": 0.7439, + "step": 4132 + }, + { + "epoch": 1.6807645384302563, + "grad_norm": 0.17591987391645506, + "learning_rate": 1.948767361592042e-05, + "loss": 0.0027, + "step": 4133 + }, + { + "epoch": 1.681171207808052, + "grad_norm": 9.696530225404532, + "learning_rate": 1.9487353424611732e-05, + "loss": 0.6042, + "step": 4134 + }, + { + "epoch": 1.6815778771858478, + "grad_norm": 13.63327647050872, + "learning_rate": 
1.948703313591058e-05, + "loss": 0.2348, + "step": 4135 + }, + { + "epoch": 1.6819845465636436, + "grad_norm": 9.898304182200034, + "learning_rate": 1.9486712749820264e-05, + "loss": 0.6277, + "step": 4136 + }, + { + "epoch": 1.6823912159414396, + "grad_norm": 2.6069354905114963, + "learning_rate": 1.9486392266344064e-05, + "loss": 0.0413, + "step": 4137 + }, + { + "epoch": 1.6827978853192356, + "grad_norm": 26.30783682954201, + "learning_rate": 1.9486071685485272e-05, + "loss": 1.505, + "step": 4138 + }, + { + "epoch": 1.6832045546970313, + "grad_norm": 9.856541644946288, + "learning_rate": 1.948575100724718e-05, + "loss": 0.5471, + "step": 4139 + }, + { + "epoch": 1.683611224074827, + "grad_norm": 9.562817616532435, + "learning_rate": 1.9485430231633082e-05, + "loss": 0.46, + "step": 4140 + }, + { + "epoch": 1.6840178934526229, + "grad_norm": 9.630884647819734, + "learning_rate": 1.948510935864627e-05, + "loss": 0.2648, + "step": 4141 + }, + { + "epoch": 1.6844245628304189, + "grad_norm": 5.022913016101389, + "learning_rate": 1.9484788388290034e-05, + "loss": 0.0733, + "step": 4142 + }, + { + "epoch": 1.6848312322082148, + "grad_norm": 9.563066935246221, + "learning_rate": 1.9484467320567672e-05, + "loss": 0.6087, + "step": 4143 + }, + { + "epoch": 1.6852379015860106, + "grad_norm": 7.43728196345172, + "learning_rate": 1.948414615548248e-05, + "loss": 0.3946, + "step": 4144 + }, + { + "epoch": 1.6856445709638064, + "grad_norm": 4.791933224766836, + "learning_rate": 1.9483824893037757e-05, + "loss": 0.1654, + "step": 4145 + }, + { + "epoch": 1.6860512403416021, + "grad_norm": 15.87222644898335, + "learning_rate": 1.9483503533236794e-05, + "loss": 0.8009, + "step": 4146 + }, + { + "epoch": 1.6864579097193981, + "grad_norm": 7.376544081676839, + "learning_rate": 1.9483182076082892e-05, + "loss": 0.1507, + "step": 4147 + }, + { + "epoch": 1.6868645790971941, + "grad_norm": 7.266918320223307, + "learning_rate": 1.948286052157936e-05, + "loss": 0.406, + "step": 4148 + }, + { + "epoch": 1.6872712484749899, + "grad_norm": 5.679206478013035, + "learning_rate": 1.948253886972949e-05, + "loss": 0.1603, + "step": 4149 + }, + { + "epoch": 1.6876779178527856, + "grad_norm": 7.212506878055078, + "learning_rate": 1.9482217120536588e-05, + "loss": 0.3321, + "step": 4150 + }, + { + "epoch": 1.6880845872305814, + "grad_norm": 2.9982625135977026, + "learning_rate": 1.9481895274003947e-05, + "loss": 0.0506, + "step": 4151 + }, + { + "epoch": 1.6884912566083774, + "grad_norm": 7.816958758429257, + "learning_rate": 1.9481573330134885e-05, + "loss": 0.1907, + "step": 4152 + }, + { + "epoch": 1.6888979259861734, + "grad_norm": 12.162457002047539, + "learning_rate": 1.94812512889327e-05, + "loss": 0.4361, + "step": 4153 + }, + { + "epoch": 1.6893045953639692, + "grad_norm": 4.837924761978015, + "learning_rate": 1.94809291504007e-05, + "loss": 0.0863, + "step": 4154 + }, + { + "epoch": 1.689711264741765, + "grad_norm": 2.8481820515647285, + "learning_rate": 1.948060691454219e-05, + "loss": 0.0564, + "step": 4155 + }, + { + "epoch": 1.6901179341195607, + "grad_norm": 7.3145828719402886, + "learning_rate": 1.9480284581360477e-05, + "loss": 0.2641, + "step": 4156 + }, + { + "epoch": 1.6905246034973567, + "grad_norm": 8.461698621427534, + "learning_rate": 1.9479962150858874e-05, + "loss": 0.3572, + "step": 4157 + }, + { + "epoch": 1.6909312728751527, + "grad_norm": 0.2864710760465442, + "learning_rate": 1.9479639623040684e-05, + "loss": 0.0067, + "step": 4158 + }, + { + "epoch": 1.6913379422529484, + "grad_norm": 
1.6139978120072864, + "learning_rate": 1.9479316997909226e-05, + "loss": 0.0235, + "step": 4159 + }, + { + "epoch": 1.6917446116307442, + "grad_norm": 2.2440615136909483, + "learning_rate": 1.9478994275467808e-05, + "loss": 0.0905, + "step": 4160 + }, + { + "epoch": 1.69215128100854, + "grad_norm": 1.5447829277250078, + "learning_rate": 1.9478671455719742e-05, + "loss": 0.0216, + "step": 4161 + }, + { + "epoch": 1.692557950386336, + "grad_norm": 11.152819083551043, + "learning_rate": 1.947834853866834e-05, + "loss": 0.3173, + "step": 4162 + }, + { + "epoch": 1.6929646197641317, + "grad_norm": 11.94933043969229, + "learning_rate": 1.9478025524316922e-05, + "loss": 0.393, + "step": 4163 + }, + { + "epoch": 1.6933712891419277, + "grad_norm": 4.841380032204604, + "learning_rate": 1.9477702412668808e-05, + "loss": 0.095, + "step": 4164 + }, + { + "epoch": 1.6937779585197235, + "grad_norm": 7.769335583437704, + "learning_rate": 1.94773792037273e-05, + "loss": 0.2018, + "step": 4165 + }, + { + "epoch": 1.6941846278975192, + "grad_norm": 18.677497782490715, + "learning_rate": 1.947705589749573e-05, + "loss": 1.0606, + "step": 4166 + }, + { + "epoch": 1.6945912972753152, + "grad_norm": 1.845804808085709, + "learning_rate": 1.9476732493977408e-05, + "loss": 0.0255, + "step": 4167 + }, + { + "epoch": 1.694997966653111, + "grad_norm": 9.956545219736359, + "learning_rate": 1.9476408993175662e-05, + "loss": 0.1886, + "step": 4168 + }, + { + "epoch": 1.695404636030907, + "grad_norm": 2.1656481680139565, + "learning_rate": 1.9476085395093802e-05, + "loss": 0.0313, + "step": 4169 + }, + { + "epoch": 1.6958113054087027, + "grad_norm": 1.439430144162866, + "learning_rate": 1.9475761699735162e-05, + "loss": 0.0203, + "step": 4170 + }, + { + "epoch": 1.6962179747864985, + "grad_norm": 1.5750280517671054, + "learning_rate": 1.9475437907103058e-05, + "loss": 0.0268, + "step": 4171 + }, + { + "epoch": 1.6966246441642945, + "grad_norm": 28.079484996858284, + "learning_rate": 1.947511401720081e-05, + "loss": 1.0994, + "step": 4172 + }, + { + "epoch": 1.6970313135420902, + "grad_norm": 6.421126397658816, + "learning_rate": 1.947479003003175e-05, + "loss": 0.1327, + "step": 4173 + }, + { + "epoch": 1.6974379829198862, + "grad_norm": 4.885236467275991, + "learning_rate": 1.9474465945599206e-05, + "loss": 0.1454, + "step": 4174 + }, + { + "epoch": 1.697844652297682, + "grad_norm": 14.036246672510675, + "learning_rate": 1.94741417639065e-05, + "loss": 0.3639, + "step": 4175 + }, + { + "epoch": 1.6982513216754778, + "grad_norm": 8.062153663295971, + "learning_rate": 1.9473817484956957e-05, + "loss": 0.1661, + "step": 4176 + }, + { + "epoch": 1.6986579910532735, + "grad_norm": 3.7218611285620176, + "learning_rate": 1.9473493108753908e-05, + "loss": 0.0537, + "step": 4177 + }, + { + "epoch": 1.6990646604310695, + "grad_norm": 12.484187391180548, + "learning_rate": 1.9473168635300688e-05, + "loss": 0.7399, + "step": 4178 + }, + { + "epoch": 1.6994713298088655, + "grad_norm": 36.702809962142226, + "learning_rate": 1.9472844064600626e-05, + "loss": 0.2934, + "step": 4179 + }, + { + "epoch": 1.6998779991866613, + "grad_norm": 1.2734737565516148, + "learning_rate": 1.947251939665705e-05, + "loss": 0.0206, + "step": 4180 + }, + { + "epoch": 1.700284668564457, + "grad_norm": 8.953479361324417, + "learning_rate": 1.9472194631473294e-05, + "loss": 0.196, + "step": 4181 + }, + { + "epoch": 1.7006913379422528, + "grad_norm": 14.434612460378446, + "learning_rate": 1.9471869769052693e-05, + "loss": 0.8007, + "step": 4182 + }, + { + 
"epoch": 1.7010980073200488, + "grad_norm": 9.57367039585902, + "learning_rate": 1.947154480939858e-05, + "loss": 0.2358, + "step": 4183 + }, + { + "epoch": 1.7015046766978448, + "grad_norm": 4.156816526883824, + "learning_rate": 1.9471219752514295e-05, + "loss": 0.0851, + "step": 4184 + }, + { + "epoch": 1.7019113460756405, + "grad_norm": 18.09758837995621, + "learning_rate": 1.9470894598403177e-05, + "loss": 0.3107, + "step": 4185 + }, + { + "epoch": 1.7023180154534363, + "grad_norm": 12.20225118400726, + "learning_rate": 1.947056934706855e-05, + "loss": 0.1597, + "step": 4186 + }, + { + "epoch": 1.702724684831232, + "grad_norm": 11.590905112630391, + "learning_rate": 1.9470243998513768e-05, + "loss": 0.4167, + "step": 4187 + }, + { + "epoch": 1.703131354209028, + "grad_norm": 7.733545989704991, + "learning_rate": 1.9469918552742165e-05, + "loss": 0.183, + "step": 4188 + }, + { + "epoch": 1.703538023586824, + "grad_norm": 2.9793298795825724, + "learning_rate": 1.9469593009757083e-05, + "loss": 0.0663, + "step": 4189 + }, + { + "epoch": 1.7039446929646198, + "grad_norm": 11.814986606944508, + "learning_rate": 1.946926736956186e-05, + "loss": 0.2126, + "step": 4190 + }, + { + "epoch": 1.7043513623424156, + "grad_norm": 3.5948057538053613, + "learning_rate": 1.946894163215984e-05, + "loss": 0.0861, + "step": 4191 + }, + { + "epoch": 1.7047580317202113, + "grad_norm": 1.7555603444441767, + "learning_rate": 1.9468615797554374e-05, + "loss": 0.0371, + "step": 4192 + }, + { + "epoch": 1.7051647010980073, + "grad_norm": 16.34041635596134, + "learning_rate": 1.94682898657488e-05, + "loss": 0.3787, + "step": 4193 + }, + { + "epoch": 1.7055713704758033, + "grad_norm": 10.722445005735857, + "learning_rate": 1.946796383674646e-05, + "loss": 0.6483, + "step": 4194 + }, + { + "epoch": 1.705978039853599, + "grad_norm": 7.361583860256193, + "learning_rate": 1.946763771055071e-05, + "loss": 0.2243, + "step": 4195 + }, + { + "epoch": 1.7063847092313948, + "grad_norm": 13.893387549523768, + "learning_rate": 1.9467311487164893e-05, + "loss": 0.6263, + "step": 4196 + }, + { + "epoch": 1.7067913786091906, + "grad_norm": 10.047479850377211, + "learning_rate": 1.946698516659236e-05, + "loss": 0.2319, + "step": 4197 + }, + { + "epoch": 1.7071980479869866, + "grad_norm": 15.34422339534265, + "learning_rate": 1.9466658748836458e-05, + "loss": 1.3197, + "step": 4198 + }, + { + "epoch": 1.7076047173647826, + "grad_norm": 9.754736335417297, + "learning_rate": 1.946633223390054e-05, + "loss": 0.3673, + "step": 4199 + }, + { + "epoch": 1.7080113867425784, + "grad_norm": 28.809002106439433, + "learning_rate": 1.9466005621787957e-05, + "loss": 0.4306, + "step": 4200 + }, + { + "epoch": 1.7084180561203741, + "grad_norm": 9.180786732591699, + "learning_rate": 1.9465678912502064e-05, + "loss": 0.5766, + "step": 4201 + }, + { + "epoch": 1.7088247254981699, + "grad_norm": 8.516813116460972, + "learning_rate": 1.946535210604621e-05, + "loss": 0.5528, + "step": 4202 + }, + { + "epoch": 1.7092313948759659, + "grad_norm": 0.6260903221162364, + "learning_rate": 1.9465025202423757e-05, + "loss": 0.0086, + "step": 4203 + }, + { + "epoch": 1.7096380642537619, + "grad_norm": 19.318539179037597, + "learning_rate": 1.9464698201638055e-05, + "loss": 0.5949, + "step": 4204 + }, + { + "epoch": 1.7100447336315576, + "grad_norm": 9.949374565828773, + "learning_rate": 1.946437110369246e-05, + "loss": 0.446, + "step": 4205 + }, + { + "epoch": 1.7104514030093534, + "grad_norm": 12.379962645247192, + "learning_rate": 1.9464043908590334e-05, + 
"loss": 0.4031, + "step": 4206 + }, + { + "epoch": 1.7108580723871492, + "grad_norm": 0.47063087144978316, + "learning_rate": 1.9463716616335032e-05, + "loss": 0.0075, + "step": 4207 + }, + { + "epoch": 1.7112647417649451, + "grad_norm": 9.088204502442768, + "learning_rate": 1.9463389226929922e-05, + "loss": 0.6384, + "step": 4208 + }, + { + "epoch": 1.711671411142741, + "grad_norm": 14.719416566000925, + "learning_rate": 1.9463061740378353e-05, + "loss": 0.8617, + "step": 4209 + }, + { + "epoch": 1.712078080520537, + "grad_norm": 6.0260983048571966, + "learning_rate": 1.9462734156683693e-05, + "loss": 0.2988, + "step": 4210 + }, + { + "epoch": 1.7124847498983327, + "grad_norm": 10.584736883980053, + "learning_rate": 1.9462406475849308e-05, + "loss": 0.6014, + "step": 4211 + }, + { + "epoch": 1.7128914192761284, + "grad_norm": 6.724711885097328, + "learning_rate": 1.9462078697878554e-05, + "loss": 0.2113, + "step": 4212 + }, + { + "epoch": 1.7132980886539244, + "grad_norm": 0.038270125953467904, + "learning_rate": 1.94617508227748e-05, + "loss": 0.0006, + "step": 4213 + }, + { + "epoch": 1.7137047580317202, + "grad_norm": 15.468122222231887, + "learning_rate": 1.946142285054142e-05, + "loss": 0.2736, + "step": 4214 + }, + { + "epoch": 1.7141114274095162, + "grad_norm": 18.26642318960721, + "learning_rate": 1.9461094781181763e-05, + "loss": 0.4627, + "step": 4215 + }, + { + "epoch": 1.714518096787312, + "grad_norm": 9.181650703136981, + "learning_rate": 1.946076661469921e-05, + "loss": 0.3931, + "step": 4216 + }, + { + "epoch": 1.7149247661651077, + "grad_norm": 10.707108644253074, + "learning_rate": 1.9460438351097127e-05, + "loss": 0.4834, + "step": 4217 + }, + { + "epoch": 1.7153314355429035, + "grad_norm": 16.822313357145646, + "learning_rate": 1.9460109990378882e-05, + "loss": 0.7407, + "step": 4218 + }, + { + "epoch": 1.7157381049206994, + "grad_norm": 6.375717615913408, + "learning_rate": 1.9459781532547846e-05, + "loss": 0.2963, + "step": 4219 + }, + { + "epoch": 1.7161447742984954, + "grad_norm": 2.2838636011691076, + "learning_rate": 1.9459452977607394e-05, + "loss": 0.0609, + "step": 4220 + }, + { + "epoch": 1.7165514436762912, + "grad_norm": 8.06931730828229, + "learning_rate": 1.9459124325560895e-05, + "loss": 0.1098, + "step": 4221 + }, + { + "epoch": 1.716958113054087, + "grad_norm": 10.456545802195501, + "learning_rate": 1.945879557641172e-05, + "loss": 0.537, + "step": 4222 + }, + { + "epoch": 1.7173647824318827, + "grad_norm": 11.572176979415527, + "learning_rate": 1.9458466730163255e-05, + "loss": 0.5685, + "step": 4223 + }, + { + "epoch": 1.7177714518096787, + "grad_norm": 2.180967146017599, + "learning_rate": 1.9458137786818866e-05, + "loss": 0.0357, + "step": 4224 + }, + { + "epoch": 1.7181781211874747, + "grad_norm": 38.42327311644864, + "learning_rate": 1.9457808746381934e-05, + "loss": 0.5762, + "step": 4225 + }, + { + "epoch": 1.7185847905652705, + "grad_norm": 9.821260631701154, + "learning_rate": 1.945747960885583e-05, + "loss": 0.3606, + "step": 4226 + }, + { + "epoch": 1.7189914599430662, + "grad_norm": 10.223292241885765, + "learning_rate": 1.9457150374243944e-05, + "loss": 0.7127, + "step": 4227 + }, + { + "epoch": 1.719398129320862, + "grad_norm": 8.525763859395095, + "learning_rate": 1.9456821042549644e-05, + "loss": 0.276, + "step": 4228 + }, + { + "epoch": 1.719804798698658, + "grad_norm": 14.31528098587799, + "learning_rate": 1.9456491613776317e-05, + "loss": 0.8295, + "step": 4229 + }, + { + "epoch": 1.720211468076454, + "grad_norm": 7.9520935883013, + 
"learning_rate": 1.945616208792735e-05, + "loss": 0.2472, + "step": 4230 + }, + { + "epoch": 1.7206181374542497, + "grad_norm": 11.216333198860811, + "learning_rate": 1.9455832465006118e-05, + "loss": 0.429, + "step": 4231 + }, + { + "epoch": 1.7210248068320455, + "grad_norm": 3.586696752650706, + "learning_rate": 1.9455502745016e-05, + "loss": 0.0876, + "step": 4232 + }, + { + "epoch": 1.7214314762098413, + "grad_norm": 9.820786663731713, + "learning_rate": 1.9455172927960396e-05, + "loss": 0.4203, + "step": 4233 + }, + { + "epoch": 1.7218381455876373, + "grad_norm": 43.801282938842526, + "learning_rate": 1.945484301384268e-05, + "loss": 0.5813, + "step": 4234 + }, + { + "epoch": 1.7222448149654332, + "grad_norm": 9.545309164098487, + "learning_rate": 1.945451300266624e-05, + "loss": 0.1933, + "step": 4235 + }, + { + "epoch": 1.722651484343229, + "grad_norm": 4.5183973914667925, + "learning_rate": 1.9454182894434468e-05, + "loss": 0.0763, + "step": 4236 + }, + { + "epoch": 1.7230581537210248, + "grad_norm": 61.16111005051653, + "learning_rate": 1.945385268915075e-05, + "loss": 1.0116, + "step": 4237 + }, + { + "epoch": 1.7234648230988205, + "grad_norm": 5.374971310937625, + "learning_rate": 1.9453522386818476e-05, + "loss": 0.1609, + "step": 4238 + }, + { + "epoch": 1.7238714924766165, + "grad_norm": 18.520663433768803, + "learning_rate": 1.9453191987441036e-05, + "loss": 0.8453, + "step": 4239 + }, + { + "epoch": 1.7242781618544125, + "grad_norm": 12.927492110637363, + "learning_rate": 1.9452861491021826e-05, + "loss": 0.5667, + "step": 4240 + }, + { + "epoch": 1.7246848312322083, + "grad_norm": 1.5397232397050984, + "learning_rate": 1.9452530897564232e-05, + "loss": 0.0113, + "step": 4241 + }, + { + "epoch": 1.725091500610004, + "grad_norm": 1.0791362490678238, + "learning_rate": 1.945220020707165e-05, + "loss": 0.0146, + "step": 4242 + }, + { + "epoch": 1.7254981699877998, + "grad_norm": 9.754464400473198, + "learning_rate": 1.9451869419547478e-05, + "loss": 0.1737, + "step": 4243 + }, + { + "epoch": 1.7259048393655958, + "grad_norm": 0.1222053088470735, + "learning_rate": 1.945153853499511e-05, + "loss": 0.0017, + "step": 4244 + }, + { + "epoch": 1.7263115087433918, + "grad_norm": 15.464179630453613, + "learning_rate": 1.9451207553417938e-05, + "loss": 0.7535, + "step": 4245 + }, + { + "epoch": 1.7267181781211876, + "grad_norm": 16.380695399960853, + "learning_rate": 1.9450876474819367e-05, + "loss": 0.9473, + "step": 4246 + }, + { + "epoch": 1.7271248474989833, + "grad_norm": 183.09806911089996, + "learning_rate": 1.9450545299202794e-05, + "loss": 0.8331, + "step": 4247 + }, + { + "epoch": 1.727531516876779, + "grad_norm": 16.921562395407115, + "learning_rate": 1.9450214026571614e-05, + "loss": 0.6473, + "step": 4248 + }, + { + "epoch": 1.727938186254575, + "grad_norm": 11.189851257899535, + "learning_rate": 1.9449882656929232e-05, + "loss": 0.4577, + "step": 4249 + }, + { + "epoch": 1.7283448556323708, + "grad_norm": 16.855940629237217, + "learning_rate": 1.9449551190279047e-05, + "loss": 0.7232, + "step": 4250 + }, + { + "epoch": 1.7287515250101668, + "grad_norm": 9.282338099383963, + "learning_rate": 1.9449219626624466e-05, + "loss": 0.2845, + "step": 4251 + }, + { + "epoch": 1.7291581943879626, + "grad_norm": 6.809153519723955, + "learning_rate": 1.944888796596889e-05, + "loss": 0.1484, + "step": 4252 + }, + { + "epoch": 1.7295648637657584, + "grad_norm": 13.358985054518378, + "learning_rate": 1.944855620831572e-05, + "loss": 0.577, + "step": 4253 + }, + { + "epoch": 
1.7299715331435543, + "grad_norm": 9.844732648876743, + "learning_rate": 1.944822435366837e-05, + "loss": 0.3928, + "step": 4254 + }, + { + "epoch": 1.73037820252135, + "grad_norm": 9.59914219855543, + "learning_rate": 1.9447892402030238e-05, + "loss": 0.3104, + "step": 4255 + }, + { + "epoch": 1.730784871899146, + "grad_norm": 6.236954107446574, + "learning_rate": 1.9447560353404735e-05, + "loss": 0.1343, + "step": 4256 + }, + { + "epoch": 1.7311915412769419, + "grad_norm": 11.136431844542011, + "learning_rate": 1.9447228207795274e-05, + "loss": 0.3423, + "step": 4257 + }, + { + "epoch": 1.7315982106547376, + "grad_norm": 13.720410588283526, + "learning_rate": 1.9446895965205256e-05, + "loss": 0.8169, + "step": 4258 + }, + { + "epoch": 1.7320048800325334, + "grad_norm": 6.605014822756276, + "learning_rate": 1.94465636256381e-05, + "loss": 0.1148, + "step": 4259 + }, + { + "epoch": 1.7324115494103294, + "grad_norm": 18.745811317094425, + "learning_rate": 1.9446231189097214e-05, + "loss": 1.1658, + "step": 4260 + }, + { + "epoch": 1.7328182187881254, + "grad_norm": 3.4433296405722276, + "learning_rate": 1.9445898655586006e-05, + "loss": 0.0097, + "step": 4261 + }, + { + "epoch": 1.7332248881659211, + "grad_norm": 5.3391964703413155, + "learning_rate": 1.94455660251079e-05, + "loss": 0.0806, + "step": 4262 + }, + { + "epoch": 1.733631557543717, + "grad_norm": 9.218826195434298, + "learning_rate": 1.9445233297666302e-05, + "loss": 0.126, + "step": 4263 + }, + { + "epoch": 1.7340382269215127, + "grad_norm": 15.549115822436105, + "learning_rate": 1.944490047326463e-05, + "loss": 0.5995, + "step": 4264 + }, + { + "epoch": 1.7344448962993086, + "grad_norm": 20.030880005539327, + "learning_rate": 1.9444567551906302e-05, + "loss": 0.4265, + "step": 4265 + }, + { + "epoch": 1.7348515656771046, + "grad_norm": 4.957902684805611, + "learning_rate": 1.9444234533594736e-05, + "loss": 0.0935, + "step": 4266 + }, + { + "epoch": 1.7352582350549004, + "grad_norm": 72.41924855100218, + "learning_rate": 1.9443901418333348e-05, + "loss": 0.7293, + "step": 4267 + }, + { + "epoch": 1.7356649044326962, + "grad_norm": 5.7132845452817325, + "learning_rate": 1.944356820612556e-05, + "loss": 0.1052, + "step": 4268 + }, + { + "epoch": 1.736071573810492, + "grad_norm": 5.313505260147363, + "learning_rate": 1.944323489697479e-05, + "loss": 0.172, + "step": 4269 + }, + { + "epoch": 1.736478243188288, + "grad_norm": 10.549184593643167, + "learning_rate": 1.9442901490884462e-05, + "loss": 0.3437, + "step": 4270 + }, + { + "epoch": 1.736884912566084, + "grad_norm": 9.444214639130292, + "learning_rate": 1.9442567987857998e-05, + "loss": 0.3717, + "step": 4271 + }, + { + "epoch": 1.7372915819438797, + "grad_norm": 6.62627170533699, + "learning_rate": 1.944223438789882e-05, + "loss": 0.1372, + "step": 4272 + }, + { + "epoch": 1.7376982513216754, + "grad_norm": 25.182235194750756, + "learning_rate": 1.9441900691010354e-05, + "loss": 0.6994, + "step": 4273 + }, + { + "epoch": 1.7381049206994712, + "grad_norm": 7.45203437006776, + "learning_rate": 1.9441566897196025e-05, + "loss": 0.1569, + "step": 4274 + }, + { + "epoch": 1.7385115900772672, + "grad_norm": 10.208444045360146, + "learning_rate": 1.9441233006459263e-05, + "loss": 0.3624, + "step": 4275 + }, + { + "epoch": 1.7389182594550632, + "grad_norm": 6.204553409936863, + "learning_rate": 1.944089901880349e-05, + "loss": 0.1073, + "step": 4276 + }, + { + "epoch": 1.739324928832859, + "grad_norm": 27.325046235936867, + "learning_rate": 1.944056493423214e-05, + "loss": 0.4895, + 
"step": 4277 + }, + { + "epoch": 1.7397315982106547, + "grad_norm": 8.283021584493305, + "learning_rate": 1.9440230752748635e-05, + "loss": 0.2448, + "step": 4278 + }, + { + "epoch": 1.7401382675884505, + "grad_norm": 36.02701371244533, + "learning_rate": 1.943989647435641e-05, + "loss": 0.9141, + "step": 4279 + }, + { + "epoch": 1.7405449369662465, + "grad_norm": 1.3220362866956383, + "learning_rate": 1.94395620990589e-05, + "loss": 0.0213, + "step": 4280 + }, + { + "epoch": 1.7409516063440424, + "grad_norm": 3.919151815676175, + "learning_rate": 1.9439227626859535e-05, + "loss": 0.0814, + "step": 4281 + }, + { + "epoch": 1.7413582757218382, + "grad_norm": 11.952314495831562, + "learning_rate": 1.9438893057761744e-05, + "loss": 0.532, + "step": 4282 + }, + { + "epoch": 1.741764945099634, + "grad_norm": 13.14224973397429, + "learning_rate": 1.9438558391768966e-05, + "loss": 0.4997, + "step": 4283 + }, + { + "epoch": 1.7421716144774297, + "grad_norm": 13.105768843301295, + "learning_rate": 1.943822362888464e-05, + "loss": 0.462, + "step": 4284 + }, + { + "epoch": 1.7425782838552257, + "grad_norm": 0.22760309023250305, + "learning_rate": 1.9437888769112194e-05, + "loss": 0.0048, + "step": 4285 + }, + { + "epoch": 1.7429849532330217, + "grad_norm": 1.2122917537862956, + "learning_rate": 1.9437553812455073e-05, + "loss": 0.016, + "step": 4286 + }, + { + "epoch": 1.7433916226108175, + "grad_norm": 4.296357347642929, + "learning_rate": 1.943721875891671e-05, + "loss": 0.1245, + "step": 4287 + }, + { + "epoch": 1.7437982919886132, + "grad_norm": 6.692454795749995, + "learning_rate": 1.9436883608500545e-05, + "loss": 0.1831, + "step": 4288 + }, + { + "epoch": 1.744204961366409, + "grad_norm": 19.26970279867792, + "learning_rate": 1.9436548361210025e-05, + "loss": 1.1624, + "step": 4289 + }, + { + "epoch": 1.744611630744205, + "grad_norm": 5.446794718171672, + "learning_rate": 1.9436213017048582e-05, + "loss": 0.2181, + "step": 4290 + }, + { + "epoch": 1.7450183001220008, + "grad_norm": 6.994217834834449, + "learning_rate": 1.9435877576019665e-05, + "loss": 0.1124, + "step": 4291 + }, + { + "epoch": 1.7454249694997968, + "grad_norm": 10.52997618903713, + "learning_rate": 1.9435542038126717e-05, + "loss": 0.2566, + "step": 4292 + }, + { + "epoch": 1.7458316388775925, + "grad_norm": 11.726419417164694, + "learning_rate": 1.943520640337318e-05, + "loss": 0.4764, + "step": 4293 + }, + { + "epoch": 1.7462383082553883, + "grad_norm": 0.840611042900081, + "learning_rate": 1.94348706717625e-05, + "loss": 0.0138, + "step": 4294 + }, + { + "epoch": 1.7466449776331843, + "grad_norm": 16.17509445265295, + "learning_rate": 1.9434534843298126e-05, + "loss": 0.5828, + "step": 4295 + }, + { + "epoch": 1.74705164701098, + "grad_norm": 1.0047811098969193, + "learning_rate": 1.9434198917983502e-05, + "loss": 0.0108, + "step": 4296 + }, + { + "epoch": 1.747458316388776, + "grad_norm": 8.353247618763083, + "learning_rate": 1.943386289582208e-05, + "loss": 0.2498, + "step": 4297 + }, + { + "epoch": 1.7478649857665718, + "grad_norm": 15.137467668033773, + "learning_rate": 1.9433526776817305e-05, + "loss": 0.4928, + "step": 4298 + }, + { + "epoch": 1.7482716551443676, + "grad_norm": 7.003475037377957, + "learning_rate": 1.943319056097263e-05, + "loss": 0.3856, + "step": 4299 + }, + { + "epoch": 1.7486783245221633, + "grad_norm": 67.1143685157282, + "learning_rate": 1.9432854248291505e-05, + "loss": 0.5271, + "step": 4300 + }, + { + "epoch": 1.7490849938999593, + "grad_norm": 12.478749739847546, + "learning_rate": 
1.943251783877739e-05, + "loss": 0.3612, + "step": 4301 + }, + { + "epoch": 1.7494916632777553, + "grad_norm": 3.450291598385786, + "learning_rate": 1.9432181332433725e-05, + "loss": 0.0742, + "step": 4302 + }, + { + "epoch": 1.749898332655551, + "grad_norm": 6.41254724173254, + "learning_rate": 1.9431844729263973e-05, + "loss": 0.2673, + "step": 4303 + }, + { + "epoch": 1.7503050020333468, + "grad_norm": 6.768861829397419, + "learning_rate": 1.9431508029271592e-05, + "loss": 0.3715, + "step": 4304 + }, + { + "epoch": 1.7507116714111426, + "grad_norm": 17.83510502476784, + "learning_rate": 1.943117123246003e-05, + "loss": 0.6327, + "step": 4305 + }, + { + "epoch": 1.7511183407889386, + "grad_norm": 15.820894700864121, + "learning_rate": 1.943083433883275e-05, + "loss": 0.4944, + "step": 4306 + }, + { + "epoch": 1.7515250101667346, + "grad_norm": 13.524033939887529, + "learning_rate": 1.9430497348393208e-05, + "loss": 0.3212, + "step": 4307 + }, + { + "epoch": 1.7519316795445303, + "grad_norm": 47.64039090225091, + "learning_rate": 1.943016026114487e-05, + "loss": 0.8561, + "step": 4308 + }, + { + "epoch": 1.752338348922326, + "grad_norm": 18.82099352851847, + "learning_rate": 1.9429823077091187e-05, + "loss": 1.4636, + "step": 4309 + }, + { + "epoch": 1.7527450183001219, + "grad_norm": 18.814934014090802, + "learning_rate": 1.9429485796235622e-05, + "loss": 0.5467, + "step": 4310 + }, + { + "epoch": 1.7531516876779178, + "grad_norm": 5.918655593517824, + "learning_rate": 1.942914841858164e-05, + "loss": 0.0867, + "step": 4311 + }, + { + "epoch": 1.7535583570557138, + "grad_norm": 2.1414714143991347, + "learning_rate": 1.9428810944132705e-05, + "loss": 0.0291, + "step": 4312 + }, + { + "epoch": 1.7539650264335096, + "grad_norm": 11.250797671774993, + "learning_rate": 1.942847337289228e-05, + "loss": 0.1735, + "step": 4313 + }, + { + "epoch": 1.7543716958113054, + "grad_norm": 17.04352976405492, + "learning_rate": 1.9428135704863834e-05, + "loss": 0.3256, + "step": 4314 + }, + { + "epoch": 1.7547783651891011, + "grad_norm": 9.57660040406196, + "learning_rate": 1.9427797940050825e-05, + "loss": 0.4395, + "step": 4315 + }, + { + "epoch": 1.7551850345668971, + "grad_norm": 9.177296368663432, + "learning_rate": 1.942746007845673e-05, + "loss": 0.227, + "step": 4316 + }, + { + "epoch": 1.755591703944693, + "grad_norm": 17.087348569242252, + "learning_rate": 1.9427122120085005e-05, + "loss": 0.7915, + "step": 4317 + }, + { + "epoch": 1.7559983733224889, + "grad_norm": 12.074777759593367, + "learning_rate": 1.9426784064939134e-05, + "loss": 0.2797, + "step": 4318 + }, + { + "epoch": 1.7564050427002846, + "grad_norm": 13.87059042182445, + "learning_rate": 1.9426445913022574e-05, + "loss": 0.548, + "step": 4319 + }, + { + "epoch": 1.7568117120780804, + "grad_norm": 2.266103765806339, + "learning_rate": 1.9426107664338806e-05, + "loss": 0.0465, + "step": 4320 + }, + { + "epoch": 1.7572183814558764, + "grad_norm": 17.666707497265495, + "learning_rate": 1.94257693188913e-05, + "loss": 1.0001, + "step": 4321 + }, + { + "epoch": 1.7576250508336724, + "grad_norm": 8.594578980213718, + "learning_rate": 1.942543087668353e-05, + "loss": 0.1434, + "step": 4322 + }, + { + "epoch": 1.7580317202114681, + "grad_norm": 1.9880443501513356, + "learning_rate": 1.9425092337718964e-05, + "loss": 0.0324, + "step": 4323 + }, + { + "epoch": 1.758438389589264, + "grad_norm": 7.950676729991977, + "learning_rate": 1.942475370200108e-05, + "loss": 0.2259, + "step": 4324 + }, + { + "epoch": 1.7588450589670597, + "grad_norm": 
7.865310589211737, + "learning_rate": 1.942441496953336e-05, + "loss": 0.3019, + "step": 4325 + }, + { + "epoch": 1.7592517283448557, + "grad_norm": 1.506122669328751, + "learning_rate": 1.942407614031928e-05, + "loss": 0.0192, + "step": 4326 + }, + { + "epoch": 1.7596583977226516, + "grad_norm": 17.982365847401176, + "learning_rate": 1.9423737214362307e-05, + "loss": 0.646, + "step": 4327 + }, + { + "epoch": 1.7600650671004474, + "grad_norm": 11.132142683435372, + "learning_rate": 1.9423398191665933e-05, + "loss": 0.5041, + "step": 4328 + }, + { + "epoch": 1.7604717364782432, + "grad_norm": 9.492034478779084, + "learning_rate": 1.9423059072233633e-05, + "loss": 0.8697, + "step": 4329 + }, + { + "epoch": 1.760878405856039, + "grad_norm": 33.71866848129692, + "learning_rate": 1.9422719856068893e-05, + "loss": 1.0381, + "step": 4330 + }, + { + "epoch": 1.761285075233835, + "grad_norm": 14.186908983914861, + "learning_rate": 1.942238054317519e-05, + "loss": 0.4294, + "step": 4331 + }, + { + "epoch": 1.7616917446116307, + "grad_norm": 4.015215454575734, + "learning_rate": 1.9422041133556005e-05, + "loss": 0.0607, + "step": 4332 + }, + { + "epoch": 1.7620984139894267, + "grad_norm": 17.538594579083465, + "learning_rate": 1.9421701627214826e-05, + "loss": 0.4208, + "step": 4333 + }, + { + "epoch": 1.7625050833672224, + "grad_norm": 3.2626917063902376, + "learning_rate": 1.942136202415514e-05, + "loss": 0.0463, + "step": 4334 + }, + { + "epoch": 1.7629117527450182, + "grad_norm": 9.422655510461555, + "learning_rate": 1.9421022324380428e-05, + "loss": 0.3707, + "step": 4335 + }, + { + "epoch": 1.7633184221228142, + "grad_norm": 3.9233453860952925, + "learning_rate": 1.9420682527894185e-05, + "loss": 0.0567, + "step": 4336 + }, + { + "epoch": 1.76372509150061, + "grad_norm": 21.93089536439661, + "learning_rate": 1.9420342634699893e-05, + "loss": 0.7267, + "step": 4337 + }, + { + "epoch": 1.764131760878406, + "grad_norm": 18.129429022434017, + "learning_rate": 1.9420002644801045e-05, + "loss": 0.7038, + "step": 4338 + }, + { + "epoch": 1.7645384302562017, + "grad_norm": 13.531957922503857, + "learning_rate": 1.9419662558201122e-05, + "loss": 0.4078, + "step": 4339 + }, + { + "epoch": 1.7649450996339975, + "grad_norm": 6.82118949240619, + "learning_rate": 1.9419322374903626e-05, + "loss": 0.2248, + "step": 4340 + }, + { + "epoch": 1.7653517690117932, + "grad_norm": 21.864715733091014, + "learning_rate": 1.9418982094912047e-05, + "loss": 0.2971, + "step": 4341 + }, + { + "epoch": 1.7657584383895892, + "grad_norm": 6.53336616180466, + "learning_rate": 1.9418641718229874e-05, + "loss": 0.2058, + "step": 4342 + }, + { + "epoch": 1.7661651077673852, + "grad_norm": 6.829831609955648, + "learning_rate": 1.9418301244860602e-05, + "loss": 0.246, + "step": 4343 + }, + { + "epoch": 1.766571777145181, + "grad_norm": 33.531127505148675, + "learning_rate": 1.9417960674807734e-05, + "loss": 0.4168, + "step": 4344 + }, + { + "epoch": 1.7669784465229768, + "grad_norm": 14.461644481779203, + "learning_rate": 1.9417620008074755e-05, + "loss": 0.9367, + "step": 4345 + }, + { + "epoch": 1.7673851159007725, + "grad_norm": 16.76058715152833, + "learning_rate": 1.9417279244665166e-05, + "loss": 0.7047, + "step": 4346 + }, + { + "epoch": 1.7677917852785685, + "grad_norm": 10.723827282829086, + "learning_rate": 1.9416938384582463e-05, + "loss": 0.0591, + "step": 4347 + }, + { + "epoch": 1.7681984546563645, + "grad_norm": 8.976263056210234, + "learning_rate": 1.9416597427830156e-05, + "loss": 0.0453, + "step": 4348 + }, + { + 
"epoch": 1.7686051240341603, + "grad_norm": 2.910354502736721, + "learning_rate": 1.941625637441173e-05, + "loss": 0.0548, + "step": 4349 + }, + { + "epoch": 1.769011793411956, + "grad_norm": 4.749486120033341, + "learning_rate": 1.94159152243307e-05, + "loss": 0.113, + "step": 4350 + }, + { + "epoch": 1.7694184627897518, + "grad_norm": 12.458872472724284, + "learning_rate": 1.9415573977590557e-05, + "loss": 0.2356, + "step": 4351 + }, + { + "epoch": 1.7698251321675478, + "grad_norm": 11.828236775975594, + "learning_rate": 1.941523263419481e-05, + "loss": 0.6708, + "step": 4352 + }, + { + "epoch": 1.7702318015453438, + "grad_norm": 11.793283445295096, + "learning_rate": 1.941489119414696e-05, + "loss": 0.3323, + "step": 4353 + }, + { + "epoch": 1.7706384709231395, + "grad_norm": 20.80181990595806, + "learning_rate": 1.9414549657450513e-05, + "loss": 0.7527, + "step": 4354 + }, + { + "epoch": 1.7710451403009353, + "grad_norm": 2.269022370614964, + "learning_rate": 1.9414208024108977e-05, + "loss": 0.0374, + "step": 4355 + }, + { + "epoch": 1.771451809678731, + "grad_norm": 12.259102570090954, + "learning_rate": 1.9413866294125856e-05, + "loss": 0.433, + "step": 4356 + }, + { + "epoch": 1.771858479056527, + "grad_norm": 6.671559227708834, + "learning_rate": 1.9413524467504662e-05, + "loss": 0.1659, + "step": 4357 + }, + { + "epoch": 1.772265148434323, + "grad_norm": 3.9220609318070276, + "learning_rate": 1.94131825442489e-05, + "loss": 0.0483, + "step": 4358 + }, + { + "epoch": 1.7726718178121188, + "grad_norm": 22.536219129592546, + "learning_rate": 1.941284052436208e-05, + "loss": 0.583, + "step": 4359 + }, + { + "epoch": 1.7730784871899146, + "grad_norm": 5.894313388429807, + "learning_rate": 1.9412498407847718e-05, + "loss": 0.3697, + "step": 4360 + }, + { + "epoch": 1.7734851565677103, + "grad_norm": 27.12207207748248, + "learning_rate": 1.941215619470932e-05, + "loss": 0.9803, + "step": 4361 + }, + { + "epoch": 1.7738918259455063, + "grad_norm": 12.5028270005706, + "learning_rate": 1.9411813884950402e-05, + "loss": 0.4501, + "step": 4362 + }, + { + "epoch": 1.7742984953233023, + "grad_norm": 10.142564223746461, + "learning_rate": 1.941147147857448e-05, + "loss": 0.2207, + "step": 4363 + }, + { + "epoch": 1.774705164701098, + "grad_norm": 4.411035951051719, + "learning_rate": 1.9411128975585065e-05, + "loss": 0.1363, + "step": 4364 + }, + { + "epoch": 1.7751118340788938, + "grad_norm": 54.907646352504024, + "learning_rate": 1.9410786375985676e-05, + "loss": 0.5655, + "step": 4365 + }, + { + "epoch": 1.7755185034566896, + "grad_norm": 4.649145204577221, + "learning_rate": 1.9410443679779828e-05, + "loss": 0.1175, + "step": 4366 + }, + { + "epoch": 1.7759251728344856, + "grad_norm": 5.673299543992895, + "learning_rate": 1.9410100886971034e-05, + "loss": 0.1831, + "step": 4367 + }, + { + "epoch": 1.7763318422122816, + "grad_norm": 12.543596396636675, + "learning_rate": 1.9409757997562826e-05, + "loss": 0.2553, + "step": 4368 + }, + { + "epoch": 1.7767385115900773, + "grad_norm": 12.768542696653611, + "learning_rate": 1.9409415011558712e-05, + "loss": 0.2096, + "step": 4369 + }, + { + "epoch": 1.777145180967873, + "grad_norm": 9.289209082756704, + "learning_rate": 1.940907192896222e-05, + "loss": 0.4844, + "step": 4370 + }, + { + "epoch": 1.7775518503456689, + "grad_norm": 5.341237951376997, + "learning_rate": 1.9408728749776867e-05, + "loss": 0.25, + "step": 4371 + }, + { + "epoch": 1.7779585197234649, + "grad_norm": 24.93928518344424, + "learning_rate": 1.9408385474006177e-05, + "loss": 
0.391, + "step": 4372 + }, + { + "epoch": 1.7783651891012606, + "grad_norm": 9.304043239866596, + "learning_rate": 1.9408042101653677e-05, + "loss": 0.2669, + "step": 4373 + }, + { + "epoch": 1.7787718584790566, + "grad_norm": 13.993175091979719, + "learning_rate": 1.940769863272289e-05, + "loss": 0.3591, + "step": 4374 + }, + { + "epoch": 1.7791785278568524, + "grad_norm": 23.963871133866878, + "learning_rate": 1.9407355067217343e-05, + "loss": 0.8395, + "step": 4375 + }, + { + "epoch": 1.7795851972346481, + "grad_norm": 11.01256280543469, + "learning_rate": 1.940701140514056e-05, + "loss": 0.2291, + "step": 4376 + }, + { + "epoch": 1.7799918666124441, + "grad_norm": 0.8636431275520609, + "learning_rate": 1.9406667646496065e-05, + "loss": 0.0214, + "step": 4377 + }, + { + "epoch": 1.78039853599024, + "grad_norm": 10.91052023363789, + "learning_rate": 1.94063237912874e-05, + "loss": 0.4577, + "step": 4378 + }, + { + "epoch": 1.7808052053680359, + "grad_norm": 25.370416144704205, + "learning_rate": 1.9405979839518084e-05, + "loss": 0.8799, + "step": 4379 + }, + { + "epoch": 1.7812118747458316, + "grad_norm": 0.9065430113037858, + "learning_rate": 1.940563579119165e-05, + "loss": 0.0103, + "step": 4380 + }, + { + "epoch": 1.7816185441236274, + "grad_norm": 13.7706656526584, + "learning_rate": 1.940529164631164e-05, + "loss": 0.2735, + "step": 4381 + }, + { + "epoch": 1.7820252135014232, + "grad_norm": 5.685043866412771, + "learning_rate": 1.9404947404881566e-05, + "loss": 0.2773, + "step": 4382 + }, + { + "epoch": 1.7824318828792192, + "grad_norm": 1.1678572397190363, + "learning_rate": 1.940460306690498e-05, + "loss": 0.0192, + "step": 4383 + }, + { + "epoch": 1.7828385522570152, + "grad_norm": 4.5956811173824414, + "learning_rate": 1.9404258632385408e-05, + "loss": 0.0592, + "step": 4384 + }, + { + "epoch": 1.783245221634811, + "grad_norm": 12.259065585123748, + "learning_rate": 1.9403914101326388e-05, + "loss": 0.2719, + "step": 4385 + }, + { + "epoch": 1.7836518910126067, + "grad_norm": 6.598032144444937, + "learning_rate": 1.9403569473731453e-05, + "loss": 0.1313, + "step": 4386 + }, + { + "epoch": 1.7840585603904024, + "grad_norm": 10.949674989617288, + "learning_rate": 1.940322474960415e-05, + "loss": 0.2272, + "step": 4387 + }, + { + "epoch": 1.7844652297681984, + "grad_norm": 7.588771587889393, + "learning_rate": 1.9402879928948012e-05, + "loss": 0.174, + "step": 4388 + }, + { + "epoch": 1.7848718991459944, + "grad_norm": 8.721522297569368, + "learning_rate": 1.940253501176658e-05, + "loss": 0.3485, + "step": 4389 + }, + { + "epoch": 1.7852785685237902, + "grad_norm": 8.974742292555172, + "learning_rate": 1.940218999806339e-05, + "loss": 0.4654, + "step": 4390 + }, + { + "epoch": 1.785685237901586, + "grad_norm": 2.448835939090383, + "learning_rate": 1.940184488784199e-05, + "loss": 0.0465, + "step": 4391 + }, + { + "epoch": 1.7860919072793817, + "grad_norm": 5.608006429454363, + "learning_rate": 1.940149968110592e-05, + "loss": 0.2167, + "step": 4392 + }, + { + "epoch": 1.7864985766571777, + "grad_norm": 11.022976332163283, + "learning_rate": 1.9401154377858725e-05, + "loss": 0.3824, + "step": 4393 + }, + { + "epoch": 1.7869052460349737, + "grad_norm": 5.480697888193172, + "learning_rate": 1.9400808978103948e-05, + "loss": 0.0958, + "step": 4394 + }, + { + "epoch": 1.7873119154127695, + "grad_norm": 32.115975024628476, + "learning_rate": 1.9400463481845135e-05, + "loss": 1.0723, + "step": 4395 + }, + { + "epoch": 1.7877185847905652, + "grad_norm": 13.644594444018807, + 
"learning_rate": 1.9400117889085833e-05, + "loss": 0.5004, + "step": 4396 + }, + { + "epoch": 1.788125254168361, + "grad_norm": 7.285517781263812, + "learning_rate": 1.939977219982959e-05, + "loss": 0.1569, + "step": 4397 + }, + { + "epoch": 1.788531923546157, + "grad_norm": 11.155270031639045, + "learning_rate": 1.9399426414079954e-05, + "loss": 0.7272, + "step": 4398 + }, + { + "epoch": 1.788938592923953, + "grad_norm": 16.273221189357642, + "learning_rate": 1.939908053184048e-05, + "loss": 0.9038, + "step": 4399 + }, + { + "epoch": 1.7893452623017487, + "grad_norm": 3.4200346460306865, + "learning_rate": 1.9398734553114707e-05, + "loss": 0.0525, + "step": 4400 + }, + { + "epoch": 1.7897519316795445, + "grad_norm": 9.025578247069182, + "learning_rate": 1.93983884779062e-05, + "loss": 0.486, + "step": 4401 + }, + { + "epoch": 1.7901586010573403, + "grad_norm": 5.717353891247341, + "learning_rate": 1.9398042306218502e-05, + "loss": 0.1814, + "step": 4402 + }, + { + "epoch": 1.7905652704351362, + "grad_norm": 6.925508095730624, + "learning_rate": 1.939769603805517e-05, + "loss": 0.1301, + "step": 4403 + }, + { + "epoch": 1.7909719398129322, + "grad_norm": 11.860442622382111, + "learning_rate": 1.9397349673419757e-05, + "loss": 0.5179, + "step": 4404 + }, + { + "epoch": 1.791378609190728, + "grad_norm": 8.06213324952786, + "learning_rate": 1.939700321231582e-05, + "loss": 0.2045, + "step": 4405 + }, + { + "epoch": 1.7917852785685238, + "grad_norm": 12.80581393416363, + "learning_rate": 1.9396656654746917e-05, + "loss": 0.6007, + "step": 4406 + }, + { + "epoch": 1.7921919479463195, + "grad_norm": 6.2866364929603185, + "learning_rate": 1.9396310000716603e-05, + "loss": 0.0889, + "step": 4407 + }, + { + "epoch": 1.7925986173241155, + "grad_norm": 9.476987817604162, + "learning_rate": 1.939596325022844e-05, + "loss": 0.2297, + "step": 4408 + }, + { + "epoch": 1.7930052867019115, + "grad_norm": 27.311773273044007, + "learning_rate": 1.9395616403285988e-05, + "loss": 0.3975, + "step": 4409 + }, + { + "epoch": 1.7934119560797073, + "grad_norm": 11.754451150509807, + "learning_rate": 1.9395269459892797e-05, + "loss": 0.4076, + "step": 4410 + }, + { + "epoch": 1.793818625457503, + "grad_norm": 18.032881274385836, + "learning_rate": 1.939492242005244e-05, + "loss": 0.6571, + "step": 4411 + }, + { + "epoch": 1.7942252948352988, + "grad_norm": 8.378217171575427, + "learning_rate": 1.939457528376848e-05, + "loss": 0.394, + "step": 4412 + }, + { + "epoch": 1.7946319642130948, + "grad_norm": 11.723963228468577, + "learning_rate": 1.9394228051044473e-05, + "loss": 0.3478, + "step": 4413 + }, + { + "epoch": 1.7950386335908906, + "grad_norm": 5.83369639412722, + "learning_rate": 1.9393880721883988e-05, + "loss": 0.0938, + "step": 4414 + }, + { + "epoch": 1.7954453029686865, + "grad_norm": 17.942403308310855, + "learning_rate": 1.9393533296290587e-05, + "loss": 0.9005, + "step": 4415 + }, + { + "epoch": 1.7958519723464823, + "grad_norm": 14.145505728814463, + "learning_rate": 1.9393185774267838e-05, + "loss": 0.8184, + "step": 4416 + }, + { + "epoch": 1.796258641724278, + "grad_norm": 11.198653897484617, + "learning_rate": 1.9392838155819316e-05, + "loss": 0.2931, + "step": 4417 + }, + { + "epoch": 1.796665311102074, + "grad_norm": 8.393832488841438, + "learning_rate": 1.9392490440948578e-05, + "loss": 0.2033, + "step": 4418 + }, + { + "epoch": 1.7970719804798698, + "grad_norm": 15.29852698533902, + "learning_rate": 1.9392142629659202e-05, + "loss": 0.388, + "step": 4419 + }, + { + "epoch": 1.7974786498576658, 
+ "grad_norm": 2.8528181367317775, + "learning_rate": 1.939179472195475e-05, + "loss": 0.034, + "step": 4420 + }, + { + "epoch": 1.7978853192354616, + "grad_norm": 7.679456529498386, + "learning_rate": 1.93914467178388e-05, + "loss": 0.2752, + "step": 4421 + }, + { + "epoch": 1.7982919886132573, + "grad_norm": 7.818608412193892, + "learning_rate": 1.9391098617314924e-05, + "loss": 0.2571, + "step": 4422 + }, + { + "epoch": 1.7986986579910533, + "grad_norm": 9.145277703006698, + "learning_rate": 1.9390750420386694e-05, + "loss": 0.1992, + "step": 4423 + }, + { + "epoch": 1.799105327368849, + "grad_norm": 12.745008482705858, + "learning_rate": 1.9390402127057685e-05, + "loss": 0.4976, + "step": 4424 + }, + { + "epoch": 1.799511996746645, + "grad_norm": 3.2497288332601952, + "learning_rate": 1.939005373733147e-05, + "loss": 0.028, + "step": 4425 + }, + { + "epoch": 1.7999186661244408, + "grad_norm": 6.990649907197271, + "learning_rate": 1.938970525121163e-05, + "loss": 0.2162, + "step": 4426 + }, + { + "epoch": 1.8003253355022366, + "grad_norm": 3.0330229131204907, + "learning_rate": 1.938935666870174e-05, + "loss": 0.1015, + "step": 4427 + }, + { + "epoch": 1.8007320048800324, + "grad_norm": 24.84305496923035, + "learning_rate": 1.9389007989805378e-05, + "loss": 0.4527, + "step": 4428 + }, + { + "epoch": 1.8011386742578284, + "grad_norm": 9.87163290214573, + "learning_rate": 1.9388659214526124e-05, + "loss": 0.4047, + "step": 4429 + }, + { + "epoch": 1.8015453436356244, + "grad_norm": 18.713101004311884, + "learning_rate": 1.938831034286756e-05, + "loss": 0.9702, + "step": 4430 + }, + { + "epoch": 1.8019520130134201, + "grad_norm": 9.308752346228534, + "learning_rate": 1.9387961374833256e-05, + "loss": 0.3504, + "step": 4431 + }, + { + "epoch": 1.8023586823912159, + "grad_norm": 59.81714062441406, + "learning_rate": 1.938761231042681e-05, + "loss": 1.1036, + "step": 4432 + }, + { + "epoch": 1.8027653517690116, + "grad_norm": 9.06610805389553, + "learning_rate": 1.93872631496518e-05, + "loss": 0.2231, + "step": 4433 + }, + { + "epoch": 1.8031720211468076, + "grad_norm": 7.334638983583534, + "learning_rate": 1.938691389251181e-05, + "loss": 0.3091, + "step": 4434 + }, + { + "epoch": 1.8035786905246036, + "grad_norm": 11.503667766450079, + "learning_rate": 1.9386564539010422e-05, + "loss": 0.5033, + "step": 4435 + }, + { + "epoch": 1.8039853599023994, + "grad_norm": 6.918426520840113, + "learning_rate": 1.9386215089151226e-05, + "loss": 0.1133, + "step": 4436 + }, + { + "epoch": 1.8043920292801952, + "grad_norm": 5.5229962508565125, + "learning_rate": 1.9385865542937807e-05, + "loss": 0.2133, + "step": 4437 + }, + { + "epoch": 1.804798698657991, + "grad_norm": 12.60594173464841, + "learning_rate": 1.9385515900373753e-05, + "loss": 0.54, + "step": 4438 + }, + { + "epoch": 1.805205368035787, + "grad_norm": 5.207642936312509, + "learning_rate": 1.938516616146266e-05, + "loss": 0.2256, + "step": 4439 + }, + { + "epoch": 1.805612037413583, + "grad_norm": 12.93335299314965, + "learning_rate": 1.938481632620811e-05, + "loss": 0.7357, + "step": 4440 + }, + { + "epoch": 1.8060187067913787, + "grad_norm": 46.18605544601976, + "learning_rate": 1.9384466394613694e-05, + "loss": 1.4442, + "step": 4441 + }, + { + "epoch": 1.8064253761691744, + "grad_norm": 1.9759918845439064, + "learning_rate": 1.9384116366683013e-05, + "loss": 0.0324, + "step": 4442 + }, + { + "epoch": 1.8068320455469702, + "grad_norm": 6.52761246969105, + "learning_rate": 1.938376624241965e-05, + "loss": 0.1586, + "step": 4443 + }, + { + 
"epoch": 1.8072387149247662, + "grad_norm": 3.4795634199756957, + "learning_rate": 1.9383416021827207e-05, + "loss": 0.0784, + "step": 4444 + }, + { + "epoch": 1.8076453843025622, + "grad_norm": 11.884550141495485, + "learning_rate": 1.9383065704909274e-05, + "loss": 0.5802, + "step": 4445 + }, + { + "epoch": 1.808052053680358, + "grad_norm": 0.9437566020743766, + "learning_rate": 1.938271529166945e-05, + "loss": 0.0169, + "step": 4446 + }, + { + "epoch": 1.8084587230581537, + "grad_norm": 6.283602745091003, + "learning_rate": 1.938236478211133e-05, + "loss": 0.1972, + "step": 4447 + }, + { + "epoch": 1.8088653924359495, + "grad_norm": 9.628017612117322, + "learning_rate": 1.9382014176238513e-05, + "loss": 0.3449, + "step": 4448 + }, + { + "epoch": 1.8092720618137454, + "grad_norm": 12.941870197689925, + "learning_rate": 1.93816634740546e-05, + "loss": 0.5274, + "step": 4449 + }, + { + "epoch": 1.8096787311915414, + "grad_norm": 15.456755659284664, + "learning_rate": 1.938131267556319e-05, + "loss": 0.931, + "step": 4450 + }, + { + "epoch": 1.8100854005693372, + "grad_norm": 11.355561163642374, + "learning_rate": 1.9380961780767886e-05, + "loss": 0.5465, + "step": 4451 + }, + { + "epoch": 1.810492069947133, + "grad_norm": 7.133726677696882, + "learning_rate": 1.9380610789672282e-05, + "loss": 0.0945, + "step": 4452 + }, + { + "epoch": 1.8108987393249287, + "grad_norm": 6.661200112890813, + "learning_rate": 1.938025970227999e-05, + "loss": 0.1447, + "step": 4453 + }, + { + "epoch": 1.8113054087027247, + "grad_norm": 13.260870807264698, + "learning_rate": 1.9379908518594613e-05, + "loss": 0.5338, + "step": 4454 + }, + { + "epoch": 1.8117120780805205, + "grad_norm": 11.914247404289606, + "learning_rate": 1.937955723861975e-05, + "loss": 0.3799, + "step": 4455 + }, + { + "epoch": 1.8121187474583165, + "grad_norm": 12.468047471200995, + "learning_rate": 1.937920586235902e-05, + "loss": 0.5731, + "step": 4456 + }, + { + "epoch": 1.8125254168361122, + "grad_norm": 5.642553354583936, + "learning_rate": 1.9378854389816013e-05, + "loss": 0.1259, + "step": 4457 + }, + { + "epoch": 1.812932086213908, + "grad_norm": 9.458975464747548, + "learning_rate": 1.9378502820994348e-05, + "loss": 0.3185, + "step": 4458 + }, + { + "epoch": 1.813338755591704, + "grad_norm": 6.37238486975142, + "learning_rate": 1.937815115589763e-05, + "loss": 0.2775, + "step": 4459 + }, + { + "epoch": 1.8137454249694998, + "grad_norm": 2.2662791636511317, + "learning_rate": 1.937779939452947e-05, + "loss": 0.0354, + "step": 4460 + }, + { + "epoch": 1.8141520943472957, + "grad_norm": 10.74229708099827, + "learning_rate": 1.9377447536893484e-05, + "loss": 0.405, + "step": 4461 + }, + { + "epoch": 1.8145587637250915, + "grad_norm": 9.441433649256151, + "learning_rate": 1.9377095582993276e-05, + "loss": 0.2124, + "step": 4462 + }, + { + "epoch": 1.8149654331028873, + "grad_norm": 8.14745360257469, + "learning_rate": 1.9376743532832463e-05, + "loss": 0.3188, + "step": 4463 + }, + { + "epoch": 1.8153721024806833, + "grad_norm": 5.303848971390906, + "learning_rate": 1.937639138641466e-05, + "loss": 0.1303, + "step": 4464 + }, + { + "epoch": 1.815778771858479, + "grad_norm": 4.858653153734374, + "learning_rate": 1.937603914374348e-05, + "loss": 0.1671, + "step": 4465 + }, + { + "epoch": 1.816185441236275, + "grad_norm": 10.253848194502922, + "learning_rate": 1.9375686804822537e-05, + "loss": 0.292, + "step": 4466 + }, + { + "epoch": 1.8165921106140708, + "grad_norm": 10.637410829630458, + "learning_rate": 1.9375334369655455e-05, + "loss": 
0.5189, + "step": 4467 + }, + { + "epoch": 1.8169987799918665, + "grad_norm": 23.766341379191918, + "learning_rate": 1.9374981838245842e-05, + "loss": 0.8871, + "step": 4468 + }, + { + "epoch": 1.8174054493696623, + "grad_norm": 5.065415308585856, + "learning_rate": 1.9374629210597326e-05, + "loss": 0.2099, + "step": 4469 + }, + { + "epoch": 1.8178121187474583, + "grad_norm": 11.183195991950885, + "learning_rate": 1.937427648671352e-05, + "loss": 0.4752, + "step": 4470 + }, + { + "epoch": 1.8182187881252543, + "grad_norm": 5.797382593865727, + "learning_rate": 1.937392366659805e-05, + "loss": 0.0769, + "step": 4471 + }, + { + "epoch": 1.81862545750305, + "grad_norm": 11.353028662060801, + "learning_rate": 1.9373570750254532e-05, + "loss": 0.2253, + "step": 4472 + }, + { + "epoch": 1.8190321268808458, + "grad_norm": 7.41773729367514, + "learning_rate": 1.9373217737686597e-05, + "loss": 0.2032, + "step": 4473 + }, + { + "epoch": 1.8194387962586416, + "grad_norm": 14.566860656959745, + "learning_rate": 1.937286462889786e-05, + "loss": 0.4784, + "step": 4474 + }, + { + "epoch": 1.8198454656364376, + "grad_norm": 13.563901410956465, + "learning_rate": 1.9372511423891955e-05, + "loss": 0.5468, + "step": 4475 + }, + { + "epoch": 1.8202521350142336, + "grad_norm": 18.6245531686205, + "learning_rate": 1.93721581226725e-05, + "loss": 0.3416, + "step": 4476 + }, + { + "epoch": 1.8206588043920293, + "grad_norm": 3.296417629921712, + "learning_rate": 1.937180472524313e-05, + "loss": 0.0512, + "step": 4477 + }, + { + "epoch": 1.821065473769825, + "grad_norm": 2.118056533101361, + "learning_rate": 1.937145123160746e-05, + "loss": 0.0332, + "step": 4478 + }, + { + "epoch": 1.8214721431476208, + "grad_norm": 16.3243094499035, + "learning_rate": 1.937109764176913e-05, + "loss": 0.4441, + "step": 4479 + }, + { + "epoch": 1.8218788125254168, + "grad_norm": 7.458748606939213, + "learning_rate": 1.937074395573177e-05, + "loss": 0.34, + "step": 4480 + }, + { + "epoch": 1.8222854819032128, + "grad_norm": 13.998021390552273, + "learning_rate": 1.9370390173499e-05, + "loss": 0.5043, + "step": 4481 + }, + { + "epoch": 1.8226921512810086, + "grad_norm": 8.230836132242455, + "learning_rate": 1.9370036295074465e-05, + "loss": 0.2014, + "step": 4482 + }, + { + "epoch": 1.8230988206588044, + "grad_norm": 7.872290201160129, + "learning_rate": 1.9369682320461793e-05, + "loss": 0.1747, + "step": 4483 + }, + { + "epoch": 1.8235054900366001, + "grad_norm": 4.57795666138593, + "learning_rate": 1.936932824966461e-05, + "loss": 0.0289, + "step": 4484 + }, + { + "epoch": 1.823912159414396, + "grad_norm": 0.6629891685169799, + "learning_rate": 1.936897408268656e-05, + "loss": 0.0106, + "step": 4485 + }, + { + "epoch": 1.824318828792192, + "grad_norm": 1.731398548267844, + "learning_rate": 1.9368619819531273e-05, + "loss": 0.0197, + "step": 4486 + }, + { + "epoch": 1.8247254981699879, + "grad_norm": 18.77659781366884, + "learning_rate": 1.9368265460202392e-05, + "loss": 0.1482, + "step": 4487 + }, + { + "epoch": 1.8251321675477836, + "grad_norm": 9.215977053306293, + "learning_rate": 1.9367911004703552e-05, + "loss": 0.3102, + "step": 4488 + }, + { + "epoch": 1.8255388369255794, + "grad_norm": 17.109816005109455, + "learning_rate": 1.9367556453038388e-05, + "loss": 0.9411, + "step": 4489 + }, + { + "epoch": 1.8259455063033754, + "grad_norm": 5.036497880133056, + "learning_rate": 1.9367201805210542e-05, + "loss": 0.1138, + "step": 4490 + }, + { + "epoch": 1.8263521756811714, + "grad_norm": 12.05653898232442, + "learning_rate": 
1.9366847061223658e-05, + "loss": 0.3332, + "step": 4491 + }, + { + "epoch": 1.8267588450589671, + "grad_norm": 15.238495638683073, + "learning_rate": 1.936649222108137e-05, + "loss": 0.2967, + "step": 4492 + }, + { + "epoch": 1.827165514436763, + "grad_norm": 1.7544528643905288, + "learning_rate": 1.936613728478733e-05, + "loss": 0.03, + "step": 4493 + }, + { + "epoch": 1.8275721838145587, + "grad_norm": 8.204940515038233, + "learning_rate": 1.9365782252345173e-05, + "loss": 0.2854, + "step": 4494 + }, + { + "epoch": 1.8279788531923546, + "grad_norm": 6.151774940573761, + "learning_rate": 1.936542712375855e-05, + "loss": 0.0579, + "step": 4495 + }, + { + "epoch": 1.8283855225701506, + "grad_norm": 0.7572396637458668, + "learning_rate": 1.9365071899031103e-05, + "loss": 0.0089, + "step": 4496 + }, + { + "epoch": 1.8287921919479464, + "grad_norm": 10.229377131031272, + "learning_rate": 1.9364716578166484e-05, + "loss": 0.3911, + "step": 4497 + }, + { + "epoch": 1.8291988613257422, + "grad_norm": 13.817109751559451, + "learning_rate": 1.936436116116833e-05, + "loss": 0.6489, + "step": 4498 + }, + { + "epoch": 1.829605530703538, + "grad_norm": 0.9638237465129083, + "learning_rate": 1.93640056480403e-05, + "loss": 0.0128, + "step": 4499 + }, + { + "epoch": 1.830012200081334, + "grad_norm": 4.626896180525688, + "learning_rate": 1.9363650038786037e-05, + "loss": 0.0379, + "step": 4500 + }, + { + "epoch": 1.8304188694591297, + "grad_norm": 17.563595021548064, + "learning_rate": 1.9363294333409192e-05, + "loss": 0.7333, + "step": 4501 + }, + { + "epoch": 1.8308255388369257, + "grad_norm": 14.724194478449984, + "learning_rate": 1.9362938531913422e-05, + "loss": 0.4505, + "step": 4502 + }, + { + "epoch": 1.8312322082147214, + "grad_norm": 4.52573500133451, + "learning_rate": 1.9362582634302375e-05, + "loss": 0.1029, + "step": 4503 + }, + { + "epoch": 1.8316388775925172, + "grad_norm": 5.268205607625248, + "learning_rate": 1.93622266405797e-05, + "loss": 0.0996, + "step": 4504 + }, + { + "epoch": 1.8320455469703132, + "grad_norm": 8.458715714061748, + "learning_rate": 1.9361870550749065e-05, + "loss": 0.6877, + "step": 4505 + }, + { + "epoch": 1.832452216348109, + "grad_norm": 2.6630161984639713, + "learning_rate": 1.936151436481411e-05, + "loss": 0.0281, + "step": 4506 + }, + { + "epoch": 1.832858885725905, + "grad_norm": 1.966987594046204, + "learning_rate": 1.9361158082778502e-05, + "loss": 0.0222, + "step": 4507 + }, + { + "epoch": 1.8332655551037007, + "grad_norm": 14.794165848655673, + "learning_rate": 1.9360801704645894e-05, + "loss": 0.9672, + "step": 4508 + }, + { + "epoch": 1.8336722244814965, + "grad_norm": 26.599921062426752, + "learning_rate": 1.9360445230419946e-05, + "loss": 1.109, + "step": 4509 + }, + { + "epoch": 1.8340788938592922, + "grad_norm": 1.230721147158979, + "learning_rate": 1.9360088660104317e-05, + "loss": 0.016, + "step": 4510 + }, + { + "epoch": 1.8344855632370882, + "grad_norm": 11.047671380652108, + "learning_rate": 1.9359731993702667e-05, + "loss": 0.3343, + "step": 4511 + }, + { + "epoch": 1.8348922326148842, + "grad_norm": 12.091035036506666, + "learning_rate": 1.9359375231218654e-05, + "loss": 0.438, + "step": 4512 + }, + { + "epoch": 1.83529890199268, + "grad_norm": 8.601226779212485, + "learning_rate": 1.9359018372655948e-05, + "loss": 0.2098, + "step": 4513 + }, + { + "epoch": 1.8357055713704757, + "grad_norm": 11.127427030109892, + "learning_rate": 1.9358661418018205e-05, + "loss": 0.3118, + "step": 4514 + }, + { + "epoch": 1.8361122407482715, + "grad_norm": 
6.843100297720435, + "learning_rate": 1.9358304367309097e-05, + "loss": 0.1186, + "step": 4515 + }, + { + "epoch": 1.8365189101260675, + "grad_norm": 4.909567594729235, + "learning_rate": 1.935794722053228e-05, + "loss": 0.1235, + "step": 4516 + }, + { + "epoch": 1.8369255795038635, + "grad_norm": 12.615726523877314, + "learning_rate": 1.9357589977691427e-05, + "loss": 0.4197, + "step": 4517 + }, + { + "epoch": 1.8373322488816592, + "grad_norm": 11.393522578539391, + "learning_rate": 1.9357232638790205e-05, + "loss": 0.2919, + "step": 4518 + }, + { + "epoch": 1.837738918259455, + "grad_norm": 63.263611211968765, + "learning_rate": 1.9356875203832278e-05, + "loss": 3.9461, + "step": 4519 + }, + { + "epoch": 1.8381455876372508, + "grad_norm": 11.142390504422579, + "learning_rate": 1.935651767282132e-05, + "loss": 0.2774, + "step": 4520 + }, + { + "epoch": 1.8385522570150468, + "grad_norm": 2.344216236869428, + "learning_rate": 1.9356160045760996e-05, + "loss": 0.0472, + "step": 4521 + }, + { + "epoch": 1.8389589263928428, + "grad_norm": 7.6780636065919925, + "learning_rate": 1.935580232265498e-05, + "loss": 0.1026, + "step": 4522 + }, + { + "epoch": 1.8393655957706385, + "grad_norm": 16.217545834316002, + "learning_rate": 1.935544450350695e-05, + "loss": 0.8064, + "step": 4523 + }, + { + "epoch": 1.8397722651484343, + "grad_norm": 8.503049225907237, + "learning_rate": 1.9355086588320566e-05, + "loss": 0.269, + "step": 4524 + }, + { + "epoch": 1.84017893452623, + "grad_norm": 7.650593169918792, + "learning_rate": 1.9354728577099515e-05, + "loss": 0.1377, + "step": 4525 + }, + { + "epoch": 1.840585603904026, + "grad_norm": 6.449161510333374, + "learning_rate": 1.9354370469847465e-05, + "loss": 0.1032, + "step": 4526 + }, + { + "epoch": 1.840992273281822, + "grad_norm": 8.103290687309109, + "learning_rate": 1.9354012266568096e-05, + "loss": 0.3307, + "step": 4527 + }, + { + "epoch": 1.8413989426596178, + "grad_norm": 21.86049114538043, + "learning_rate": 1.9353653967265082e-05, + "loss": 0.4135, + "step": 4528 + }, + { + "epoch": 1.8418056120374136, + "grad_norm": 43.655972219797185, + "learning_rate": 1.9353295571942104e-05, + "loss": 0.2944, + "step": 4529 + }, + { + "epoch": 1.8422122814152093, + "grad_norm": 1.995398243176032, + "learning_rate": 1.9352937080602836e-05, + "loss": 0.0356, + "step": 4530 + }, + { + "epoch": 1.8426189507930053, + "grad_norm": 6.896779678286548, + "learning_rate": 1.9352578493250966e-05, + "loss": 0.4261, + "step": 4531 + }, + { + "epoch": 1.8430256201708013, + "grad_norm": 11.493620046804155, + "learning_rate": 1.9352219809890166e-05, + "loss": 0.3241, + "step": 4532 + }, + { + "epoch": 1.843432289548597, + "grad_norm": 3.0236310607736168, + "learning_rate": 1.9351861030524127e-05, + "loss": 0.0319, + "step": 4533 + }, + { + "epoch": 1.8438389589263928, + "grad_norm": 11.470797036244614, + "learning_rate": 1.935150215515653e-05, + "loss": 0.3297, + "step": 4534 + }, + { + "epoch": 1.8442456283041886, + "grad_norm": 15.613411413445428, + "learning_rate": 1.935114318379105e-05, + "loss": 0.3467, + "step": 4535 + }, + { + "epoch": 1.8446522976819846, + "grad_norm": 15.139284057060964, + "learning_rate": 1.935078411643138e-05, + "loss": 0.8509, + "step": 4536 + }, + { + "epoch": 1.8450589670597806, + "grad_norm": 18.98766847776231, + "learning_rate": 1.9350424953081207e-05, + "loss": 1.4951, + "step": 4537 + }, + { + "epoch": 1.8454656364375763, + "grad_norm": 9.120277236163584, + "learning_rate": 1.9350065693744216e-05, + "loss": 0.3287, + "step": 4538 + }, + { + 
"epoch": 1.845872305815372, + "grad_norm": 16.44125515729486, + "learning_rate": 1.9349706338424098e-05, + "loss": 0.6888, + "step": 4539 + }, + { + "epoch": 1.8462789751931679, + "grad_norm": 32.32760296488141, + "learning_rate": 1.9349346887124535e-05, + "loss": 1.4218, + "step": 4540 + }, + { + "epoch": 1.8466856445709638, + "grad_norm": 4.853254682606415, + "learning_rate": 1.934898733984922e-05, + "loss": 0.0881, + "step": 4541 + }, + { + "epoch": 1.8470923139487596, + "grad_norm": 23.607668614275678, + "learning_rate": 1.9348627696601844e-05, + "loss": 0.6215, + "step": 4542 + }, + { + "epoch": 1.8474989833265556, + "grad_norm": 56.8437107863561, + "learning_rate": 1.9348267957386103e-05, + "loss": 0.215, + "step": 4543 + }, + { + "epoch": 1.8479056527043514, + "grad_norm": 55.991783822990186, + "learning_rate": 1.9347908122205685e-05, + "loss": 0.9802, + "step": 4544 + }, + { + "epoch": 1.8483123220821471, + "grad_norm": 13.942733582286277, + "learning_rate": 1.9347548191064285e-05, + "loss": 0.2699, + "step": 4545 + }, + { + "epoch": 1.8487189914599431, + "grad_norm": 12.069420076413861, + "learning_rate": 1.93471881639656e-05, + "loss": 0.2548, + "step": 4546 + }, + { + "epoch": 1.8491256608377389, + "grad_norm": 18.644222529101395, + "learning_rate": 1.9346828040913324e-05, + "loss": 0.6451, + "step": 4547 + }, + { + "epoch": 1.8495323302155349, + "grad_norm": 7.21377179913646, + "learning_rate": 1.934646782191115e-05, + "loss": 0.2466, + "step": 4548 + }, + { + "epoch": 1.8499389995933306, + "grad_norm": 7.076020762434138, + "learning_rate": 1.9346107506962785e-05, + "loss": 0.2607, + "step": 4549 + }, + { + "epoch": 1.8503456689711264, + "grad_norm": 2.0038510598267103, + "learning_rate": 1.934574709607192e-05, + "loss": 0.0284, + "step": 4550 + }, + { + "epoch": 1.8507523383489222, + "grad_norm": 14.904862064347482, + "learning_rate": 1.934538658924226e-05, + "loss": 0.8691, + "step": 4551 + }, + { + "epoch": 1.8511590077267182, + "grad_norm": 2.360646888565798, + "learning_rate": 1.9345025986477498e-05, + "loss": 0.0391, + "step": 4552 + }, + { + "epoch": 1.8515656771045141, + "grad_norm": 6.749968181591062, + "learning_rate": 1.9344665287781347e-05, + "loss": 0.2552, + "step": 4553 + }, + { + "epoch": 1.85197234648231, + "grad_norm": 5.778664070698525, + "learning_rate": 1.93443044931575e-05, + "loss": 0.0923, + "step": 4554 + }, + { + "epoch": 1.8523790158601057, + "grad_norm": 4.97561617258074, + "learning_rate": 1.934394360260966e-05, + "loss": 0.0848, + "step": 4555 + }, + { + "epoch": 1.8527856852379014, + "grad_norm": 31.186403557466516, + "learning_rate": 1.9343582616141544e-05, + "loss": 1.7727, + "step": 4556 + }, + { + "epoch": 1.8531923546156974, + "grad_norm": 15.460216318119631, + "learning_rate": 1.934322153375685e-05, + "loss": 0.85, + "step": 4557 + }, + { + "epoch": 1.8535990239934934, + "grad_norm": 10.286715471629034, + "learning_rate": 1.9342860355459282e-05, + "loss": 0.3371, + "step": 4558 + }, + { + "epoch": 1.8540056933712892, + "grad_norm": 6.28495776949279, + "learning_rate": 1.934249908125255e-05, + "loss": 0.2664, + "step": 4559 + }, + { + "epoch": 1.854412362749085, + "grad_norm": 46.3872630313463, + "learning_rate": 1.9342137711140362e-05, + "loss": 0.4694, + "step": 4560 + }, + { + "epoch": 1.8548190321268807, + "grad_norm": 7.802846050597186, + "learning_rate": 1.934177624512643e-05, + "loss": 0.352, + "step": 4561 + }, + { + "epoch": 1.8552257015046767, + "grad_norm": 14.201454959303092, + "learning_rate": 1.9341414683214467e-05, + "loss": 
0.6407, + "step": 4562 + }, + { + "epoch": 1.8556323708824727, + "grad_norm": 14.089832766480761, + "learning_rate": 1.9341053025408175e-05, + "loss": 0.4495, + "step": 4563 + }, + { + "epoch": 1.8560390402602684, + "grad_norm": 10.490025733452478, + "learning_rate": 1.9340691271711274e-05, + "loss": 0.3924, + "step": 4564 + }, + { + "epoch": 1.8564457096380642, + "grad_norm": 4.079572056244151, + "learning_rate": 1.9340329422127476e-05, + "loss": 0.0516, + "step": 4565 + }, + { + "epoch": 1.85685237901586, + "grad_norm": 22.236043866082525, + "learning_rate": 1.9339967476660497e-05, + "loss": 1.3653, + "step": 4566 + }, + { + "epoch": 1.857259048393656, + "grad_norm": 51.42300489039746, + "learning_rate": 1.9339605435314047e-05, + "loss": 1.5885, + "step": 4567 + }, + { + "epoch": 1.857665717771452, + "grad_norm": 14.48276991288523, + "learning_rate": 1.9339243298091854e-05, + "loss": 0.4713, + "step": 4568 + }, + { + "epoch": 1.8580723871492477, + "grad_norm": 10.481784123837791, + "learning_rate": 1.9338881064997622e-05, + "loss": 0.6721, + "step": 4569 + }, + { + "epoch": 1.8584790565270435, + "grad_norm": 0.14266049423485788, + "learning_rate": 1.9338518736035077e-05, + "loss": 0.0021, + "step": 4570 + }, + { + "epoch": 1.8588857259048392, + "grad_norm": 15.823254337881757, + "learning_rate": 1.9338156311207942e-05, + "loss": 0.4982, + "step": 4571 + }, + { + "epoch": 1.8592923952826352, + "grad_norm": 17.450475434429862, + "learning_rate": 1.9337793790519926e-05, + "loss": 0.5582, + "step": 4572 + }, + { + "epoch": 1.8596990646604312, + "grad_norm": 5.047797620400322, + "learning_rate": 1.933743117397476e-05, + "loss": 0.1407, + "step": 4573 + }, + { + "epoch": 1.860105734038227, + "grad_norm": 32.972844356045044, + "learning_rate": 1.9337068461576162e-05, + "loss": 1.2147, + "step": 4574 + }, + { + "epoch": 1.8605124034160228, + "grad_norm": 12.053062420939542, + "learning_rate": 1.933670565332786e-05, + "loss": 0.2635, + "step": 4575 + }, + { + "epoch": 1.8609190727938185, + "grad_norm": 4.979142642197285, + "learning_rate": 1.9336342749233573e-05, + "loss": 0.133, + "step": 4576 + }, + { + "epoch": 1.8613257421716145, + "grad_norm": 3.3768414410959995, + "learning_rate": 1.933597974929703e-05, + "loss": 0.0159, + "step": 4577 + }, + { + "epoch": 1.8617324115494105, + "grad_norm": 19.985638255680296, + "learning_rate": 1.9335616653521957e-05, + "loss": 0.6091, + "step": 4578 + }, + { + "epoch": 1.8621390809272063, + "grad_norm": 9.141601085713768, + "learning_rate": 1.9335253461912076e-05, + "loss": 0.3243, + "step": 4579 + }, + { + "epoch": 1.862545750305002, + "grad_norm": 4.190425603128273, + "learning_rate": 1.9334890174471124e-05, + "loss": 0.0997, + "step": 4580 + }, + { + "epoch": 1.8629524196827978, + "grad_norm": 8.086816215222614, + "learning_rate": 1.9334526791202828e-05, + "loss": 0.3672, + "step": 4581 + }, + { + "epoch": 1.8633590890605938, + "grad_norm": 21.028479990270757, + "learning_rate": 1.9334163312110913e-05, + "loss": 1.0936, + "step": 4582 + }, + { + "epoch": 1.8637657584383895, + "grad_norm": 7.442929807069258, + "learning_rate": 1.9333799737199115e-05, + "loss": 0.1974, + "step": 4583 + }, + { + "epoch": 1.8641724278161855, + "grad_norm": 4.0521224287139725, + "learning_rate": 1.9333436066471163e-05, + "loss": 0.0584, + "step": 4584 + }, + { + "epoch": 1.8645790971939813, + "grad_norm": 11.192572641672554, + "learning_rate": 1.9333072299930795e-05, + "loss": 0.4363, + "step": 4585 + }, + { + "epoch": 1.864985766571777, + "grad_norm": 4.466291380609607, + 
"learning_rate": 1.933270843758174e-05, + "loss": 0.098, + "step": 4586 + }, + { + "epoch": 1.865392435949573, + "grad_norm": 9.48013604270037, + "learning_rate": 1.9332344479427738e-05, + "loss": 0.3376, + "step": 4587 + }, + { + "epoch": 1.8657991053273688, + "grad_norm": 17.938727170722217, + "learning_rate": 1.9331980425472523e-05, + "loss": 1.1709, + "step": 4588 + }, + { + "epoch": 1.8662057747051648, + "grad_norm": 6.63806697883151, + "learning_rate": 1.9331616275719833e-05, + "loss": 0.1534, + "step": 4589 + }, + { + "epoch": 1.8666124440829606, + "grad_norm": 2.6318980387471904, + "learning_rate": 1.9331252030173405e-05, + "loss": 0.041, + "step": 4590 + }, + { + "epoch": 1.8670191134607563, + "grad_norm": 11.799819272086056, + "learning_rate": 1.9330887688836977e-05, + "loss": 0.394, + "step": 4591 + }, + { + "epoch": 1.867425782838552, + "grad_norm": 15.110091091231682, + "learning_rate": 1.933052325171429e-05, + "loss": 0.3758, + "step": 4592 + }, + { + "epoch": 1.867832452216348, + "grad_norm": 4.505884494890262, + "learning_rate": 1.9330158718809088e-05, + "loss": 0.1072, + "step": 4593 + }, + { + "epoch": 1.868239121594144, + "grad_norm": 11.627280930265654, + "learning_rate": 1.932979409012511e-05, + "loss": 0.4061, + "step": 4594 + }, + { + "epoch": 1.8686457909719398, + "grad_norm": 15.435682768104657, + "learning_rate": 1.9329429365666103e-05, + "loss": 0.8851, + "step": 4595 + }, + { + "epoch": 1.8690524603497356, + "grad_norm": 3.5779426622974775, + "learning_rate": 1.9329064545435803e-05, + "loss": 0.0224, + "step": 4596 + }, + { + "epoch": 1.8694591297275314, + "grad_norm": 2.5706000057829743, + "learning_rate": 1.9328699629437963e-05, + "loss": 0.0379, + "step": 4597 + }, + { + "epoch": 1.8698657991053274, + "grad_norm": 8.422454470133566, + "learning_rate": 1.9328334617676326e-05, + "loss": 0.2396, + "step": 4598 + }, + { + "epoch": 1.8702724684831233, + "grad_norm": 8.110328667788943, + "learning_rate": 1.932796951015464e-05, + "loss": 0.2007, + "step": 4599 + }, + { + "epoch": 1.870679137860919, + "grad_norm": 15.635184944672412, + "learning_rate": 1.9327604306876654e-05, + "loss": 0.7149, + "step": 4600 + }, + { + "epoch": 1.8710858072387149, + "grad_norm": 16.03238916590567, + "learning_rate": 1.9327239007846114e-05, + "loss": 0.8527, + "step": 4601 + }, + { + "epoch": 1.8714924766165106, + "grad_norm": 6.114928973714992, + "learning_rate": 1.932687361306677e-05, + "loss": 0.0687, + "step": 4602 + }, + { + "epoch": 1.8718991459943066, + "grad_norm": 11.685979561603153, + "learning_rate": 1.932650812254237e-05, + "loss": 0.2455, + "step": 4603 + }, + { + "epoch": 1.8723058153721026, + "grad_norm": 32.77598641171575, + "learning_rate": 1.9326142536276677e-05, + "loss": 0.7423, + "step": 4604 + }, + { + "epoch": 1.8727124847498984, + "grad_norm": 11.485754185918152, + "learning_rate": 1.9325776854273434e-05, + "loss": 0.6484, + "step": 4605 + }, + { + "epoch": 1.8731191541276941, + "grad_norm": 2.3401407684141113, + "learning_rate": 1.93254110765364e-05, + "loss": 0.0357, + "step": 4606 + }, + { + "epoch": 1.87352582350549, + "grad_norm": 7.037926128809962, + "learning_rate": 1.9325045203069327e-05, + "loss": 0.1514, + "step": 4607 + }, + { + "epoch": 1.873932492883286, + "grad_norm": 49.069349274912604, + "learning_rate": 1.9324679233875972e-05, + "loss": 0.3863, + "step": 4608 + }, + { + "epoch": 1.8743391622610819, + "grad_norm": 27.985640999652414, + "learning_rate": 1.932431316896009e-05, + "loss": 1.1597, + "step": 4609 + }, + { + "epoch": 
1.8747458316388776, + "grad_norm": 10.46329442517987, + "learning_rate": 1.9323947008325444e-05, + "loss": 0.6609, + "step": 4610 + }, + { + "epoch": 1.8751525010166734, + "grad_norm": 5.789789510399069, + "learning_rate": 1.9323580751975787e-05, + "loss": 0.15, + "step": 4611 + }, + { + "epoch": 1.8755591703944692, + "grad_norm": 9.947776590593957, + "learning_rate": 1.932321439991488e-05, + "loss": 0.2056, + "step": 4612 + }, + { + "epoch": 1.8759658397722652, + "grad_norm": 5.862803239782415, + "learning_rate": 1.9322847952146486e-05, + "loss": 0.1689, + "step": 4613 + }, + { + "epoch": 1.8763725091500612, + "grad_norm": 0.9374815943803803, + "learning_rate": 1.9322481408674364e-05, + "loss": 0.0166, + "step": 4614 + }, + { + "epoch": 1.876779178527857, + "grad_norm": 18.141671046552972, + "learning_rate": 1.932211476950228e-05, + "loss": 1.0039, + "step": 4615 + }, + { + "epoch": 1.8771858479056527, + "grad_norm": 6.612782692255168, + "learning_rate": 1.9321748034633998e-05, + "loss": 0.1056, + "step": 4616 + }, + { + "epoch": 1.8775925172834484, + "grad_norm": 7.33271275396753, + "learning_rate": 1.932138120407328e-05, + "loss": 0.1711, + "step": 4617 + }, + { + "epoch": 1.8779991866612444, + "grad_norm": 12.083443403119505, + "learning_rate": 1.932101427782389e-05, + "loss": 0.5749, + "step": 4618 + }, + { + "epoch": 1.8784058560390404, + "grad_norm": 11.846638797306666, + "learning_rate": 1.9320647255889603e-05, + "loss": 0.5975, + "step": 4619 + }, + { + "epoch": 1.8788125254168362, + "grad_norm": 19.93766096785942, + "learning_rate": 1.9320280138274175e-05, + "loss": 0.7434, + "step": 4620 + }, + { + "epoch": 1.879219194794632, + "grad_norm": 10.59950478780526, + "learning_rate": 1.9319912924981382e-05, + "loss": 0.5586, + "step": 4621 + }, + { + "epoch": 1.8796258641724277, + "grad_norm": 9.627799941745206, + "learning_rate": 1.9319545616014994e-05, + "loss": 0.2257, + "step": 4622 + }, + { + "epoch": 1.8800325335502237, + "grad_norm": 64.98158622035383, + "learning_rate": 1.931917821137878e-05, + "loss": 0.7919, + "step": 4623 + }, + { + "epoch": 1.8804392029280195, + "grad_norm": 10.518799673604782, + "learning_rate": 1.9318810711076508e-05, + "loss": 0.3132, + "step": 4624 + }, + { + "epoch": 1.8808458723058155, + "grad_norm": 3.853705772831479, + "learning_rate": 1.9318443115111956e-05, + "loss": 0.0467, + "step": 4625 + }, + { + "epoch": 1.8812525416836112, + "grad_norm": 21.884976803987588, + "learning_rate": 1.9318075423488896e-05, + "loss": 0.7682, + "step": 4626 + }, + { + "epoch": 1.881659211061407, + "grad_norm": 7.837381497938199, + "learning_rate": 1.9317707636211104e-05, + "loss": 0.1535, + "step": 4627 + }, + { + "epoch": 1.882065880439203, + "grad_norm": 18.939050511328126, + "learning_rate": 1.931733975328235e-05, + "loss": 0.9062, + "step": 4628 + }, + { + "epoch": 1.8824725498169987, + "grad_norm": 9.336330306178365, + "learning_rate": 1.9316971774706416e-05, + "loss": 0.386, + "step": 4629 + }, + { + "epoch": 1.8828792191947947, + "grad_norm": 2.513483991231874, + "learning_rate": 1.9316603700487075e-05, + "loss": 0.038, + "step": 4630 + }, + { + "epoch": 1.8832858885725905, + "grad_norm": 17.09719875299694, + "learning_rate": 1.9316235530628112e-05, + "loss": 0.5412, + "step": 4631 + }, + { + "epoch": 1.8836925579503863, + "grad_norm": 12.105855204730528, + "learning_rate": 1.93158672651333e-05, + "loss": 0.7413, + "step": 4632 + }, + { + "epoch": 1.884099227328182, + "grad_norm": 11.516820738456978, + "learning_rate": 1.9315498904006422e-05, + "loss": 
0.3141, + "step": 4633 + }, + { + "epoch": 1.884505896705978, + "grad_norm": 21.347710709009394, + "learning_rate": 1.931513044725126e-05, + "loss": 0.5111, + "step": 4634 + }, + { + "epoch": 1.884912566083774, + "grad_norm": 9.789396013559305, + "learning_rate": 1.9314761894871596e-05, + "loss": 0.2493, + "step": 4635 + }, + { + "epoch": 1.8853192354615698, + "grad_norm": 12.245063599693522, + "learning_rate": 1.9314393246871212e-05, + "loss": 0.2926, + "step": 4636 + }, + { + "epoch": 1.8857259048393655, + "grad_norm": 2.5739918696078887, + "learning_rate": 1.9314024503253893e-05, + "loss": 0.029, + "step": 4637 + }, + { + "epoch": 1.8861325742171613, + "grad_norm": 9.061529571104291, + "learning_rate": 1.931365566402343e-05, + "loss": 0.2529, + "step": 4638 + }, + { + "epoch": 1.8865392435949573, + "grad_norm": 9.002643815283308, + "learning_rate": 1.93132867291836e-05, + "loss": 0.2765, + "step": 4639 + }, + { + "epoch": 1.8869459129727533, + "grad_norm": 14.914451201005466, + "learning_rate": 1.9312917698738192e-05, + "loss": 0.4785, + "step": 4640 + }, + { + "epoch": 1.887352582350549, + "grad_norm": 18.617747764475258, + "learning_rate": 1.9312548572691e-05, + "loss": 0.8342, + "step": 4641 + }, + { + "epoch": 1.8877592517283448, + "grad_norm": 0.28774977067555346, + "learning_rate": 1.931217935104581e-05, + "loss": 0.0037, + "step": 4642 + }, + { + "epoch": 1.8881659211061406, + "grad_norm": 4.3853708291604265, + "learning_rate": 1.9311810033806414e-05, + "loss": 0.1067, + "step": 4643 + }, + { + "epoch": 1.8885725904839366, + "grad_norm": 10.418373249549498, + "learning_rate": 1.9311440620976597e-05, + "loss": 0.2405, + "step": 4644 + }, + { + "epoch": 1.8889792598617325, + "grad_norm": 5.971676893362264, + "learning_rate": 1.9311071112560158e-05, + "loss": 0.0908, + "step": 4645 + }, + { + "epoch": 1.8893859292395283, + "grad_norm": 7.638156821809928, + "learning_rate": 1.9310701508560886e-05, + "loss": 0.4833, + "step": 4646 + }, + { + "epoch": 1.889792598617324, + "grad_norm": 10.786332837551612, + "learning_rate": 1.9310331808982583e-05, + "loss": 0.2444, + "step": 4647 + }, + { + "epoch": 1.8901992679951198, + "grad_norm": 12.73565590648705, + "learning_rate": 1.9309962013829032e-05, + "loss": 0.7277, + "step": 4648 + }, + { + "epoch": 1.8906059373729158, + "grad_norm": 7.581755954812468, + "learning_rate": 1.930959212310404e-05, + "loss": 0.1428, + "step": 4649 + }, + { + "epoch": 1.8910126067507118, + "grad_norm": 13.643563723614056, + "learning_rate": 1.9309222136811394e-05, + "loss": 0.4995, + "step": 4650 + }, + { + "epoch": 1.8914192761285076, + "grad_norm": 12.457691562472377, + "learning_rate": 1.93088520549549e-05, + "loss": 0.4028, + "step": 4651 + }, + { + "epoch": 1.8918259455063033, + "grad_norm": 20.162586275817194, + "learning_rate": 1.9308481877538356e-05, + "loss": 0.9243, + "step": 4652 + }, + { + "epoch": 1.892232614884099, + "grad_norm": 11.591828857352045, + "learning_rate": 1.9308111604565557e-05, + "loss": 0.2571, + "step": 4653 + }, + { + "epoch": 1.892639284261895, + "grad_norm": 5.19125348638487, + "learning_rate": 1.930774123604031e-05, + "loss": 0.1378, + "step": 4654 + }, + { + "epoch": 1.893045953639691, + "grad_norm": 8.387670755850062, + "learning_rate": 1.9307370771966416e-05, + "loss": 0.2959, + "step": 4655 + }, + { + "epoch": 1.8934526230174868, + "grad_norm": 12.671825167993642, + "learning_rate": 1.9307000212347677e-05, + "loss": 0.5474, + "step": 4656 + }, + { + "epoch": 1.8938592923952826, + "grad_norm": 8.347199393696082, + 
"learning_rate": 1.9306629557187893e-05, + "loss": 0.5171, + "step": 4657 + }, + { + "epoch": 1.8942659617730784, + "grad_norm": 19.502558207091102, + "learning_rate": 1.9306258806490878e-05, + "loss": 1.2827, + "step": 4658 + }, + { + "epoch": 1.8946726311508744, + "grad_norm": 16.951934345795866, + "learning_rate": 1.9305887960260428e-05, + "loss": 0.5288, + "step": 4659 + }, + { + "epoch": 1.8950793005286704, + "grad_norm": 10.139231355147537, + "learning_rate": 1.9305517018500355e-05, + "loss": 0.2465, + "step": 4660 + }, + { + "epoch": 1.8954859699064661, + "grad_norm": 12.708804155107822, + "learning_rate": 1.9305145981214468e-05, + "loss": 1.079, + "step": 4661 + }, + { + "epoch": 1.8958926392842619, + "grad_norm": 16.521855202879777, + "learning_rate": 1.9304774848406575e-05, + "loss": 1.4327, + "step": 4662 + }, + { + "epoch": 1.8962993086620576, + "grad_norm": 5.327370053352101, + "learning_rate": 1.9304403620080484e-05, + "loss": 0.1094, + "step": 4663 + }, + { + "epoch": 1.8967059780398536, + "grad_norm": 2.1902133718586283, + "learning_rate": 1.9304032296240006e-05, + "loss": 0.0305, + "step": 4664 + }, + { + "epoch": 1.8971126474176494, + "grad_norm": 7.4258673660311905, + "learning_rate": 1.9303660876888958e-05, + "loss": 0.2369, + "step": 4665 + }, + { + "epoch": 1.8975193167954454, + "grad_norm": 9.672100595653426, + "learning_rate": 1.930328936203114e-05, + "loss": 0.4039, + "step": 4666 + }, + { + "epoch": 1.8979259861732412, + "grad_norm": 5.945365098125002, + "learning_rate": 1.930291775167038e-05, + "loss": 0.1809, + "step": 4667 + }, + { + "epoch": 1.898332655551037, + "grad_norm": 12.071908870703076, + "learning_rate": 1.9302546045810492e-05, + "loss": 0.6433, + "step": 4668 + }, + { + "epoch": 1.898739324928833, + "grad_norm": 18.217583039798686, + "learning_rate": 1.930217424445528e-05, + "loss": 0.8293, + "step": 4669 + }, + { + "epoch": 1.8991459943066287, + "grad_norm": 14.108294459353429, + "learning_rate": 1.930180234760857e-05, + "loss": 0.3328, + "step": 4670 + }, + { + "epoch": 1.8995526636844247, + "grad_norm": 8.27124589263237, + "learning_rate": 1.9301430355274177e-05, + "loss": 0.2441, + "step": 4671 + }, + { + "epoch": 1.8999593330622204, + "grad_norm": 19.387682303365345, + "learning_rate": 1.9301058267455924e-05, + "loss": 1.1289, + "step": 4672 + }, + { + "epoch": 1.9003660024400162, + "grad_norm": 14.59832470949465, + "learning_rate": 1.930068608415762e-05, + "loss": 0.6482, + "step": 4673 + }, + { + "epoch": 1.900772671817812, + "grad_norm": 4.52283319196575, + "learning_rate": 1.9300313805383095e-05, + "loss": 0.0884, + "step": 4674 + }, + { + "epoch": 1.901179341195608, + "grad_norm": 11.093396148937948, + "learning_rate": 1.929994143113617e-05, + "loss": 0.4623, + "step": 4675 + }, + { + "epoch": 1.901586010573404, + "grad_norm": 32.59569056157134, + "learning_rate": 1.9299568961420664e-05, + "loss": 1.0507, + "step": 4676 + }, + { + "epoch": 1.9019926799511997, + "grad_norm": 8.265242096723895, + "learning_rate": 1.9299196396240402e-05, + "loss": 0.2945, + "step": 4677 + }, + { + "epoch": 1.9023993493289955, + "grad_norm": 6.6055588258412, + "learning_rate": 1.929882373559921e-05, + "loss": 0.3786, + "step": 4678 + }, + { + "epoch": 1.9028060187067912, + "grad_norm": 5.456808807342623, + "learning_rate": 1.9298450979500913e-05, + "loss": 0.1342, + "step": 4679 + }, + { + "epoch": 1.9032126880845872, + "grad_norm": 13.252290491533905, + "learning_rate": 1.9298078127949335e-05, + "loss": 0.5768, + "step": 4680 + }, + { + "epoch": 
1.9036193574623832, + "grad_norm": 18.161504266719277, + "learning_rate": 1.9297705180948308e-05, + "loss": 0.8668, + "step": 4681 + }, + { + "epoch": 1.904026026840179, + "grad_norm": 8.662159943809328, + "learning_rate": 1.9297332138501654e-05, + "loss": 0.1815, + "step": 4682 + }, + { + "epoch": 1.9044326962179747, + "grad_norm": 18.470310738154993, + "learning_rate": 1.9296959000613207e-05, + "loss": 1.0389, + "step": 4683 + }, + { + "epoch": 1.9048393655957705, + "grad_norm": 9.556082515467276, + "learning_rate": 1.92965857672868e-05, + "loss": 0.6231, + "step": 4684 + }, + { + "epoch": 1.9052460349735665, + "grad_norm": 9.231307550276249, + "learning_rate": 1.929621243852626e-05, + "loss": 0.2834, + "step": 4685 + }, + { + "epoch": 1.9056527043513625, + "grad_norm": 0.49152431140136976, + "learning_rate": 1.929583901433542e-05, + "loss": 0.006, + "step": 4686 + }, + { + "epoch": 1.9060593737291582, + "grad_norm": 14.4319048582074, + "learning_rate": 1.929546549471811e-05, + "loss": 0.9892, + "step": 4687 + }, + { + "epoch": 1.906466043106954, + "grad_norm": 11.205289090293185, + "learning_rate": 1.9295091879678173e-05, + "loss": 0.3876, + "step": 4688 + }, + { + "epoch": 1.9068727124847498, + "grad_norm": 14.710921237387558, + "learning_rate": 1.929471816921944e-05, + "loss": 0.7615, + "step": 4689 + }, + { + "epoch": 1.9072793818625458, + "grad_norm": 13.505440023006637, + "learning_rate": 1.9294344363345746e-05, + "loss": 0.5, + "step": 4690 + }, + { + "epoch": 1.9076860512403417, + "grad_norm": 4.143658225613253, + "learning_rate": 1.9293970462060928e-05, + "loss": 0.0915, + "step": 4691 + }, + { + "epoch": 1.9080927206181375, + "grad_norm": 7.061271517119897, + "learning_rate": 1.929359646536883e-05, + "loss": 0.189, + "step": 4692 + }, + { + "epoch": 1.9084993899959333, + "grad_norm": 10.026703333373918, + "learning_rate": 1.929322237327328e-05, + "loss": 0.4229, + "step": 4693 + }, + { + "epoch": 1.908906059373729, + "grad_norm": 10.845509296176282, + "learning_rate": 1.929284818577813e-05, + "loss": 0.5421, + "step": 4694 + }, + { + "epoch": 1.909312728751525, + "grad_norm": 5.605099681206174, + "learning_rate": 1.9292473902887216e-05, + "loss": 0.054, + "step": 4695 + }, + { + "epoch": 1.909719398129321, + "grad_norm": 9.048506512311672, + "learning_rate": 1.9292099524604382e-05, + "loss": 0.3548, + "step": 4696 + }, + { + "epoch": 1.9101260675071168, + "grad_norm": 13.593248527967438, + "learning_rate": 1.929172505093347e-05, + "loss": 0.2985, + "step": 4697 + }, + { + "epoch": 1.9105327368849125, + "grad_norm": 14.338585879444345, + "learning_rate": 1.929135048187832e-05, + "loss": 0.7509, + "step": 4698 + }, + { + "epoch": 1.9109394062627083, + "grad_norm": 7.6601663767723025, + "learning_rate": 1.9290975817442783e-05, + "loss": 0.4016, + "step": 4699 + }, + { + "epoch": 1.9113460756405043, + "grad_norm": 8.740661486160757, + "learning_rate": 1.9290601057630703e-05, + "loss": 0.2233, + "step": 4700 + }, + { + "epoch": 1.9117527450183003, + "grad_norm": 6.530701655174065, + "learning_rate": 1.9290226202445924e-05, + "loss": 0.2464, + "step": 4701 + }, + { + "epoch": 1.912159414396096, + "grad_norm": 13.282241806976252, + "learning_rate": 1.9289851251892304e-05, + "loss": 0.6539, + "step": 4702 + }, + { + "epoch": 1.9125660837738918, + "grad_norm": 9.586412627112473, + "learning_rate": 1.928947620597368e-05, + "loss": 0.3646, + "step": 4703 + }, + { + "epoch": 1.9129727531516876, + "grad_norm": 1.0102487628335626, + "learning_rate": 1.928910106469391e-05, + "loss": 0.0174, + 
"step": 4704 + }, + { + "epoch": 1.9133794225294836, + "grad_norm": 11.771441955720997, + "learning_rate": 1.9288725828056843e-05, + "loss": 0.625, + "step": 4705 + }, + { + "epoch": 1.9137860919072793, + "grad_norm": 7.247515349377008, + "learning_rate": 1.928835049606633e-05, + "loss": 0.1914, + "step": 4706 + }, + { + "epoch": 1.9141927612850753, + "grad_norm": 6.114963974635307, + "learning_rate": 1.9287975068726224e-05, + "loss": 0.3051, + "step": 4707 + }, + { + "epoch": 1.914599430662871, + "grad_norm": 9.649953546486165, + "learning_rate": 1.9287599546040378e-05, + "loss": 0.2085, + "step": 4708 + }, + { + "epoch": 1.9150061000406668, + "grad_norm": 19.427417251780604, + "learning_rate": 1.928722392801265e-05, + "loss": 0.5939, + "step": 4709 + }, + { + "epoch": 1.9154127694184628, + "grad_norm": 9.568754840567506, + "learning_rate": 1.9286848214646895e-05, + "loss": 0.5956, + "step": 4710 + }, + { + "epoch": 1.9158194387962586, + "grad_norm": 8.383645918401163, + "learning_rate": 1.9286472405946966e-05, + "loss": 0.2223, + "step": 4711 + }, + { + "epoch": 1.9162261081740546, + "grad_norm": 15.609473052852882, + "learning_rate": 1.9286096501916726e-05, + "loss": 0.5929, + "step": 4712 + }, + { + "epoch": 1.9166327775518504, + "grad_norm": 11.60850769548377, + "learning_rate": 1.9285720502560032e-05, + "loss": 0.5048, + "step": 4713 + }, + { + "epoch": 1.9170394469296461, + "grad_norm": 34.65464516475276, + "learning_rate": 1.9285344407880744e-05, + "loss": 1.1543, + "step": 4714 + }, + { + "epoch": 1.9174461163074419, + "grad_norm": 7.450472858432942, + "learning_rate": 1.9284968217882724e-05, + "loss": 0.2134, + "step": 4715 + }, + { + "epoch": 1.9178527856852379, + "grad_norm": 24.652333734226506, + "learning_rate": 1.9284591932569828e-05, + "loss": 1.1195, + "step": 4716 + }, + { + "epoch": 1.9182594550630339, + "grad_norm": 9.003288913227832, + "learning_rate": 1.9284215551945925e-05, + "loss": 0.1404, + "step": 4717 + }, + { + "epoch": 1.9186661244408296, + "grad_norm": 0.5053286640815534, + "learning_rate": 1.9283839076014878e-05, + "loss": 0.0084, + "step": 4718 + }, + { + "epoch": 1.9190727938186254, + "grad_norm": 5.706789956729607, + "learning_rate": 1.928346250478055e-05, + "loss": 0.1482, + "step": 4719 + }, + { + "epoch": 1.9194794631964212, + "grad_norm": 2.192563979843798, + "learning_rate": 1.92830858382468e-05, + "loss": 0.039, + "step": 4720 + }, + { + "epoch": 1.9198861325742171, + "grad_norm": 17.99930866307912, + "learning_rate": 1.9282709076417512e-05, + "loss": 1.1602, + "step": 4721 + }, + { + "epoch": 1.9202928019520131, + "grad_norm": 18.141277729348985, + "learning_rate": 1.928233221929654e-05, + "loss": 1.5193, + "step": 4722 + }, + { + "epoch": 1.920699471329809, + "grad_norm": 11.156720857016246, + "learning_rate": 1.9281955266887755e-05, + "loss": 0.1908, + "step": 4723 + }, + { + "epoch": 1.9211061407076047, + "grad_norm": 4.615938048879251, + "learning_rate": 1.928157821919503e-05, + "loss": 0.0628, + "step": 4724 + }, + { + "epoch": 1.9215128100854004, + "grad_norm": 9.327020943447982, + "learning_rate": 1.928120107622223e-05, + "loss": 0.2529, + "step": 4725 + }, + { + "epoch": 1.9219194794631964, + "grad_norm": 15.599867426365476, + "learning_rate": 1.928082383797323e-05, + "loss": 0.5645, + "step": 4726 + }, + { + "epoch": 1.9223261488409924, + "grad_norm": 4.107545664867792, + "learning_rate": 1.9280446504451903e-05, + "loss": 0.0804, + "step": 4727 + }, + { + "epoch": 1.9227328182187882, + "grad_norm": 24.132598091511692, + "learning_rate": 
1.9280069075662123e-05, + "loss": 0.7191, + "step": 4728 + }, + { + "epoch": 1.923139487596584, + "grad_norm": 0.6239278907132312, + "learning_rate": 1.9279691551607764e-05, + "loss": 0.0091, + "step": 4729 + }, + { + "epoch": 1.9235461569743797, + "grad_norm": 6.743593647088014, + "learning_rate": 1.9279313932292698e-05, + "loss": 0.4307, + "step": 4730 + }, + { + "epoch": 1.9239528263521757, + "grad_norm": 3.023425366363425, + "learning_rate": 1.9278936217720808e-05, + "loss": 0.0428, + "step": 4731 + }, + { + "epoch": 1.9243594957299717, + "grad_norm": 14.33513712131537, + "learning_rate": 1.9278558407895963e-05, + "loss": 0.5508, + "step": 4732 + }, + { + "epoch": 1.9247661651077674, + "grad_norm": 16.234522825730092, + "learning_rate": 1.927818050282205e-05, + "loss": 0.58, + "step": 4733 + }, + { + "epoch": 1.9251728344855632, + "grad_norm": 2.6075107293892796, + "learning_rate": 1.9277802502502946e-05, + "loss": 0.0356, + "step": 4734 + }, + { + "epoch": 1.925579503863359, + "grad_norm": 7.996520932699867, + "learning_rate": 1.9277424406942528e-05, + "loss": 0.3548, + "step": 4735 + }, + { + "epoch": 1.925986173241155, + "grad_norm": 8.477334069709498, + "learning_rate": 1.9277046216144676e-05, + "loss": 0.1894, + "step": 4736 + }, + { + "epoch": 1.926392842618951, + "grad_norm": 4.947688545881248, + "learning_rate": 1.9276667930113277e-05, + "loss": 0.087, + "step": 4737 + }, + { + "epoch": 1.9267995119967467, + "grad_norm": 15.133076055073792, + "learning_rate": 1.9276289548852217e-05, + "loss": 0.6828, + "step": 4738 + }, + { + "epoch": 1.9272061813745425, + "grad_norm": 1.030640684013853, + "learning_rate": 1.9275911072365372e-05, + "loss": 0.0155, + "step": 4739 + }, + { + "epoch": 1.9276128507523382, + "grad_norm": 7.300558746456868, + "learning_rate": 1.927553250065663e-05, + "loss": 0.411, + "step": 4740 + }, + { + "epoch": 1.9280195201301342, + "grad_norm": 21.972778604124397, + "learning_rate": 1.9275153833729884e-05, + "loss": 1.2202, + "step": 4741 + }, + { + "epoch": 1.9284261895079302, + "grad_norm": 2.459375661285801, + "learning_rate": 1.9274775071589014e-05, + "loss": 0.039, + "step": 4742 + }, + { + "epoch": 1.928832858885726, + "grad_norm": 0.8435162973215279, + "learning_rate": 1.9274396214237908e-05, + "loss": 0.0136, + "step": 4743 + }, + { + "epoch": 1.9292395282635217, + "grad_norm": 10.366786278908956, + "learning_rate": 1.9274017261680457e-05, + "loss": 0.317, + "step": 4744 + }, + { + "epoch": 1.9296461976413175, + "grad_norm": 20.706794049860534, + "learning_rate": 1.927363821392055e-05, + "loss": 0.6915, + "step": 4745 + }, + { + "epoch": 1.9300528670191135, + "grad_norm": 7.6555489094951295, + "learning_rate": 1.927325907096208e-05, + "loss": 0.2643, + "step": 4746 + }, + { + "epoch": 1.9304595363969093, + "grad_norm": 10.468358959538337, + "learning_rate": 1.9272879832808946e-05, + "loss": 0.6919, + "step": 4747 + }, + { + "epoch": 1.9308662057747052, + "grad_norm": 10.458694657240885, + "learning_rate": 1.9272500499465024e-05, + "loss": 0.6542, + "step": 4748 + }, + { + "epoch": 1.931272875152501, + "grad_norm": 9.726589515253067, + "learning_rate": 1.927212107093422e-05, + "loss": 0.4227, + "step": 4749 + }, + { + "epoch": 1.9316795445302968, + "grad_norm": 15.199353518705939, + "learning_rate": 1.9271741547220434e-05, + "loss": 0.4839, + "step": 4750 + }, + { + "epoch": 1.9320862139080928, + "grad_norm": 9.021768522758412, + "learning_rate": 1.9271361928327547e-05, + "loss": 0.2506, + "step": 4751 + }, + { + "epoch": 1.9324928832858885, + 
"grad_norm": 1.4473161035150592, + "learning_rate": 1.9270982214259467e-05, + "loss": 0.0215, + "step": 4752 + }, + { + "epoch": 1.9328995526636845, + "grad_norm": 6.87041956297806, + "learning_rate": 1.9270602405020083e-05, + "loss": 0.3811, + "step": 4753 + }, + { + "epoch": 1.9333062220414803, + "grad_norm": 10.225223879396895, + "learning_rate": 1.9270222500613305e-05, + "loss": 0.5264, + "step": 4754 + }, + { + "epoch": 1.933712891419276, + "grad_norm": 13.81867887794812, + "learning_rate": 1.9269842501043026e-05, + "loss": 0.5384, + "step": 4755 + }, + { + "epoch": 1.934119560797072, + "grad_norm": 4.672435487714005, + "learning_rate": 1.9269462406313148e-05, + "loss": 0.1403, + "step": 4756 + }, + { + "epoch": 1.9345262301748678, + "grad_norm": 7.96467459770542, + "learning_rate": 1.9269082216427573e-05, + "loss": 0.2148, + "step": 4757 + }, + { + "epoch": 1.9349328995526638, + "grad_norm": 10.350118577315762, + "learning_rate": 1.9268701931390205e-05, + "loss": 0.4015, + "step": 4758 + }, + { + "epoch": 1.9353395689304596, + "grad_norm": 3.062048751991558, + "learning_rate": 1.9268321551204946e-05, + "loss": 0.0986, + "step": 4759 + }, + { + "epoch": 1.9357462383082553, + "grad_norm": 10.113791954611642, + "learning_rate": 1.92679410758757e-05, + "loss": 0.3126, + "step": 4760 + }, + { + "epoch": 1.936152907686051, + "grad_norm": 7.745320720400823, + "learning_rate": 1.9267560505406378e-05, + "loss": 0.1847, + "step": 4761 + }, + { + "epoch": 1.936559577063847, + "grad_norm": 4.646690229599793, + "learning_rate": 1.9267179839800882e-05, + "loss": 0.1198, + "step": 4762 + }, + { + "epoch": 1.936966246441643, + "grad_norm": 14.206207164303544, + "learning_rate": 1.926679907906312e-05, + "loss": 0.7177, + "step": 4763 + }, + { + "epoch": 1.9373729158194388, + "grad_norm": 8.413442303401036, + "learning_rate": 1.9266418223197e-05, + "loss": 0.2151, + "step": 4764 + }, + { + "epoch": 1.9377795851972346, + "grad_norm": 7.889287169910794, + "learning_rate": 1.9266037272206438e-05, + "loss": 0.1926, + "step": 4765 + }, + { + "epoch": 1.9381862545750304, + "grad_norm": 8.378516901858168, + "learning_rate": 1.9265656226095334e-05, + "loss": 0.4416, + "step": 4766 + }, + { + "epoch": 1.9385929239528263, + "grad_norm": 23.170565338576555, + "learning_rate": 1.926527508486761e-05, + "loss": 0.9617, + "step": 4767 + }, + { + "epoch": 1.9389995933306223, + "grad_norm": 0.8185818732176989, + "learning_rate": 1.9264893848527174e-05, + "loss": 0.0146, + "step": 4768 + }, + { + "epoch": 1.939406262708418, + "grad_norm": 12.27293218770277, + "learning_rate": 1.9264512517077935e-05, + "loss": 0.4523, + "step": 4769 + }, + { + "epoch": 1.9398129320862139, + "grad_norm": 12.58980454025412, + "learning_rate": 1.9264131090523816e-05, + "loss": 0.5134, + "step": 4770 + }, + { + "epoch": 1.9402196014640096, + "grad_norm": 9.78852742662531, + "learning_rate": 1.926374956886873e-05, + "loss": 0.2935, + "step": 4771 + }, + { + "epoch": 1.9406262708418056, + "grad_norm": 6.8343755453835, + "learning_rate": 1.926336795211659e-05, + "loss": 0.301, + "step": 4772 + }, + { + "epoch": 1.9410329402196016, + "grad_norm": 1.454252137892868, + "learning_rate": 1.9262986240271317e-05, + "loss": 0.0161, + "step": 4773 + }, + { + "epoch": 1.9414396095973974, + "grad_norm": 13.510448376928537, + "learning_rate": 1.926260443333683e-05, + "loss": 0.4761, + "step": 4774 + }, + { + "epoch": 1.9418462789751931, + "grad_norm": 7.319414169868101, + "learning_rate": 1.9262222531317042e-05, + "loss": 0.1719, + "step": 4775 + }, + { + 
"epoch": 1.942252948352989, + "grad_norm": 8.666761369667288, + "learning_rate": 1.9261840534215882e-05, + "loss": 0.2581, + "step": 4776 + }, + { + "epoch": 1.9426596177307849, + "grad_norm": 17.40252576520353, + "learning_rate": 1.9261458442037266e-05, + "loss": 0.5018, + "step": 4777 + }, + { + "epoch": 1.9430662871085809, + "grad_norm": 12.833317284665851, + "learning_rate": 1.9261076254785117e-05, + "loss": 0.7859, + "step": 4778 + }, + { + "epoch": 1.9434729564863766, + "grad_norm": 16.762762649105312, + "learning_rate": 1.926069397246336e-05, + "loss": 0.5365, + "step": 4779 + }, + { + "epoch": 1.9438796258641724, + "grad_norm": 7.588180476836137, + "learning_rate": 1.9260311595075922e-05, + "loss": 0.2454, + "step": 4780 + }, + { + "epoch": 1.9442862952419682, + "grad_norm": 5.0555148863604025, + "learning_rate": 1.9259929122626723e-05, + "loss": 0.0895, + "step": 4781 + }, + { + "epoch": 1.9446929646197642, + "grad_norm": 2.831364832111475, + "learning_rate": 1.9259546555119688e-05, + "loss": 0.054, + "step": 4782 + }, + { + "epoch": 1.9450996339975601, + "grad_norm": 11.389780090791627, + "learning_rate": 1.9259163892558748e-05, + "loss": 0.6, + "step": 4783 + }, + { + "epoch": 1.945506303375356, + "grad_norm": 9.954928986021805, + "learning_rate": 1.9258781134947833e-05, + "loss": 0.3606, + "step": 4784 + }, + { + "epoch": 1.9459129727531517, + "grad_norm": 3.4134048463684126, + "learning_rate": 1.925839828229087e-05, + "loss": 0.0851, + "step": 4785 + }, + { + "epoch": 1.9463196421309474, + "grad_norm": 2.3449350986329476, + "learning_rate": 1.9258015334591786e-05, + "loss": 0.036, + "step": 4786 + }, + { + "epoch": 1.9467263115087434, + "grad_norm": 14.465103597712462, + "learning_rate": 1.9257632291854517e-05, + "loss": 0.6695, + "step": 4787 + }, + { + "epoch": 1.9471329808865392, + "grad_norm": 6.684013550774916, + "learning_rate": 1.9257249154082994e-05, + "loss": 0.1659, + "step": 4788 + }, + { + "epoch": 1.9475396502643352, + "grad_norm": 7.945301412180344, + "learning_rate": 1.9256865921281148e-05, + "loss": 0.3121, + "step": 4789 + }, + { + "epoch": 1.947946319642131, + "grad_norm": 16.300593702913456, + "learning_rate": 1.9256482593452914e-05, + "loss": 1.0103, + "step": 4790 + }, + { + "epoch": 1.9483529890199267, + "grad_norm": 5.202917428143451, + "learning_rate": 1.9256099170602228e-05, + "loss": 0.1023, + "step": 4791 + }, + { + "epoch": 1.9487596583977227, + "grad_norm": 18.637159817852766, + "learning_rate": 1.9255715652733025e-05, + "loss": 0.6927, + "step": 4792 + }, + { + "epoch": 1.9491663277755185, + "grad_norm": 12.098725094860118, + "learning_rate": 1.9255332039849244e-05, + "loss": 1.0717, + "step": 4793 + }, + { + "epoch": 1.9495729971533144, + "grad_norm": 4.354374584731374, + "learning_rate": 1.925494833195482e-05, + "loss": 0.1278, + "step": 4794 + }, + { + "epoch": 1.9499796665311102, + "grad_norm": 22.927861074982584, + "learning_rate": 1.9254564529053697e-05, + "loss": 0.7808, + "step": 4795 + }, + { + "epoch": 1.950386335908906, + "grad_norm": 12.658625736378912, + "learning_rate": 1.9254180631149807e-05, + "loss": 0.7043, + "step": 4796 + }, + { + "epoch": 1.950793005286702, + "grad_norm": 7.304941023925148, + "learning_rate": 1.92537966382471e-05, + "loss": 0.1837, + "step": 4797 + }, + { + "epoch": 1.9511996746644977, + "grad_norm": 15.769436483054724, + "learning_rate": 1.9253412550349507e-05, + "loss": 0.9486, + "step": 4798 + }, + { + "epoch": 1.9516063440422937, + "grad_norm": 16.45874539217988, + "learning_rate": 1.9253028367460982e-05, 
+ "loss": 1.1188, + "step": 4799 + }, + { + "epoch": 1.9520130134200895, + "grad_norm": 3.2429940644490185, + "learning_rate": 1.925264408958546e-05, + "loss": 0.0542, + "step": 4800 + }, + { + "epoch": 1.9524196827978852, + "grad_norm": 7.390809900840705, + "learning_rate": 1.9252259716726893e-05, + "loss": 0.3662, + "step": 4801 + }, + { + "epoch": 1.952826352175681, + "grad_norm": 9.711206424857105, + "learning_rate": 1.9251875248889223e-05, + "loss": 0.3226, + "step": 4802 + }, + { + "epoch": 1.953233021553477, + "grad_norm": 12.18862640825759, + "learning_rate": 1.9251490686076395e-05, + "loss": 0.902, + "step": 4803 + }, + { + "epoch": 1.953639690931273, + "grad_norm": 24.112436632930375, + "learning_rate": 1.925110602829236e-05, + "loss": 0.6827, + "step": 4804 + }, + { + "epoch": 1.9540463603090688, + "grad_norm": 12.913688474067413, + "learning_rate": 1.9250721275541067e-05, + "loss": 0.3941, + "step": 4805 + }, + { + "epoch": 1.9544530296868645, + "grad_norm": 8.696757840155353, + "learning_rate": 1.9250336427826465e-05, + "loss": 0.4028, + "step": 4806 + }, + { + "epoch": 1.9548596990646603, + "grad_norm": 2.5802206218734076, + "learning_rate": 1.92499514851525e-05, + "loss": 0.0521, + "step": 4807 + }, + { + "epoch": 1.9552663684424563, + "grad_norm": 9.533951227406192, + "learning_rate": 1.9249566447523132e-05, + "loss": 0.1948, + "step": 4808 + }, + { + "epoch": 1.9556730378202523, + "grad_norm": 13.231835442248714, + "learning_rate": 1.9249181314942306e-05, + "loss": 0.4417, + "step": 4809 + }, + { + "epoch": 1.956079707198048, + "grad_norm": 1.9759883830068021, + "learning_rate": 1.924879608741398e-05, + "loss": 0.0215, + "step": 4810 + }, + { + "epoch": 1.9564863765758438, + "grad_norm": 19.703467507149618, + "learning_rate": 1.9248410764942106e-05, + "loss": 0.2373, + "step": 4811 + }, + { + "epoch": 1.9568930459536396, + "grad_norm": 8.296161400531654, + "learning_rate": 1.9248025347530643e-05, + "loss": 0.269, + "step": 4812 + }, + { + "epoch": 1.9572997153314355, + "grad_norm": 19.610321339725047, + "learning_rate": 1.9247639835183544e-05, + "loss": 1.0003, + "step": 4813 + }, + { + "epoch": 1.9577063847092315, + "grad_norm": 6.206673792115348, + "learning_rate": 1.924725422790477e-05, + "loss": 0.0721, + "step": 4814 + }, + { + "epoch": 1.9581130540870273, + "grad_norm": 15.085163176343977, + "learning_rate": 1.9246868525698276e-05, + "loss": 0.8153, + "step": 4815 + }, + { + "epoch": 1.958519723464823, + "grad_norm": 112.35830791095583, + "learning_rate": 1.924648272856802e-05, + "loss": 0.818, + "step": 4816 + }, + { + "epoch": 1.9589263928426188, + "grad_norm": 9.298907596793978, + "learning_rate": 1.924609683651797e-05, + "loss": 0.4599, + "step": 4817 + }, + { + "epoch": 1.9593330622204148, + "grad_norm": 8.860755521708548, + "learning_rate": 1.9245710849552076e-05, + "loss": 0.4954, + "step": 4818 + }, + { + "epoch": 1.9597397315982108, + "grad_norm": 15.223370711281056, + "learning_rate": 1.9245324767674314e-05, + "loss": 0.7361, + "step": 4819 + }, + { + "epoch": 1.9601464009760066, + "grad_norm": 8.821991700206205, + "learning_rate": 1.924493859088864e-05, + "loss": 0.2342, + "step": 4820 + }, + { + "epoch": 1.9605530703538023, + "grad_norm": 8.476316300575828, + "learning_rate": 1.9244552319199014e-05, + "loss": 0.3829, + "step": 4821 + }, + { + "epoch": 1.960959739731598, + "grad_norm": 2.665151541833774, + "learning_rate": 1.9244165952609406e-05, + "loss": 0.0404, + "step": 4822 + }, + { + "epoch": 1.961366409109394, + "grad_norm": 9.215960938976536, + 
"learning_rate": 1.9243779491123783e-05, + "loss": 0.199, + "step": 4823 + }, + { + "epoch": 1.96177307848719, + "grad_norm": 10.802523117175738, + "learning_rate": 1.9243392934746113e-05, + "loss": 0.4292, + "step": 4824 + }, + { + "epoch": 1.9621797478649858, + "grad_norm": 9.375577622540215, + "learning_rate": 1.924300628348036e-05, + "loss": 0.3664, + "step": 4825 + }, + { + "epoch": 1.9625864172427816, + "grad_norm": 9.746450408494752, + "learning_rate": 1.9242619537330495e-05, + "loss": 0.248, + "step": 4826 + }, + { + "epoch": 1.9629930866205774, + "grad_norm": 4.519411676246105, + "learning_rate": 1.924223269630049e-05, + "loss": 0.0642, + "step": 4827 + }, + { + "epoch": 1.9633997559983734, + "grad_norm": 3.1470645733713036, + "learning_rate": 1.9241845760394317e-05, + "loss": 0.0512, + "step": 4828 + }, + { + "epoch": 1.9638064253761693, + "grad_norm": 15.02356737055167, + "learning_rate": 1.9241458729615943e-05, + "loss": 0.6542, + "step": 4829 + }, + { + "epoch": 1.964213094753965, + "grad_norm": 12.549051567344511, + "learning_rate": 1.9241071603969344e-05, + "loss": 0.5184, + "step": 4830 + }, + { + "epoch": 1.9646197641317609, + "grad_norm": 5.062195375951375, + "learning_rate": 1.9240684383458496e-05, + "loss": 0.1272, + "step": 4831 + }, + { + "epoch": 1.9650264335095566, + "grad_norm": 16.50887321477799, + "learning_rate": 1.924029706808737e-05, + "loss": 0.4102, + "step": 4832 + }, + { + "epoch": 1.9654331028873526, + "grad_norm": 11.001980485032226, + "learning_rate": 1.9239909657859946e-05, + "loss": 0.2825, + "step": 4833 + }, + { + "epoch": 1.9658397722651484, + "grad_norm": 7.05632761080379, + "learning_rate": 1.9239522152780198e-05, + "loss": 0.3117, + "step": 4834 + }, + { + "epoch": 1.9662464416429444, + "grad_norm": 12.64233406415634, + "learning_rate": 1.9239134552852106e-05, + "loss": 0.4689, + "step": 4835 + }, + { + "epoch": 1.9666531110207401, + "grad_norm": 5.061572963950741, + "learning_rate": 1.9238746858079648e-05, + "loss": 0.2724, + "step": 4836 + }, + { + "epoch": 1.967059780398536, + "grad_norm": 34.091086284342225, + "learning_rate": 1.92383590684668e-05, + "loss": 0.3852, + "step": 4837 + }, + { + "epoch": 1.967466449776332, + "grad_norm": 12.331038923125835, + "learning_rate": 1.923797118401755e-05, + "loss": 0.454, + "step": 4838 + }, + { + "epoch": 1.9678731191541277, + "grad_norm": 1.3729462046616623, + "learning_rate": 1.9237583204735877e-05, + "loss": 0.0252, + "step": 4839 + }, + { + "epoch": 1.9682797885319236, + "grad_norm": 29.17790464804428, + "learning_rate": 1.9237195130625764e-05, + "loss": 0.6636, + "step": 4840 + }, + { + "epoch": 1.9686864579097194, + "grad_norm": 4.697850387404925, + "learning_rate": 1.923680696169119e-05, + "loss": 0.0987, + "step": 4841 + }, + { + "epoch": 1.9690931272875152, + "grad_norm": 14.437343652815818, + "learning_rate": 1.9236418697936145e-05, + "loss": 0.92, + "step": 4842 + }, + { + "epoch": 1.969499796665311, + "grad_norm": 2.026113843894073, + "learning_rate": 1.9236030339364613e-05, + "loss": 0.0419, + "step": 4843 + }, + { + "epoch": 1.969906466043107, + "grad_norm": 11.45572091276406, + "learning_rate": 1.9235641885980584e-05, + "loss": 0.3339, + "step": 4844 + }, + { + "epoch": 1.970313135420903, + "grad_norm": 2.649278938949608, + "learning_rate": 1.923525333778804e-05, + "loss": 0.0301, + "step": 4845 + }, + { + "epoch": 1.9707198047986987, + "grad_norm": 12.37322412756604, + "learning_rate": 1.9234864694790975e-05, + "loss": 0.6652, + "step": 4846 + }, + { + "epoch": 1.9711264741764944, + 
"grad_norm": 5.3538398762449475, + "learning_rate": 1.9234475956993374e-05, + "loss": 0.1139, + "step": 4847 + }, + { + "epoch": 1.9715331435542902, + "grad_norm": 0.9527401780501583, + "learning_rate": 1.923408712439923e-05, + "loss": 0.0101, + "step": 4848 + }, + { + "epoch": 1.9719398129320862, + "grad_norm": 5.595040604618749, + "learning_rate": 1.9233698197012535e-05, + "loss": 0.1685, + "step": 4849 + }, + { + "epoch": 1.9723464823098822, + "grad_norm": 17.880140354329555, + "learning_rate": 1.9233309174837283e-05, + "loss": 1.2919, + "step": 4850 + }, + { + "epoch": 1.972753151687678, + "grad_norm": 25.379020097350075, + "learning_rate": 1.923292005787746e-05, + "loss": 0.9139, + "step": 4851 + }, + { + "epoch": 1.9731598210654737, + "grad_norm": 0.5035838507536023, + "learning_rate": 1.923253084613707e-05, + "loss": 0.0059, + "step": 4852 + }, + { + "epoch": 1.9735664904432695, + "grad_norm": 7.676125952854286, + "learning_rate": 1.92321415396201e-05, + "loss": 0.1806, + "step": 4853 + }, + { + "epoch": 1.9739731598210655, + "grad_norm": 8.581104005572241, + "learning_rate": 1.9231752138330556e-05, + "loss": 0.179, + "step": 4854 + }, + { + "epoch": 1.9743798291988615, + "grad_norm": 1.3860777665161945, + "learning_rate": 1.923136264227243e-05, + "loss": 0.0196, + "step": 4855 + }, + { + "epoch": 1.9747864985766572, + "grad_norm": 8.126831146137095, + "learning_rate": 1.9230973051449715e-05, + "loss": 0.1342, + "step": 4856 + }, + { + "epoch": 1.975193167954453, + "grad_norm": 13.414137936078946, + "learning_rate": 1.923058336586642e-05, + "loss": 0.3657, + "step": 4857 + }, + { + "epoch": 1.9755998373322488, + "grad_norm": 8.398209195242831, + "learning_rate": 1.923019358552654e-05, + "loss": 0.1632, + "step": 4858 + }, + { + "epoch": 1.9760065067100447, + "grad_norm": 11.914162979874819, + "learning_rate": 1.9229803710434077e-05, + "loss": 0.2888, + "step": 4859 + }, + { + "epoch": 1.9764131760878407, + "grad_norm": 13.086970586808913, + "learning_rate": 1.9229413740593034e-05, + "loss": 0.604, + "step": 4860 + }, + { + "epoch": 1.9768198454656365, + "grad_norm": 6.819813107943543, + "learning_rate": 1.922902367600741e-05, + "loss": 0.4219, + "step": 4861 + }, + { + "epoch": 1.9772265148434323, + "grad_norm": 3.5854374901471786, + "learning_rate": 1.922863351668122e-05, + "loss": 0.0838, + "step": 4862 + }, + { + "epoch": 1.977633184221228, + "grad_norm": 24.986419026330154, + "learning_rate": 1.922824326261846e-05, + "loss": 1.0492, + "step": 4863 + }, + { + "epoch": 1.978039853599024, + "grad_norm": 5.937129007218296, + "learning_rate": 1.9227852913823136e-05, + "loss": 0.0894, + "step": 4864 + }, + { + "epoch": 1.97844652297682, + "grad_norm": 11.424008199352224, + "learning_rate": 1.922746247029926e-05, + "loss": 0.5887, + "step": 4865 + }, + { + "epoch": 1.9788531923546158, + "grad_norm": 19.181752341197303, + "learning_rate": 1.9227071932050835e-05, + "loss": 0.1961, + "step": 4866 + }, + { + "epoch": 1.9792598617324115, + "grad_norm": 30.839044395316222, + "learning_rate": 1.9226681299081877e-05, + "loss": 0.6913, + "step": 4867 + }, + { + "epoch": 1.9796665311102073, + "grad_norm": 15.565503509698722, + "learning_rate": 1.922629057139639e-05, + "loss": 0.8575, + "step": 4868 + }, + { + "epoch": 1.9800732004880033, + "grad_norm": 2.515569646765972, + "learning_rate": 1.9225899748998383e-05, + "loss": 0.0691, + "step": 4869 + }, + { + "epoch": 1.9804798698657993, + "grad_norm": 9.34498261330174, + "learning_rate": 1.9225508831891877e-05, + "loss": 0.6332, + "step": 4870 + 
}, + { + "epoch": 1.980886539243595, + "grad_norm": 9.986387381680004, + "learning_rate": 1.9225117820080877e-05, + "loss": 0.4923, + "step": 4871 + }, + { + "epoch": 1.9812932086213908, + "grad_norm": 16.716123396999606, + "learning_rate": 1.9224726713569398e-05, + "loss": 1.2544, + "step": 4872 + }, + { + "epoch": 1.9816998779991866, + "grad_norm": 7.158142864263455, + "learning_rate": 1.9224335512361462e-05, + "loss": 0.2604, + "step": 4873 + }, + { + "epoch": 1.9821065473769826, + "grad_norm": 1.7503632821559314, + "learning_rate": 1.9223944216461077e-05, + "loss": 0.018, + "step": 4874 + }, + { + "epoch": 1.9825132167547783, + "grad_norm": 7.376981819002088, + "learning_rate": 1.922355282587226e-05, + "loss": 0.1756, + "step": 4875 + }, + { + "epoch": 1.9829198861325743, + "grad_norm": 38.649748802108995, + "learning_rate": 1.9223161340599032e-05, + "loss": 1.1402, + "step": 4876 + }, + { + "epoch": 1.98332655551037, + "grad_norm": 8.351395598636028, + "learning_rate": 1.9222769760645412e-05, + "loss": 0.1082, + "step": 4877 + }, + { + "epoch": 1.9837332248881658, + "grad_norm": 7.017954737948558, + "learning_rate": 1.9222378086015417e-05, + "loss": 0.1283, + "step": 4878 + }, + { + "epoch": 1.9841398942659618, + "grad_norm": 2.6010167326517495, + "learning_rate": 1.922198631671307e-05, + "loss": 0.0822, + "step": 4879 + }, + { + "epoch": 1.9845465636437576, + "grad_norm": 5.162762159427014, + "learning_rate": 1.9221594452742396e-05, + "loss": 0.094, + "step": 4880 + }, + { + "epoch": 1.9849532330215536, + "grad_norm": 10.992091691830515, + "learning_rate": 1.922120249410741e-05, + "loss": 0.6346, + "step": 4881 + }, + { + "epoch": 1.9853599023993493, + "grad_norm": 9.93607502166858, + "learning_rate": 1.922081044081214e-05, + "loss": 0.276, + "step": 4882 + }, + { + "epoch": 1.985766571777145, + "grad_norm": 10.152188515086369, + "learning_rate": 1.922041829286061e-05, + "loss": 0.2763, + "step": 4883 + }, + { + "epoch": 1.9861732411549409, + "grad_norm": 13.101069126609422, + "learning_rate": 1.9220026050256847e-05, + "loss": 0.6291, + "step": 4884 + }, + { + "epoch": 1.9865799105327369, + "grad_norm": 5.904783599185037, + "learning_rate": 1.9219633713004876e-05, + "loss": 0.1454, + "step": 4885 + }, + { + "epoch": 1.9869865799105328, + "grad_norm": 11.39118891106363, + "learning_rate": 1.9219241281108723e-05, + "loss": 0.2342, + "step": 4886 + }, + { + "epoch": 1.9873932492883286, + "grad_norm": 8.64854024891307, + "learning_rate": 1.9218848754572417e-05, + "loss": 0.233, + "step": 4887 + }, + { + "epoch": 1.9877999186661244, + "grad_norm": 2.794626813340746, + "learning_rate": 1.9218456133399993e-05, + "loss": 0.0641, + "step": 4888 + }, + { + "epoch": 1.9882065880439201, + "grad_norm": 3.6458688000676145, + "learning_rate": 1.9218063417595476e-05, + "loss": 0.0613, + "step": 4889 + }, + { + "epoch": 1.9886132574217161, + "grad_norm": 33.896773178553964, + "learning_rate": 1.92176706071629e-05, + "loss": 2.0531, + "step": 4890 + }, + { + "epoch": 1.9890199267995121, + "grad_norm": 11.421000696789028, + "learning_rate": 1.9217277702106288e-05, + "loss": 0.3082, + "step": 4891 + }, + { + "epoch": 1.9894265961773079, + "grad_norm": 9.210819589832457, + "learning_rate": 1.9216884702429693e-05, + "loss": 0.1755, + "step": 4892 + }, + { + "epoch": 1.9898332655551036, + "grad_norm": 9.513591873727906, + "learning_rate": 1.921649160813713e-05, + "loss": 0.4113, + "step": 4893 + }, + { + "epoch": 1.9902399349328994, + "grad_norm": 7.6830271685683, + "learning_rate": 1.9216098419232644e-05, 
+ "loss": 0.1567, + "step": 4894 + }, + { + "epoch": 1.9906466043106954, + "grad_norm": 3.84726802925115, + "learning_rate": 1.921570513572027e-05, + "loss": 0.0851, + "step": 4895 + }, + { + "epoch": 1.9910532736884914, + "grad_norm": 7.7568066226301315, + "learning_rate": 1.9215311757604044e-05, + "loss": 0.2718, + "step": 4896 + }, + { + "epoch": 1.9914599430662872, + "grad_norm": 6.965761719709538, + "learning_rate": 1.9214918284888006e-05, + "loss": 0.2007, + "step": 4897 + }, + { + "epoch": 1.991866612444083, + "grad_norm": 5.519377424508803, + "learning_rate": 1.9214524717576192e-05, + "loss": 0.0704, + "step": 4898 + }, + { + "epoch": 1.9922732818218787, + "grad_norm": 1.557706529787146, + "learning_rate": 1.9214131055672648e-05, + "loss": 0.0238, + "step": 4899 + }, + { + "epoch": 1.9926799511996747, + "grad_norm": 12.520558748840092, + "learning_rate": 1.921373729918141e-05, + "loss": 0.8089, + "step": 4900 + }, + { + "epoch": 1.9930866205774707, + "grad_norm": 13.213243932680607, + "learning_rate": 1.9213343448106516e-05, + "loss": 0.2538, + "step": 4901 + }, + { + "epoch": 1.9934932899552664, + "grad_norm": 3.8728620597916503, + "learning_rate": 1.9212949502452017e-05, + "loss": 0.2838, + "step": 4902 + }, + { + "epoch": 1.9938999593330622, + "grad_norm": 7.737892281840445, + "learning_rate": 1.921255546222196e-05, + "loss": 0.3071, + "step": 4903 + }, + { + "epoch": 1.994306628710858, + "grad_norm": 21.402740058146943, + "learning_rate": 1.9212161327420378e-05, + "loss": 0.8162, + "step": 4904 + }, + { + "epoch": 1.994713298088654, + "grad_norm": 12.477518517918483, + "learning_rate": 1.9211767098051325e-05, + "loss": 0.426, + "step": 4905 + }, + { + "epoch": 1.99511996746645, + "grad_norm": 11.679531784359213, + "learning_rate": 1.921137277411885e-05, + "loss": 0.4792, + "step": 4906 + }, + { + "epoch": 1.9955266368442457, + "grad_norm": 4.981986531623464, + "learning_rate": 1.9210978355626996e-05, + "loss": 0.1259, + "step": 4907 + }, + { + "epoch": 1.9959333062220415, + "grad_norm": 5.876995882906914, + "learning_rate": 1.9210583842579813e-05, + "loss": 0.1408, + "step": 4908 + }, + { + "epoch": 1.9963399755998372, + "grad_norm": 11.053913523905976, + "learning_rate": 1.921018923498135e-05, + "loss": 0.4281, + "step": 4909 + }, + { + "epoch": 1.9967466449776332, + "grad_norm": 13.384278198094087, + "learning_rate": 1.920979453283566e-05, + "loss": 0.5627, + "step": 4910 + }, + { + "epoch": 1.9971533143554292, + "grad_norm": 1.854553841247804, + "learning_rate": 1.9209399736146794e-05, + "loss": 0.034, + "step": 4911 + }, + { + "epoch": 1.997559983733225, + "grad_norm": 8.743336425683205, + "learning_rate": 1.9209004844918807e-05, + "loss": 0.3934, + "step": 4912 + }, + { + "epoch": 1.9979666531110207, + "grad_norm": 9.046550453720752, + "learning_rate": 1.920860985915575e-05, + "loss": 0.2176, + "step": 4913 + }, + { + "epoch": 1.9983733224888165, + "grad_norm": 14.427363585999009, + "learning_rate": 1.9208214778861674e-05, + "loss": 0.7454, + "step": 4914 + }, + { + "epoch": 1.9987799918666125, + "grad_norm": 6.54341817953645, + "learning_rate": 1.9207819604040643e-05, + "loss": 0.1727, + "step": 4915 + }, + { + "epoch": 1.9991866612444082, + "grad_norm": 44.65535432465899, + "learning_rate": 1.9207424334696708e-05, + "loss": 0.68, + "step": 4916 + }, + { + "epoch": 1.9995933306222042, + "grad_norm": 17.19146227832896, + "learning_rate": 1.9207028970833925e-05, + "loss": 0.4894, + "step": 4917 + }, + { + "epoch": 2.0, + "grad_norm": 1.609390237929443, + "learning_rate": 
1.920663351245636e-05, + "loss": 0.0265, + "step": 4918 + }, + { + "epoch": 2.0004066693777958, + "grad_norm": 5.337831676095192, + "learning_rate": 1.9206237959568068e-05, + "loss": 0.0906, + "step": 4919 + }, + { + "epoch": 2.0008133387555915, + "grad_norm": 22.410058687591828, + "learning_rate": 1.920584231217311e-05, + "loss": 1.2509, + "step": 4920 + }, + { + "epoch": 2.0012200081333877, + "grad_norm": 12.36913959710454, + "learning_rate": 1.9205446570275547e-05, + "loss": 0.4261, + "step": 4921 + }, + { + "epoch": 2.0016266775111835, + "grad_norm": 19.487194110859637, + "learning_rate": 1.9205050733879442e-05, + "loss": 0.4775, + "step": 4922 + }, + { + "epoch": 2.0020333468889793, + "grad_norm": 4.312606851680667, + "learning_rate": 1.9204654802988857e-05, + "loss": 0.13, + "step": 4923 + }, + { + "epoch": 2.002440016266775, + "grad_norm": 1.3391244009626484, + "learning_rate": 1.9204258777607857e-05, + "loss": 0.0061, + "step": 4924 + }, + { + "epoch": 2.002846685644571, + "grad_norm": 10.997990272564648, + "learning_rate": 1.920386265774051e-05, + "loss": 0.6185, + "step": 4925 + }, + { + "epoch": 2.003253355022367, + "grad_norm": 6.741146465188283, + "learning_rate": 1.9203466443390882e-05, + "loss": 0.1265, + "step": 4926 + }, + { + "epoch": 2.0036600244001628, + "grad_norm": 13.733417933252941, + "learning_rate": 1.920307013456304e-05, + "loss": 0.8243, + "step": 4927 + }, + { + "epoch": 2.0040666937779585, + "grad_norm": 5.213691814076717, + "learning_rate": 1.9202673731261048e-05, + "loss": 0.1931, + "step": 4928 + }, + { + "epoch": 2.0044733631557543, + "grad_norm": 31.1992702261616, + "learning_rate": 1.920227723348898e-05, + "loss": 0.0342, + "step": 4929 + }, + { + "epoch": 2.00488003253355, + "grad_norm": 1.3589312042137827, + "learning_rate": 1.9201880641250902e-05, + "loss": 0.0169, + "step": 4930 + }, + { + "epoch": 2.0052867019113463, + "grad_norm": 14.993661934716016, + "learning_rate": 1.9201483954550894e-05, + "loss": 0.5653, + "step": 4931 + }, + { + "epoch": 2.005693371289142, + "grad_norm": 6.738903925427236, + "learning_rate": 1.9201087173393017e-05, + "loss": 0.2786, + "step": 4932 + }, + { + "epoch": 2.006100040666938, + "grad_norm": 5.686657747999854, + "learning_rate": 1.920069029778135e-05, + "loss": 0.2053, + "step": 4933 + }, + { + "epoch": 2.0065067100447336, + "grad_norm": 0.9552963485021445, + "learning_rate": 1.920029332771997e-05, + "loss": 0.0064, + "step": 4934 + }, + { + "epoch": 2.0069133794225293, + "grad_norm": 6.626336021237889, + "learning_rate": 1.9199896263212945e-05, + "loss": 0.1274, + "step": 4935 + }, + { + "epoch": 2.0073200488003256, + "grad_norm": 11.155460849511531, + "learning_rate": 1.9199499104264357e-05, + "loss": 0.2995, + "step": 4936 + }, + { + "epoch": 2.0077267181781213, + "grad_norm": 14.41946279344473, + "learning_rate": 1.9199101850878277e-05, + "loss": 1.1068, + "step": 4937 + }, + { + "epoch": 2.008133387555917, + "grad_norm": 5.914546705595868, + "learning_rate": 1.919870450305879e-05, + "loss": 0.1718, + "step": 4938 + }, + { + "epoch": 2.008540056933713, + "grad_norm": 9.279729763990725, + "learning_rate": 1.919830706080997e-05, + "loss": 0.2998, + "step": 4939 + }, + { + "epoch": 2.0089467263115086, + "grad_norm": 12.785585585154731, + "learning_rate": 1.91979095241359e-05, + "loss": 0.6588, + "step": 4940 + }, + { + "epoch": 2.0093533956893044, + "grad_norm": 13.128989238627112, + "learning_rate": 1.919751189304066e-05, + "loss": 0.6808, + "step": 4941 + }, + { + "epoch": 2.0097600650671006, + "grad_norm": 
5.978874304594614, + "learning_rate": 1.919711416752833e-05, + "loss": 0.2613, + "step": 4942 + }, + { + "epoch": 2.0101667344448964, + "grad_norm": 5.526853660231354, + "learning_rate": 1.919671634760299e-05, + "loss": 0.0831, + "step": 4943 + }, + { + "epoch": 2.010573403822692, + "grad_norm": 14.249056643706878, + "learning_rate": 1.9196318433268737e-05, + "loss": 0.4269, + "step": 4944 + }, + { + "epoch": 2.010980073200488, + "grad_norm": 6.260891281737062, + "learning_rate": 1.9195920424529642e-05, + "loss": 0.1299, + "step": 4945 + }, + { + "epoch": 2.0113867425782836, + "grad_norm": 13.039788635632926, + "learning_rate": 1.9195522321389795e-05, + "loss": 0.3149, + "step": 4946 + }, + { + "epoch": 2.01179341195608, + "grad_norm": 8.693421708620363, + "learning_rate": 1.9195124123853284e-05, + "loss": 0.2409, + "step": 4947 + }, + { + "epoch": 2.0122000813338756, + "grad_norm": 7.287564799779439, + "learning_rate": 1.9194725831924196e-05, + "loss": 0.3128, + "step": 4948 + }, + { + "epoch": 2.0126067507116714, + "grad_norm": 11.154961693431916, + "learning_rate": 1.9194327445606623e-05, + "loss": 0.6388, + "step": 4949 + }, + { + "epoch": 2.013013420089467, + "grad_norm": 3.2644804283924205, + "learning_rate": 1.919392896490465e-05, + "loss": 0.0601, + "step": 4950 + }, + { + "epoch": 2.013420089467263, + "grad_norm": 0.6764778866651312, + "learning_rate": 1.9193530389822364e-05, + "loss": 0.013, + "step": 4951 + }, + { + "epoch": 2.013826758845059, + "grad_norm": 13.662320720392412, + "learning_rate": 1.919313172036387e-05, + "loss": 0.2647, + "step": 4952 + }, + { + "epoch": 2.014233428222855, + "grad_norm": 0.6923710476035769, + "learning_rate": 1.919273295653324e-05, + "loss": 0.0105, + "step": 4953 + }, + { + "epoch": 2.0146400976006507, + "grad_norm": 11.113050554584538, + "learning_rate": 1.919233409833459e-05, + "loss": 0.4816, + "step": 4954 + }, + { + "epoch": 2.0150467669784464, + "grad_norm": 12.719731433596564, + "learning_rate": 1.9191935145772e-05, + "loss": 0.4877, + "step": 4955 + }, + { + "epoch": 2.015453436356242, + "grad_norm": 5.9564087002767385, + "learning_rate": 1.919153609884957e-05, + "loss": 0.1749, + "step": 4956 + }, + { + "epoch": 2.0158601057340384, + "grad_norm": 1.546031306319809, + "learning_rate": 1.91911369575714e-05, + "loss": 0.0214, + "step": 4957 + }, + { + "epoch": 2.016266775111834, + "grad_norm": 8.060448229070342, + "learning_rate": 1.9190737721941577e-05, + "loss": 0.4718, + "step": 4958 + }, + { + "epoch": 2.01667344448963, + "grad_norm": 2.2243525418417383, + "learning_rate": 1.9190338391964207e-05, + "loss": 0.0324, + "step": 4959 + }, + { + "epoch": 2.0170801138674257, + "grad_norm": 2.092862509002592, + "learning_rate": 1.918993896764339e-05, + "loss": 0.0285, + "step": 4960 + }, + { + "epoch": 2.0174867832452215, + "grad_norm": 7.0975095915023605, + "learning_rate": 1.9189539448983224e-05, + "loss": 0.216, + "step": 4961 + }, + { + "epoch": 2.0178934526230177, + "grad_norm": 5.598878265874718, + "learning_rate": 1.9189139835987813e-05, + "loss": 0.1259, + "step": 4962 + }, + { + "epoch": 2.0183001220008134, + "grad_norm": 1.4523987580785247, + "learning_rate": 1.9188740128661253e-05, + "loss": 0.0258, + "step": 4963 + }, + { + "epoch": 2.018706791378609, + "grad_norm": 1.8752204864722515, + "learning_rate": 1.918834032700765e-05, + "loss": 0.0278, + "step": 4964 + }, + { + "epoch": 2.019113460756405, + "grad_norm": 8.142253386143285, + "learning_rate": 1.9187940431031112e-05, + "loss": 0.3264, + "step": 4965 + }, + { + "epoch": 
2.0195201301342007, + "grad_norm": 12.052390614319126, + "learning_rate": 1.918754044073574e-05, + "loss": 0.332, + "step": 4966 + }, + { + "epoch": 2.019926799511997, + "grad_norm": 6.054651998374477, + "learning_rate": 1.9187140356125642e-05, + "loss": 0.1552, + "step": 4967 + }, + { + "epoch": 2.0203334688897927, + "grad_norm": 7.1183916992897025, + "learning_rate": 1.9186740177204927e-05, + "loss": 0.2161, + "step": 4968 + }, + { + "epoch": 2.0207401382675885, + "grad_norm": 11.089130842895026, + "learning_rate": 1.9186339903977695e-05, + "loss": 0.5223, + "step": 4969 + }, + { + "epoch": 2.0211468076453842, + "grad_norm": 11.16719403903877, + "learning_rate": 1.9185939536448065e-05, + "loss": 0.2673, + "step": 4970 + }, + { + "epoch": 2.02155347702318, + "grad_norm": 6.597689966556895, + "learning_rate": 1.9185539074620138e-05, + "loss": 0.0996, + "step": 4971 + }, + { + "epoch": 2.021960146400976, + "grad_norm": 16.602424235474423, + "learning_rate": 1.9185138518498033e-05, + "loss": 0.9215, + "step": 4972 + }, + { + "epoch": 2.022366815778772, + "grad_norm": 44.02534090427444, + "learning_rate": 1.9184737868085857e-05, + "loss": 1.0171, + "step": 4973 + }, + { + "epoch": 2.0227734851565677, + "grad_norm": 6.876860889333869, + "learning_rate": 1.9184337123387723e-05, + "loss": 0.1204, + "step": 4974 + }, + { + "epoch": 2.0231801545343635, + "grad_norm": 11.907803487735194, + "learning_rate": 1.918393628440775e-05, + "loss": 0.7828, + "step": 4975 + }, + { + "epoch": 2.0235868239121593, + "grad_norm": 14.926708089828658, + "learning_rate": 1.9183535351150047e-05, + "loss": 0.4236, + "step": 4976 + }, + { + "epoch": 2.0239934932899555, + "grad_norm": 7.645938219708548, + "learning_rate": 1.918313432361873e-05, + "loss": 0.2303, + "step": 4977 + }, + { + "epoch": 2.0244001626677512, + "grad_norm": 4.276751001858594, + "learning_rate": 1.918273320181792e-05, + "loss": 0.1337, + "step": 4978 + }, + { + "epoch": 2.024806832045547, + "grad_norm": 8.534137125626101, + "learning_rate": 1.918233198575173e-05, + "loss": 0.1703, + "step": 4979 + }, + { + "epoch": 2.0252135014233428, + "grad_norm": 8.424798524044457, + "learning_rate": 1.9181930675424284e-05, + "loss": 0.3289, + "step": 4980 + }, + { + "epoch": 2.0256201708011385, + "grad_norm": 6.777948196250545, + "learning_rate": 1.9181529270839695e-05, + "loss": 0.1542, + "step": 4981 + }, + { + "epoch": 2.0260268401789343, + "grad_norm": 185.05966387903817, + "learning_rate": 1.9181127772002087e-05, + "loss": 1.1197, + "step": 4982 + }, + { + "epoch": 2.0264335095567305, + "grad_norm": 2.420982072985098, + "learning_rate": 1.918072617891558e-05, + "loss": 0.0431, + "step": 4983 + }, + { + "epoch": 2.0268401789345263, + "grad_norm": 17.21318563928966, + "learning_rate": 1.9180324491584304e-05, + "loss": 0.5763, + "step": 4984 + }, + { + "epoch": 2.027246848312322, + "grad_norm": 10.405906410960155, + "learning_rate": 1.9179922710012374e-05, + "loss": 0.6265, + "step": 4985 + }, + { + "epoch": 2.027653517690118, + "grad_norm": 3.3383998561729284, + "learning_rate": 1.9179520834203915e-05, + "loss": 0.0431, + "step": 4986 + }, + { + "epoch": 2.0280601870679136, + "grad_norm": 6.894587537526279, + "learning_rate": 1.9179118864163057e-05, + "loss": 0.6858, + "step": 4987 + }, + { + "epoch": 2.02846685644571, + "grad_norm": 9.350326628787114, + "learning_rate": 1.9178716799893924e-05, + "loss": 0.3073, + "step": 4988 + }, + { + "epoch": 2.0288735258235056, + "grad_norm": 24.742883344913416, + "learning_rate": 1.9178314641400643e-05, + "loss": 
0.2546, + "step": 4989 + }, + { + "epoch": 2.0292801952013013, + "grad_norm": 12.123049065324055, + "learning_rate": 1.9177912388687344e-05, + "loss": 0.5625, + "step": 4990 + }, + { + "epoch": 2.029686864579097, + "grad_norm": 0.6289598815969019, + "learning_rate": 1.9177510041758153e-05, + "loss": 0.0053, + "step": 4991 + }, + { + "epoch": 2.030093533956893, + "grad_norm": 104.53111325926454, + "learning_rate": 1.9177107600617204e-05, + "loss": 0.8558, + "step": 4992 + }, + { + "epoch": 2.030500203334689, + "grad_norm": 11.651854621501279, + "learning_rate": 1.9176705065268624e-05, + "loss": 0.8339, + "step": 4993 + }, + { + "epoch": 2.030906872712485, + "grad_norm": 6.9468513666936795, + "learning_rate": 1.9176302435716554e-05, + "loss": 0.2656, + "step": 4994 + }, + { + "epoch": 2.0313135420902806, + "grad_norm": 0.17275008426872532, + "learning_rate": 1.9175899711965115e-05, + "loss": 0.002, + "step": 4995 + }, + { + "epoch": 2.0317202114680764, + "grad_norm": 1.0042259433171379, + "learning_rate": 1.917549689401845e-05, + "loss": 0.0116, + "step": 4996 + }, + { + "epoch": 2.032126880845872, + "grad_norm": 5.648494602527431, + "learning_rate": 1.9175093981880694e-05, + "loss": 0.0839, + "step": 4997 + }, + { + "epoch": 2.0325335502236683, + "grad_norm": 18.487955733689184, + "learning_rate": 1.9174690975555978e-05, + "loss": 0.6281, + "step": 4998 + }, + { + "epoch": 2.032940219601464, + "grad_norm": 2.2805392586706925, + "learning_rate": 1.9174287875048438e-05, + "loss": 0.0491, + "step": 4999 + }, + { + "epoch": 2.03334688897926, + "grad_norm": 3.1513128656175184, + "learning_rate": 1.917388468036222e-05, + "loss": 0.0642, + "step": 5000 + }, + { + "epoch": 2.0337535583570556, + "grad_norm": 15.649315843675637, + "learning_rate": 1.9173481391501457e-05, + "loss": 0.1728, + "step": 5001 + }, + { + "epoch": 2.0341602277348514, + "grad_norm": 1.5278467277626628, + "learning_rate": 1.9173078008470293e-05, + "loss": 0.0163, + "step": 5002 + }, + { + "epoch": 2.0345668971126476, + "grad_norm": 12.59448089971553, + "learning_rate": 1.9172674531272863e-05, + "loss": 0.6645, + "step": 5003 + }, + { + "epoch": 2.0349735664904434, + "grad_norm": 12.671201861796611, + "learning_rate": 1.9172270959913315e-05, + "loss": 0.143, + "step": 5004 + }, + { + "epoch": 2.035380235868239, + "grad_norm": 9.208103436944373, + "learning_rate": 1.9171867294395786e-05, + "loss": 0.1959, + "step": 5005 + }, + { + "epoch": 2.035786905246035, + "grad_norm": 16.58940342226038, + "learning_rate": 1.917146353472443e-05, + "loss": 1.113, + "step": 5006 + }, + { + "epoch": 2.0361935746238307, + "grad_norm": 12.175432696476769, + "learning_rate": 1.917105968090338e-05, + "loss": 0.1412, + "step": 5007 + }, + { + "epoch": 2.036600244001627, + "grad_norm": 12.900523486958445, + "learning_rate": 1.9170655732936785e-05, + "loss": 0.7636, + "step": 5008 + }, + { + "epoch": 2.0370069133794226, + "grad_norm": 66.10374056833761, + "learning_rate": 1.9170251690828797e-05, + "loss": 0.1178, + "step": 5009 + }, + { + "epoch": 2.0374135827572184, + "grad_norm": 13.955274351354102, + "learning_rate": 1.916984755458356e-05, + "loss": 0.3018, + "step": 5010 + }, + { + "epoch": 2.037820252135014, + "grad_norm": 1.2911132966134904, + "learning_rate": 1.916944332420522e-05, + "loss": 0.0183, + "step": 5011 + }, + { + "epoch": 2.03822692151281, + "grad_norm": 7.106796188362845, + "learning_rate": 1.916903899969793e-05, + "loss": 0.111, + "step": 5012 + }, + { + "epoch": 2.038633590890606, + "grad_norm": 40.35485771905773, + 
"learning_rate": 1.9168634581065843e-05, + "loss": 3.1792, + "step": 5013 + }, + { + "epoch": 2.039040260268402, + "grad_norm": 10.643202476333439, + "learning_rate": 1.9168230068313105e-05, + "loss": 0.7934, + "step": 5014 + }, + { + "epoch": 2.0394469296461977, + "grad_norm": 10.241261644893537, + "learning_rate": 1.9167825461443873e-05, + "loss": 0.1353, + "step": 5015 + }, + { + "epoch": 2.0398535990239934, + "grad_norm": 10.179334265882229, + "learning_rate": 1.9167420760462297e-05, + "loss": 0.3904, + "step": 5016 + }, + { + "epoch": 2.040260268401789, + "grad_norm": 43.633642882024574, + "learning_rate": 1.9167015965372533e-05, + "loss": 1.6535, + "step": 5017 + }, + { + "epoch": 2.0406669377795854, + "grad_norm": 13.233809894207461, + "learning_rate": 1.9166611076178735e-05, + "loss": 0.6111, + "step": 5018 + }, + { + "epoch": 2.041073607157381, + "grad_norm": 78.53484936001747, + "learning_rate": 1.9166206092885063e-05, + "loss": 0.4553, + "step": 5019 + }, + { + "epoch": 2.041480276535177, + "grad_norm": 10.825357704381304, + "learning_rate": 1.916580101549567e-05, + "loss": 0.6373, + "step": 5020 + }, + { + "epoch": 2.0418869459129727, + "grad_norm": 17.909084780143242, + "learning_rate": 1.916539584401472e-05, + "loss": 0.8334, + "step": 5021 + }, + { + "epoch": 2.0422936152907685, + "grad_norm": 7.745702833217688, + "learning_rate": 1.916499057844637e-05, + "loss": 0.1691, + "step": 5022 + }, + { + "epoch": 2.0427002846685642, + "grad_norm": 13.991583653779658, + "learning_rate": 1.916458521879477e-05, + "loss": 0.2896, + "step": 5023 + }, + { + "epoch": 2.0431069540463604, + "grad_norm": 4.498055952082034, + "learning_rate": 1.9164179765064096e-05, + "loss": 0.114, + "step": 5024 + }, + { + "epoch": 2.043513623424156, + "grad_norm": 5.1053452701656505, + "learning_rate": 1.9163774217258506e-05, + "loss": 0.0856, + "step": 5025 + }, + { + "epoch": 2.043920292801952, + "grad_norm": 26.528976404652425, + "learning_rate": 1.9163368575382162e-05, + "loss": 0.6275, + "step": 5026 + }, + { + "epoch": 2.0443269621797477, + "grad_norm": 14.195799014756261, + "learning_rate": 1.9162962839439224e-05, + "loss": 0.4024, + "step": 5027 + }, + { + "epoch": 2.0447336315575435, + "grad_norm": 13.848859500932855, + "learning_rate": 1.9162557009433864e-05, + "loss": 0.1472, + "step": 5028 + }, + { + "epoch": 2.0451403009353397, + "grad_norm": 3.762623779478327, + "learning_rate": 1.9162151085370245e-05, + "loss": 0.0716, + "step": 5029 + }, + { + "epoch": 2.0455469703131355, + "grad_norm": 7.989051841911263, + "learning_rate": 1.9161745067252532e-05, + "loss": 0.13, + "step": 5030 + }, + { + "epoch": 2.0459536396909312, + "grad_norm": 7.479600125791086, + "learning_rate": 1.9161338955084895e-05, + "loss": 0.1719, + "step": 5031 + }, + { + "epoch": 2.046360309068727, + "grad_norm": 10.383162534059352, + "learning_rate": 1.9160932748871504e-05, + "loss": 0.3047, + "step": 5032 + }, + { + "epoch": 2.0467669784465228, + "grad_norm": 27.436777588440595, + "learning_rate": 1.9160526448616528e-05, + "loss": 0.2399, + "step": 5033 + }, + { + "epoch": 2.047173647824319, + "grad_norm": 11.53021794591167, + "learning_rate": 1.9160120054324137e-05, + "loss": 0.3378, + "step": 5034 + }, + { + "epoch": 2.0475803172021148, + "grad_norm": 0.7977055358443123, + "learning_rate": 1.9159713565998502e-05, + "loss": 0.0147, + "step": 5035 + }, + { + "epoch": 2.0479869865799105, + "grad_norm": 4.824462052437815, + "learning_rate": 1.91593069836438e-05, + "loss": 0.0941, + "step": 5036 + }, + { + "epoch": 
2.0483936559577063, + "grad_norm": 3.1874064431473412, + "learning_rate": 1.9158900307264197e-05, + "loss": 0.0682, + "step": 5037 + }, + { + "epoch": 2.048800325335502, + "grad_norm": 7.942232220322077, + "learning_rate": 1.915849353686388e-05, + "loss": 0.2233, + "step": 5038 + }, + { + "epoch": 2.0492069947132983, + "grad_norm": 10.053145851221872, + "learning_rate": 1.9158086672447013e-05, + "loss": 0.3085, + "step": 5039 + }, + { + "epoch": 2.049613664091094, + "grad_norm": 9.044465226934802, + "learning_rate": 1.9157679714017777e-05, + "loss": 0.3947, + "step": 5040 + }, + { + "epoch": 2.05002033346889, + "grad_norm": 55.88626424538356, + "learning_rate": 1.9157272661580352e-05, + "loss": 0.3891, + "step": 5041 + }, + { + "epoch": 2.0504270028466856, + "grad_norm": 1.0664186605345507, + "learning_rate": 1.915686551513891e-05, + "loss": 0.0219, + "step": 5042 + }, + { + "epoch": 2.0508336722244813, + "grad_norm": 5.655804613013607, + "learning_rate": 1.915645827469764e-05, + "loss": 0.0961, + "step": 5043 + }, + { + "epoch": 2.0512403416022775, + "grad_norm": 3.241479689501849, + "learning_rate": 1.9156050940260718e-05, + "loss": 0.053, + "step": 5044 + }, + { + "epoch": 2.0516470109800733, + "grad_norm": 6.394564219703588, + "learning_rate": 1.915564351183232e-05, + "loss": 0.0799, + "step": 5045 + }, + { + "epoch": 2.052053680357869, + "grad_norm": 16.156111914931877, + "learning_rate": 1.9155235989416636e-05, + "loss": 0.5145, + "step": 5046 + }, + { + "epoch": 2.052460349735665, + "grad_norm": 0.41653290792234504, + "learning_rate": 1.9154828373017848e-05, + "loss": 0.006, + "step": 5047 + }, + { + "epoch": 2.0528670191134606, + "grad_norm": 11.517610402552837, + "learning_rate": 1.9154420662640135e-05, + "loss": 0.3446, + "step": 5048 + }, + { + "epoch": 2.053273688491257, + "grad_norm": 13.372017578238873, + "learning_rate": 1.915401285828769e-05, + "loss": 0.2622, + "step": 5049 + }, + { + "epoch": 2.0536803578690526, + "grad_norm": 8.975129576979038, + "learning_rate": 1.9153604959964696e-05, + "loss": 0.2256, + "step": 5050 + }, + { + "epoch": 2.0540870272468483, + "grad_norm": 17.13325098649017, + "learning_rate": 1.915319696767534e-05, + "loss": 0.5143, + "step": 5051 + }, + { + "epoch": 2.054493696624644, + "grad_norm": 3.5693845469420493, + "learning_rate": 1.915278888142381e-05, + "loss": 0.0416, + "step": 5052 + }, + { + "epoch": 2.05490036600244, + "grad_norm": 7.872368659770252, + "learning_rate": 1.9152380701214296e-05, + "loss": 0.2702, + "step": 5053 + }, + { + "epoch": 2.055307035380236, + "grad_norm": 0.5168019100519695, + "learning_rate": 1.9151972427050985e-05, + "loss": 0.0073, + "step": 5054 + }, + { + "epoch": 2.055713704758032, + "grad_norm": 59.08056371928081, + "learning_rate": 1.9151564058938077e-05, + "loss": 1.1195, + "step": 5055 + }, + { + "epoch": 2.0561203741358276, + "grad_norm": 12.511486629383764, + "learning_rate": 1.9151155596879755e-05, + "loss": 0.3078, + "step": 5056 + }, + { + "epoch": 2.0565270435136234, + "grad_norm": 8.362432755190467, + "learning_rate": 1.915074704088021e-05, + "loss": 0.1492, + "step": 5057 + }, + { + "epoch": 2.056933712891419, + "grad_norm": 3.3065179712586046, + "learning_rate": 1.9150338390943648e-05, + "loss": 0.0345, + "step": 5058 + }, + { + "epoch": 2.0573403822692153, + "grad_norm": 42.37360724021263, + "learning_rate": 1.9149929647074256e-05, + "loss": 1.3914, + "step": 5059 + }, + { + "epoch": 2.057747051647011, + "grad_norm": 13.4658771403968, + "learning_rate": 1.9149520809276228e-05, + "loss": 1.1381, + 
"step": 5060 + }, + { + "epoch": 2.058153721024807, + "grad_norm": 9.233549701111311, + "learning_rate": 1.9149111877553766e-05, + "loss": 0.5625, + "step": 5061 + }, + { + "epoch": 2.0585603904026026, + "grad_norm": 16.761683209314228, + "learning_rate": 1.9148702851911065e-05, + "loss": 0.7612, + "step": 5062 + }, + { + "epoch": 2.0589670597803984, + "grad_norm": 8.928513030206059, + "learning_rate": 1.9148293732352328e-05, + "loss": 0.1667, + "step": 5063 + }, + { + "epoch": 2.059373729158194, + "grad_norm": 16.430393713360814, + "learning_rate": 1.9147884518881748e-05, + "loss": 1.2651, + "step": 5064 + }, + { + "epoch": 2.0597803985359904, + "grad_norm": 3.449253233367523, + "learning_rate": 1.914747521150353e-05, + "loss": 0.0766, + "step": 5065 + }, + { + "epoch": 2.060187067913786, + "grad_norm": 2.512170081844657, + "learning_rate": 1.9147065810221877e-05, + "loss": 0.0423, + "step": 5066 + }, + { + "epoch": 2.060593737291582, + "grad_norm": 43.28001714665096, + "learning_rate": 1.9146656315040986e-05, + "loss": 1.0385, + "step": 5067 + }, + { + "epoch": 2.0610004066693777, + "grad_norm": 1.9188938408594833, + "learning_rate": 1.914624672596507e-05, + "loss": 0.0352, + "step": 5068 + }, + { + "epoch": 2.0614070760471734, + "grad_norm": 41.40617672319946, + "learning_rate": 1.9145837042998323e-05, + "loss": 3.1267, + "step": 5069 + }, + { + "epoch": 2.0618137454249696, + "grad_norm": 12.219676102255342, + "learning_rate": 1.9145427266144957e-05, + "loss": 0.7415, + "step": 5070 + }, + { + "epoch": 2.0622204148027654, + "grad_norm": 7.9844930730778, + "learning_rate": 1.914501739540918e-05, + "loss": 0.1767, + "step": 5071 + }, + { + "epoch": 2.062627084180561, + "grad_norm": 9.798915635110339, + "learning_rate": 1.9144607430795196e-05, + "loss": 0.2079, + "step": 5072 + }, + { + "epoch": 2.063033753558357, + "grad_norm": 32.870501148777606, + "learning_rate": 1.9144197372307214e-05, + "loss": 1.1721, + "step": 5073 + }, + { + "epoch": 2.0634404229361527, + "grad_norm": 6.838190298324997, + "learning_rate": 1.914378721994944e-05, + "loss": 0.1119, + "step": 5074 + }, + { + "epoch": 2.063847092313949, + "grad_norm": 7.099112916889129, + "learning_rate": 1.9143376973726092e-05, + "loss": 0.3015, + "step": 5075 + }, + { + "epoch": 2.0642537616917447, + "grad_norm": 18.18450722652213, + "learning_rate": 1.9142966633641377e-05, + "loss": 0.8235, + "step": 5076 + }, + { + "epoch": 2.0646604310695404, + "grad_norm": 3.271806478905791, + "learning_rate": 1.9142556199699508e-05, + "loss": 0.0516, + "step": 5077 + }, + { + "epoch": 2.065067100447336, + "grad_norm": 7.344573616451001, + "learning_rate": 1.9142145671904702e-05, + "loss": 0.2897, + "step": 5078 + }, + { + "epoch": 2.065473769825132, + "grad_norm": 13.481760476589615, + "learning_rate": 1.9141735050261166e-05, + "loss": 0.3653, + "step": 5079 + }, + { + "epoch": 2.065880439202928, + "grad_norm": 4.767069101093125, + "learning_rate": 1.9141324334773118e-05, + "loss": 0.2357, + "step": 5080 + }, + { + "epoch": 2.066287108580724, + "grad_norm": 16.875644194704826, + "learning_rate": 1.9140913525444773e-05, + "loss": 0.705, + "step": 5081 + }, + { + "epoch": 2.0666937779585197, + "grad_norm": 19.973024477969158, + "learning_rate": 1.9140502622280354e-05, + "loss": 0.9221, + "step": 5082 + }, + { + "epoch": 2.0671004473363155, + "grad_norm": 7.556183028320597, + "learning_rate": 1.9140091625284074e-05, + "loss": 0.1129, + "step": 5083 + }, + { + "epoch": 2.0675071167141112, + "grad_norm": 0.5462693632977286, + "learning_rate": 
1.9139680534460156e-05, + "loss": 0.0124, + "step": 5084 + }, + { + "epoch": 2.0679137860919075, + "grad_norm": 11.988615469365456, + "learning_rate": 1.9139269349812813e-05, + "loss": 0.6638, + "step": 5085 + }, + { + "epoch": 2.0683204554697032, + "grad_norm": 6.519243172672977, + "learning_rate": 1.9138858071346272e-05, + "loss": 0.3346, + "step": 5086 + }, + { + "epoch": 2.068727124847499, + "grad_norm": 9.227655189147344, + "learning_rate": 1.9138446699064754e-05, + "loss": 0.1883, + "step": 5087 + }, + { + "epoch": 2.0691337942252948, + "grad_norm": 11.331502083223695, + "learning_rate": 1.913803523297248e-05, + "loss": 0.4214, + "step": 5088 + }, + { + "epoch": 2.0695404636030905, + "grad_norm": 13.847433327338623, + "learning_rate": 1.9137623673073675e-05, + "loss": 0.4119, + "step": 5089 + }, + { + "epoch": 2.0699471329808867, + "grad_norm": 12.311494120017295, + "learning_rate": 1.9137212019372565e-05, + "loss": 0.8154, + "step": 5090 + }, + { + "epoch": 2.0703538023586825, + "grad_norm": 14.700861987763815, + "learning_rate": 1.9136800271873377e-05, + "loss": 0.3131, + "step": 5091 + }, + { + "epoch": 2.0707604717364783, + "grad_norm": 6.523781246419792, + "learning_rate": 1.9136388430580334e-05, + "loss": 0.1796, + "step": 5092 + }, + { + "epoch": 2.071167141114274, + "grad_norm": 45.5961127293225, + "learning_rate": 1.9135976495497664e-05, + "loss": 0.6617, + "step": 5093 + }, + { + "epoch": 2.07157381049207, + "grad_norm": 9.587339181184406, + "learning_rate": 1.91355644666296e-05, + "loss": 0.2501, + "step": 5094 + }, + { + "epoch": 2.071980479869866, + "grad_norm": 15.276114923776458, + "learning_rate": 1.913515234398037e-05, + "loss": 0.1491, + "step": 5095 + }, + { + "epoch": 2.0723871492476618, + "grad_norm": 40.9242832427251, + "learning_rate": 1.9134740127554198e-05, + "loss": 1.4805, + "step": 5096 + }, + { + "epoch": 2.0727938186254575, + "grad_norm": 9.213377246303613, + "learning_rate": 1.9134327817355326e-05, + "loss": 0.1916, + "step": 5097 + }, + { + "epoch": 2.0732004880032533, + "grad_norm": 8.752904051998842, + "learning_rate": 1.9133915413387976e-05, + "loss": 0.0639, + "step": 5098 + }, + { + "epoch": 2.073607157381049, + "grad_norm": 8.905009522314778, + "learning_rate": 1.9133502915656393e-05, + "loss": 0.1349, + "step": 5099 + }, + { + "epoch": 2.0740138267588453, + "grad_norm": 8.325971949560909, + "learning_rate": 1.9133090324164805e-05, + "loss": 0.7049, + "step": 5100 + }, + { + "epoch": 2.074420496136641, + "grad_norm": 13.172217004324889, + "learning_rate": 1.913267763891745e-05, + "loss": 0.7385, + "step": 5101 + }, + { + "epoch": 2.074827165514437, + "grad_norm": 6.946988175346985, + "learning_rate": 1.9132264859918556e-05, + "loss": 0.3305, + "step": 5102 + }, + { + "epoch": 2.0752338348922326, + "grad_norm": 3.098281841996588, + "learning_rate": 1.9131851987172375e-05, + "loss": 0.0411, + "step": 5103 + }, + { + "epoch": 2.0756405042700283, + "grad_norm": 6.869189586734293, + "learning_rate": 1.9131439020683133e-05, + "loss": 0.3086, + "step": 5104 + }, + { + "epoch": 2.076047173647824, + "grad_norm": 9.807698106866365, + "learning_rate": 1.9131025960455072e-05, + "loss": 0.7968, + "step": 5105 + }, + { + "epoch": 2.0764538430256203, + "grad_norm": 1.7202658388028562, + "learning_rate": 1.9130612806492438e-05, + "loss": 0.019, + "step": 5106 + }, + { + "epoch": 2.076860512403416, + "grad_norm": 1.5825616960767515, + "learning_rate": 1.9130199558799466e-05, + "loss": 0.0226, + "step": 5107 + }, + { + "epoch": 2.077267181781212, + "grad_norm": 
0.7547765563786197, + "learning_rate": 1.91297862173804e-05, + "loss": 0.0127, + "step": 5108 + }, + { + "epoch": 2.0776738511590076, + "grad_norm": 11.130151186789506, + "learning_rate": 1.9129372782239484e-05, + "loss": 0.4286, + "step": 5109 + }, + { + "epoch": 2.0780805205368034, + "grad_norm": 17.692600085305685, + "learning_rate": 1.9128959253380962e-05, + "loss": 0.8227, + "step": 5110 + }, + { + "epoch": 2.0784871899145996, + "grad_norm": 9.039516329500053, + "learning_rate": 1.912854563080908e-05, + "loss": 0.3204, + "step": 5111 + }, + { + "epoch": 2.0788938592923953, + "grad_norm": 6.823201511981679, + "learning_rate": 1.9128131914528082e-05, + "loss": 0.1202, + "step": 5112 + }, + { + "epoch": 2.079300528670191, + "grad_norm": 7.443793215854409, + "learning_rate": 1.9127718104542215e-05, + "loss": 0.2079, + "step": 5113 + }, + { + "epoch": 2.079707198047987, + "grad_norm": 1.3403649668575, + "learning_rate": 1.9127304200855727e-05, + "loss": 0.02, + "step": 5114 + }, + { + "epoch": 2.0801138674257826, + "grad_norm": 2.081866893881276, + "learning_rate": 1.912689020347287e-05, + "loss": 0.0469, + "step": 5115 + }, + { + "epoch": 2.080520536803579, + "grad_norm": 0.15843615099515587, + "learning_rate": 1.912647611239789e-05, + "loss": 0.0024, + "step": 5116 + }, + { + "epoch": 2.0809272061813746, + "grad_norm": 6.983872086962783, + "learning_rate": 1.912606192763504e-05, + "loss": 0.2513, + "step": 5117 + }, + { + "epoch": 2.0813338755591704, + "grad_norm": 4.521882716239474, + "learning_rate": 1.912564764918857e-05, + "loss": 0.3072, + "step": 5118 + }, + { + "epoch": 2.081740544936966, + "grad_norm": 10.267778485702532, + "learning_rate": 1.9125233277062737e-05, + "loss": 0.2164, + "step": 5119 + }, + { + "epoch": 2.082147214314762, + "grad_norm": 6.185910824858558, + "learning_rate": 1.912481881126179e-05, + "loss": 0.1515, + "step": 5120 + }, + { + "epoch": 2.082553883692558, + "grad_norm": 9.14320643534213, + "learning_rate": 1.9124404251789987e-05, + "loss": 0.2858, + "step": 5121 + }, + { + "epoch": 2.082960553070354, + "grad_norm": 12.110662243166862, + "learning_rate": 1.912398959865158e-05, + "loss": 0.2985, + "step": 5122 + }, + { + "epoch": 2.0833672224481496, + "grad_norm": 7.298166310270581, + "learning_rate": 1.9123574851850826e-05, + "loss": 0.1621, + "step": 5123 + }, + { + "epoch": 2.0837738918259454, + "grad_norm": 13.682906526231745, + "learning_rate": 1.9123160011391987e-05, + "loss": 0.2881, + "step": 5124 + }, + { + "epoch": 2.084180561203741, + "grad_norm": 6.11394813391883, + "learning_rate": 1.9122745077279317e-05, + "loss": 0.1014, + "step": 5125 + }, + { + "epoch": 2.0845872305815374, + "grad_norm": 15.551026478613489, + "learning_rate": 1.9122330049517082e-05, + "loss": 1.122, + "step": 5126 + }, + { + "epoch": 2.084993899959333, + "grad_norm": 3.1846849593216713, + "learning_rate": 1.9121914928109534e-05, + "loss": 0.0593, + "step": 5127 + }, + { + "epoch": 2.085400569337129, + "grad_norm": 7.4078436202670845, + "learning_rate": 1.9121499713060938e-05, + "loss": 0.0883, + "step": 5128 + }, + { + "epoch": 2.0858072387149247, + "grad_norm": 10.852875857317796, + "learning_rate": 1.9121084404375557e-05, + "loss": 0.6414, + "step": 5129 + }, + { + "epoch": 2.0862139080927204, + "grad_norm": 16.129485321898017, + "learning_rate": 1.9120669002057653e-05, + "loss": 0.7276, + "step": 5130 + }, + { + "epoch": 2.0866205774705167, + "grad_norm": 12.644449491386785, + "learning_rate": 1.9120253506111495e-05, + "loss": 0.7728, + "step": 5131 + }, + { + "epoch": 
2.0870272468483124, + "grad_norm": 1.9248606593694897, + "learning_rate": 1.9119837916541342e-05, + "loss": 0.0313, + "step": 5132 + }, + { + "epoch": 2.087433916226108, + "grad_norm": 8.852704318198418, + "learning_rate": 1.9119422233351467e-05, + "loss": 0.7387, + "step": 5133 + }, + { + "epoch": 2.087840585603904, + "grad_norm": 8.271973822996502, + "learning_rate": 1.9119006456546128e-05, + "loss": 0.2052, + "step": 5134 + }, + { + "epoch": 2.0882472549816997, + "grad_norm": 5.750893940617928, + "learning_rate": 1.91185905861296e-05, + "loss": 0.1108, + "step": 5135 + }, + { + "epoch": 2.088653924359496, + "grad_norm": 5.844615097564513, + "learning_rate": 1.911817462210615e-05, + "loss": 0.2225, + "step": 5136 + }, + { + "epoch": 2.0890605937372917, + "grad_norm": 9.934335773908963, + "learning_rate": 1.9117758564480053e-05, + "loss": 0.2072, + "step": 5137 + }, + { + "epoch": 2.0894672631150875, + "grad_norm": 13.987232781550116, + "learning_rate": 1.9117342413255566e-05, + "loss": 0.566, + "step": 5138 + }, + { + "epoch": 2.0898739324928832, + "grad_norm": 5.203023340460483, + "learning_rate": 1.9116926168436978e-05, + "loss": 0.0952, + "step": 5139 + }, + { + "epoch": 2.090280601870679, + "grad_norm": 12.991258075543712, + "learning_rate": 1.911650983002855e-05, + "loss": 0.4273, + "step": 5140 + }, + { + "epoch": 2.090687271248475, + "grad_norm": 13.791149660903477, + "learning_rate": 1.9116093398034564e-05, + "loss": 0.4161, + "step": 5141 + }, + { + "epoch": 2.091093940626271, + "grad_norm": 5.0944125736987615, + "learning_rate": 1.911567687245929e-05, + "loss": 0.3127, + "step": 5142 + }, + { + "epoch": 2.0915006100040667, + "grad_norm": 4.377572811051124, + "learning_rate": 1.9115260253307006e-05, + "loss": 0.0682, + "step": 5143 + }, + { + "epoch": 2.0919072793818625, + "grad_norm": 7.886360303994383, + "learning_rate": 1.9114843540581986e-05, + "loss": 0.3766, + "step": 5144 + }, + { + "epoch": 2.0923139487596583, + "grad_norm": 12.204481959929693, + "learning_rate": 1.911442673428851e-05, + "loss": 0.6736, + "step": 5145 + }, + { + "epoch": 2.092720618137454, + "grad_norm": 1.2788475825104626, + "learning_rate": 1.9114009834430857e-05, + "loss": 0.0255, + "step": 5146 + }, + { + "epoch": 2.0931272875152502, + "grad_norm": 6.283773494903541, + "learning_rate": 1.9113592841013305e-05, + "loss": 0.2842, + "step": 5147 + }, + { + "epoch": 2.093533956893046, + "grad_norm": 15.072687181339113, + "learning_rate": 1.9113175754040138e-05, + "loss": 0.5594, + "step": 5148 + }, + { + "epoch": 2.0939406262708418, + "grad_norm": 8.595042909335957, + "learning_rate": 1.911275857351563e-05, + "loss": 0.1953, + "step": 5149 + }, + { + "epoch": 2.0943472956486375, + "grad_norm": 6.996793885420908, + "learning_rate": 1.9112341299444076e-05, + "loss": 0.1537, + "step": 5150 + }, + { + "epoch": 2.0947539650264333, + "grad_norm": 13.278178995267982, + "learning_rate": 1.9111923931829746e-05, + "loss": 0.431, + "step": 5151 + }, + { + "epoch": 2.0951606344042295, + "grad_norm": 5.443873277558214, + "learning_rate": 1.9111506470676935e-05, + "loss": 0.1971, + "step": 5152 + }, + { + "epoch": 2.0955673037820253, + "grad_norm": 8.174442130245856, + "learning_rate": 1.9111088915989924e-05, + "loss": 0.4288, + "step": 5153 + }, + { + "epoch": 2.095973973159821, + "grad_norm": 0.32653940947893334, + "learning_rate": 1.9110671267773e-05, + "loss": 0.0047, + "step": 5154 + }, + { + "epoch": 2.096380642537617, + "grad_norm": 7.077062787730868, + "learning_rate": 1.911025352603045e-05, + "loss": 0.2503, + 
"step": 5155 + }, + { + "epoch": 2.0967873119154126, + "grad_norm": 5.439755998121419, + "learning_rate": 1.910983569076656e-05, + "loss": 0.2128, + "step": 5156 + }, + { + "epoch": 2.0971939812932088, + "grad_norm": 12.478531265084793, + "learning_rate": 1.9109417761985622e-05, + "loss": 0.2966, + "step": 5157 + }, + { + "epoch": 2.0976006506710045, + "grad_norm": 9.150971052397823, + "learning_rate": 1.9108999739691928e-05, + "loss": 0.2081, + "step": 5158 + }, + { + "epoch": 2.0980073200488003, + "grad_norm": 0.6221829282272373, + "learning_rate": 1.9108581623889768e-05, + "loss": 0.0123, + "step": 5159 + }, + { + "epoch": 2.098413989426596, + "grad_norm": 4.216360922399921, + "learning_rate": 1.910816341458343e-05, + "loss": 0.0639, + "step": 5160 + }, + { + "epoch": 2.098820658804392, + "grad_norm": 14.047186370550294, + "learning_rate": 1.9107745111777215e-05, + "loss": 0.1654, + "step": 5161 + }, + { + "epoch": 2.099227328182188, + "grad_norm": 17.706790407412353, + "learning_rate": 1.910732671547541e-05, + "loss": 0.4584, + "step": 5162 + }, + { + "epoch": 2.099633997559984, + "grad_norm": 12.10368557620309, + "learning_rate": 1.9106908225682313e-05, + "loss": 0.2774, + "step": 5163 + }, + { + "epoch": 2.1000406669377796, + "grad_norm": 12.921216908179382, + "learning_rate": 1.910648964240222e-05, + "loss": 0.6008, + "step": 5164 + }, + { + "epoch": 2.1004473363155753, + "grad_norm": 8.656461448606626, + "learning_rate": 1.9106070965639425e-05, + "loss": 0.2924, + "step": 5165 + }, + { + "epoch": 2.100854005693371, + "grad_norm": 4.712358723533816, + "learning_rate": 1.9105652195398234e-05, + "loss": 0.1213, + "step": 5166 + }, + { + "epoch": 2.1012606750711673, + "grad_norm": 14.581907889035694, + "learning_rate": 1.9105233331682935e-05, + "loss": 0.5774, + "step": 5167 + }, + { + "epoch": 2.101667344448963, + "grad_norm": 11.867418889331958, + "learning_rate": 1.9104814374497835e-05, + "loss": 0.6513, + "step": 5168 + }, + { + "epoch": 2.102074013826759, + "grad_norm": 36.53829905656678, + "learning_rate": 1.9104395323847234e-05, + "loss": 0.6101, + "step": 5169 + }, + { + "epoch": 2.1024806832045546, + "grad_norm": 18.67338412861909, + "learning_rate": 1.9103976179735434e-05, + "loss": 0.6643, + "step": 5170 + }, + { + "epoch": 2.1028873525823504, + "grad_norm": 0.11937113331812516, + "learning_rate": 1.9103556942166732e-05, + "loss": 0.0018, + "step": 5171 + }, + { + "epoch": 2.1032940219601466, + "grad_norm": 13.30475672408736, + "learning_rate": 1.910313761114544e-05, + "loss": 0.2077, + "step": 5172 + }, + { + "epoch": 2.1037006913379424, + "grad_norm": 15.358956251319006, + "learning_rate": 1.910271818667586e-05, + "loss": 0.7997, + "step": 5173 + }, + { + "epoch": 2.104107360715738, + "grad_norm": 6.360782785482582, + "learning_rate": 1.9102298668762293e-05, + "loss": 0.1319, + "step": 5174 + }, + { + "epoch": 2.104514030093534, + "grad_norm": 18.828318709152608, + "learning_rate": 1.910187905740905e-05, + "loss": 1.1027, + "step": 5175 + }, + { + "epoch": 2.1049206994713296, + "grad_norm": 12.85236814170908, + "learning_rate": 1.910145935262044e-05, + "loss": 0.638, + "step": 5176 + }, + { + "epoch": 2.105327368849126, + "grad_norm": 0.5077754597506379, + "learning_rate": 1.9101039554400766e-05, + "loss": 0.008, + "step": 5177 + }, + { + "epoch": 2.1057340382269216, + "grad_norm": 10.454848613677111, + "learning_rate": 1.9100619662754343e-05, + "loss": 0.371, + "step": 5178 + }, + { + "epoch": 2.1061407076047174, + "grad_norm": 14.185020862964592, + "learning_rate": 
1.9100199677685475e-05, + "loss": 1.2431, + "step": 5179 + }, + { + "epoch": 2.106547376982513, + "grad_norm": 17.80301878896033, + "learning_rate": 1.9099779599198482e-05, + "loss": 1.3356, + "step": 5180 + }, + { + "epoch": 2.106954046360309, + "grad_norm": 0.7269215875961047, + "learning_rate": 1.909935942729767e-05, + "loss": 0.0139, + "step": 5181 + }, + { + "epoch": 2.107360715738105, + "grad_norm": 10.308011382189788, + "learning_rate": 1.909893916198735e-05, + "loss": 0.2849, + "step": 5182 + }, + { + "epoch": 2.107767385115901, + "grad_norm": 9.11732125153534, + "learning_rate": 1.9098518803271843e-05, + "loss": 0.4297, + "step": 5183 + }, + { + "epoch": 2.1081740544936967, + "grad_norm": 11.871813882145213, + "learning_rate": 1.9098098351155463e-05, + "loss": 0.4011, + "step": 5184 + }, + { + "epoch": 2.1085807238714924, + "grad_norm": 3.031839975157725, + "learning_rate": 1.9097677805642525e-05, + "loss": 0.0425, + "step": 5185 + }, + { + "epoch": 2.108987393249288, + "grad_norm": 9.28578360177045, + "learning_rate": 1.909725716673734e-05, + "loss": 0.1847, + "step": 5186 + }, + { + "epoch": 2.109394062627084, + "grad_norm": 11.15636011912123, + "learning_rate": 1.909683643444424e-05, + "loss": 0.3712, + "step": 5187 + }, + { + "epoch": 2.10980073200488, + "grad_norm": 17.39810403753523, + "learning_rate": 1.909641560876753e-05, + "loss": 0.968, + "step": 5188 + }, + { + "epoch": 2.110207401382676, + "grad_norm": 13.910847290793757, + "learning_rate": 1.9095994689711538e-05, + "loss": 0.2086, + "step": 5189 + }, + { + "epoch": 2.1106140707604717, + "grad_norm": 9.779120212617832, + "learning_rate": 1.909557367728058e-05, + "loss": 0.3685, + "step": 5190 + }, + { + "epoch": 2.1110207401382675, + "grad_norm": 13.623649569001213, + "learning_rate": 1.9095152571478982e-05, + "loss": 0.3612, + "step": 5191 + }, + { + "epoch": 2.1114274095160637, + "grad_norm": 6.538049145332214, + "learning_rate": 1.9094731372311065e-05, + "loss": 0.1504, + "step": 5192 + }, + { + "epoch": 2.1118340788938594, + "grad_norm": 30.44009469475676, + "learning_rate": 1.9094310079781154e-05, + "loss": 0.9705, + "step": 5193 + }, + { + "epoch": 2.112240748271655, + "grad_norm": 3.08353524775953, + "learning_rate": 1.9093888693893572e-05, + "loss": 0.1467, + "step": 5194 + }, + { + "epoch": 2.112647417649451, + "grad_norm": 2.5588302616513023, + "learning_rate": 1.9093467214652646e-05, + "loss": 0.0322, + "step": 5195 + }, + { + "epoch": 2.1130540870272467, + "grad_norm": 4.661099555062556, + "learning_rate": 1.9093045642062702e-05, + "loss": 0.085, + "step": 5196 + }, + { + "epoch": 2.1134607564050425, + "grad_norm": 4.576210360308225, + "learning_rate": 1.9092623976128067e-05, + "loss": 0.1068, + "step": 5197 + }, + { + "epoch": 2.1138674257828387, + "grad_norm": 9.135638465679932, + "learning_rate": 1.9092202216853074e-05, + "loss": 0.2005, + "step": 5198 + }, + { + "epoch": 2.1142740951606345, + "grad_norm": 12.691728912459904, + "learning_rate": 1.9091780364242048e-05, + "loss": 0.3908, + "step": 5199 + }, + { + "epoch": 2.1146807645384302, + "grad_norm": 2.0070889133322822, + "learning_rate": 1.909135841829932e-05, + "loss": 0.0361, + "step": 5200 + }, + { + "epoch": 2.115087433916226, + "grad_norm": 6.832126439046186, + "learning_rate": 1.909093637902922e-05, + "loss": 0.1554, + "step": 5201 + }, + { + "epoch": 2.1154941032940218, + "grad_norm": 16.751657095627706, + "learning_rate": 1.9090514246436085e-05, + "loss": 0.9425, + "step": 5202 + }, + { + "epoch": 2.115900772671818, + "grad_norm": 
9.758708967903518, + "learning_rate": 1.909009202052425e-05, + "loss": 0.2995, + "step": 5203 + }, + { + "epoch": 2.1163074420496137, + "grad_norm": 4.505250087364215, + "learning_rate": 1.9089669701298042e-05, + "loss": 0.0613, + "step": 5204 + }, + { + "epoch": 2.1167141114274095, + "grad_norm": 9.767373280292764, + "learning_rate": 1.9089247288761797e-05, + "loss": 0.629, + "step": 5205 + }, + { + "epoch": 2.1171207808052053, + "grad_norm": 6.836490312362595, + "learning_rate": 1.9088824782919856e-05, + "loss": 0.1623, + "step": 5206 + }, + { + "epoch": 2.117527450183001, + "grad_norm": 1.4098132910354306, + "learning_rate": 1.9088402183776555e-05, + "loss": 0.0272, + "step": 5207 + }, + { + "epoch": 2.1179341195607972, + "grad_norm": 5.584397042787709, + "learning_rate": 1.9087979491336233e-05, + "loss": 0.1394, + "step": 5208 + }, + { + "epoch": 2.118340788938593, + "grad_norm": 0.27370283071808627, + "learning_rate": 1.9087556705603225e-05, + "loss": 0.0053, + "step": 5209 + }, + { + "epoch": 2.1187474583163888, + "grad_norm": 5.989223259491538, + "learning_rate": 1.9087133826581875e-05, + "loss": 0.2675, + "step": 5210 + }, + { + "epoch": 2.1191541276941845, + "grad_norm": 1.6735654058606007, + "learning_rate": 1.908671085427652e-05, + "loss": 0.0264, + "step": 5211 + }, + { + "epoch": 2.1195607970719803, + "grad_norm": 7.64677435811049, + "learning_rate": 1.9086287788691508e-05, + "loss": 0.2094, + "step": 5212 + }, + { + "epoch": 2.1199674664497765, + "grad_norm": 11.43995503630712, + "learning_rate": 1.908586462983118e-05, + "loss": 0.5901, + "step": 5213 + }, + { + "epoch": 2.1203741358275723, + "grad_norm": 18.597654838217426, + "learning_rate": 1.9085441377699876e-05, + "loss": 1.531, + "step": 5214 + }, + { + "epoch": 2.120780805205368, + "grad_norm": 1.200595990517947, + "learning_rate": 1.9085018032301944e-05, + "loss": 0.0365, + "step": 5215 + }, + { + "epoch": 2.121187474583164, + "grad_norm": 35.535794023751585, + "learning_rate": 1.9084594593641734e-05, + "loss": 0.5233, + "step": 5216 + }, + { + "epoch": 2.1215941439609596, + "grad_norm": 9.01909520866383, + "learning_rate": 1.9084171061723582e-05, + "loss": 0.3038, + "step": 5217 + }, + { + "epoch": 2.122000813338756, + "grad_norm": 2.9360907525929187, + "learning_rate": 1.9083747436551843e-05, + "loss": 0.0389, + "step": 5218 + }, + { + "epoch": 2.1224074827165516, + "grad_norm": 15.385494918313242, + "learning_rate": 1.9083323718130867e-05, + "loss": 0.369, + "step": 5219 + }, + { + "epoch": 2.1228141520943473, + "grad_norm": 16.469996203407344, + "learning_rate": 1.9082899906465e-05, + "loss": 0.5481, + "step": 5220 + }, + { + "epoch": 2.123220821472143, + "grad_norm": 8.24906435328107, + "learning_rate": 1.9082476001558595e-05, + "loss": 0.1528, + "step": 5221 + }, + { + "epoch": 2.123627490849939, + "grad_norm": 8.958674084868397, + "learning_rate": 1.9082052003416006e-05, + "loss": 0.4817, + "step": 5222 + }, + { + "epoch": 2.124034160227735, + "grad_norm": 10.338849901699426, + "learning_rate": 1.908162791204158e-05, + "loss": 0.2645, + "step": 5223 + }, + { + "epoch": 2.124440829605531, + "grad_norm": 7.470320142437942, + "learning_rate": 1.908120372743967e-05, + "loss": 0.2897, + "step": 5224 + }, + { + "epoch": 2.1248474989833266, + "grad_norm": 12.441257122402138, + "learning_rate": 1.9080779449614634e-05, + "loss": 0.4462, + "step": 5225 + }, + { + "epoch": 2.1252541683611224, + "grad_norm": 9.454524320462939, + "learning_rate": 1.908035507857083e-05, + "loss": 0.318, + "step": 5226 + }, + { + "epoch": 
2.125660837738918, + "grad_norm": 10.915811382031038, + "learning_rate": 1.9079930614312605e-05, + "loss": 0.253, + "step": 5227 + }, + { + "epoch": 2.126067507116714, + "grad_norm": 8.35434165737601, + "learning_rate": 1.9079506056844326e-05, + "loss": 0.254, + "step": 5228 + }, + { + "epoch": 2.12647417649451, + "grad_norm": 11.790954324292178, + "learning_rate": 1.9079081406170346e-05, + "loss": 0.3523, + "step": 5229 + }, + { + "epoch": 2.126880845872306, + "grad_norm": 14.451623612645829, + "learning_rate": 1.907865666229503e-05, + "loss": 1.0158, + "step": 5230 + }, + { + "epoch": 2.1272875152501016, + "grad_norm": 11.720931002714398, + "learning_rate": 1.907823182522273e-05, + "loss": 0.3399, + "step": 5231 + }, + { + "epoch": 2.1276941846278974, + "grad_norm": 9.709370026754643, + "learning_rate": 1.9077806894957816e-05, + "loss": 0.4857, + "step": 5232 + }, + { + "epoch": 2.1281008540056936, + "grad_norm": 12.670668985332618, + "learning_rate": 1.9077381871504643e-05, + "loss": 0.4727, + "step": 5233 + }, + { + "epoch": 2.1285075233834894, + "grad_norm": 4.517755855970482, + "learning_rate": 1.9076956754867574e-05, + "loss": 0.0877, + "step": 5234 + }, + { + "epoch": 2.128914192761285, + "grad_norm": 10.895792011956287, + "learning_rate": 1.907653154505098e-05, + "loss": 0.3929, + "step": 5235 + }, + { + "epoch": 2.129320862139081, + "grad_norm": 4.782702189658046, + "learning_rate": 1.907610624205922e-05, + "loss": 0.1625, + "step": 5236 + }, + { + "epoch": 2.1297275315168767, + "grad_norm": 5.121162645174695, + "learning_rate": 1.907568084589666e-05, + "loss": 0.0944, + "step": 5237 + }, + { + "epoch": 2.1301342008946724, + "grad_norm": 10.865398159293528, + "learning_rate": 1.9075255356567667e-05, + "loss": 0.2827, + "step": 5238 + }, + { + "epoch": 2.1305408702724686, + "grad_norm": 7.17146186915208, + "learning_rate": 1.9074829774076616e-05, + "loss": 0.1574, + "step": 5239 + }, + { + "epoch": 2.1309475396502644, + "grad_norm": 9.293442250150676, + "learning_rate": 1.9074404098427867e-05, + "loss": 0.4057, + "step": 5240 + }, + { + "epoch": 2.13135420902806, + "grad_norm": 8.984150972890381, + "learning_rate": 1.907397832962579e-05, + "loss": 0.2326, + "step": 5241 + }, + { + "epoch": 2.131760878405856, + "grad_norm": 6.241634655185229, + "learning_rate": 1.9073552467674763e-05, + "loss": 0.1349, + "step": 5242 + }, + { + "epoch": 2.1321675477836517, + "grad_norm": 3.1269547379548825, + "learning_rate": 1.907312651257915e-05, + "loss": 0.0505, + "step": 5243 + }, + { + "epoch": 2.132574217161448, + "grad_norm": 6.650071428287758, + "learning_rate": 1.907270046434333e-05, + "loss": 0.0794, + "step": 5244 + }, + { + "epoch": 2.1329808865392437, + "grad_norm": 123.19860176014902, + "learning_rate": 1.907227432297167e-05, + "loss": 0.642, + "step": 5245 + }, + { + "epoch": 2.1333875559170394, + "grad_norm": 4.516763017964733, + "learning_rate": 1.9071848088468553e-05, + "loss": 0.1607, + "step": 5246 + }, + { + "epoch": 2.133794225294835, + "grad_norm": 0.4950244874474556, + "learning_rate": 1.9071421760838345e-05, + "loss": 0.007, + "step": 5247 + }, + { + "epoch": 2.134200894672631, + "grad_norm": 9.314403266537328, + "learning_rate": 1.907099534008543e-05, + "loss": 0.3641, + "step": 5248 + }, + { + "epoch": 2.134607564050427, + "grad_norm": 8.091252482273035, + "learning_rate": 1.907056882621418e-05, + "loss": 0.332, + "step": 5249 + }, + { + "epoch": 2.135014233428223, + "grad_norm": 5.0503204774029, + "learning_rate": 1.9070142219228976e-05, + "loss": 0.0899, + "step": 5250 
+ }, + { + "epoch": 2.1354209028060187, + "grad_norm": 4.830744764503592, + "learning_rate": 1.90697155191342e-05, + "loss": 0.0681, + "step": 5251 + }, + { + "epoch": 2.1358275721838145, + "grad_norm": 12.221021997401726, + "learning_rate": 1.9069288725934227e-05, + "loss": 0.561, + "step": 5252 + }, + { + "epoch": 2.1362342415616102, + "grad_norm": 13.81308250445003, + "learning_rate": 1.9068861839633443e-05, + "loss": 1.0072, + "step": 5253 + }, + { + "epoch": 2.1366409109394064, + "grad_norm": 2.965859566638528, + "learning_rate": 1.9068434860236227e-05, + "loss": 0.0332, + "step": 5254 + }, + { + "epoch": 2.137047580317202, + "grad_norm": 9.512839469447876, + "learning_rate": 1.906800778774696e-05, + "loss": 0.1635, + "step": 5255 + }, + { + "epoch": 2.137454249694998, + "grad_norm": 10.390757357668141, + "learning_rate": 1.9067580622170032e-05, + "loss": 0.2742, + "step": 5256 + }, + { + "epoch": 2.1378609190727937, + "grad_norm": 64.20062426637905, + "learning_rate": 1.9067153363509825e-05, + "loss": 1.1346, + "step": 5257 + }, + { + "epoch": 2.1382675884505895, + "grad_norm": 4.898582799044082, + "learning_rate": 1.9066726011770725e-05, + "loss": 0.1194, + "step": 5258 + }, + { + "epoch": 2.1386742578283857, + "grad_norm": 9.554256738390245, + "learning_rate": 1.906629856695712e-05, + "loss": 0.3322, + "step": 5259 + }, + { + "epoch": 2.1390809272061815, + "grad_norm": 12.958402524302915, + "learning_rate": 1.90658710290734e-05, + "loss": 0.9165, + "step": 5260 + }, + { + "epoch": 2.1394875965839772, + "grad_norm": 9.615632480806656, + "learning_rate": 1.9065443398123947e-05, + "loss": 0.1791, + "step": 5261 + }, + { + "epoch": 2.139894265961773, + "grad_norm": 5.206586146593759, + "learning_rate": 1.9065015674113155e-05, + "loss": 0.2512, + "step": 5262 + }, + { + "epoch": 2.1403009353395688, + "grad_norm": 2.8383993265863436, + "learning_rate": 1.9064587857045417e-05, + "loss": 0.0484, + "step": 5263 + }, + { + "epoch": 2.140707604717365, + "grad_norm": 16.43862829040338, + "learning_rate": 1.906415994692512e-05, + "loss": 0.8904, + "step": 5264 + }, + { + "epoch": 2.1411142740951608, + "grad_norm": 1.7949586038624532, + "learning_rate": 1.9063731943756663e-05, + "loss": 0.0239, + "step": 5265 + }, + { + "epoch": 2.1415209434729565, + "grad_norm": 4.411145365798295, + "learning_rate": 1.9063303847544432e-05, + "loss": 0.134, + "step": 5266 + }, + { + "epoch": 2.1419276128507523, + "grad_norm": 3.467056406019244, + "learning_rate": 1.906287565829283e-05, + "loss": 0.0547, + "step": 5267 + }, + { + "epoch": 2.142334282228548, + "grad_norm": 5.01457767538792, + "learning_rate": 1.9062447376006244e-05, + "loss": 0.078, + "step": 5268 + }, + { + "epoch": 2.142740951606344, + "grad_norm": 5.335935719921822, + "learning_rate": 1.9062019000689075e-05, + "loss": 0.1096, + "step": 5269 + }, + { + "epoch": 2.14314762098414, + "grad_norm": 5.180230318184736, + "learning_rate": 1.9061590532345723e-05, + "loss": 0.1251, + "step": 5270 + }, + { + "epoch": 2.143554290361936, + "grad_norm": 7.215215735354357, + "learning_rate": 1.906116197098058e-05, + "loss": 0.1341, + "step": 5271 + }, + { + "epoch": 2.1439609597397316, + "grad_norm": 50.075423559681944, + "learning_rate": 1.906073331659805e-05, + "loss": 0.9019, + "step": 5272 + }, + { + "epoch": 2.1443676291175273, + "grad_norm": 12.711321487250942, + "learning_rate": 1.9060304569202534e-05, + "loss": 0.5115, + "step": 5273 + }, + { + "epoch": 2.1447742984953235, + "grad_norm": 7.172074830293343, + "learning_rate": 1.905987572879843e-05, + 
"loss": 0.2698, + "step": 5274 + }, + { + "epoch": 2.1451809678731193, + "grad_norm": 14.733069860827783, + "learning_rate": 1.9059446795390143e-05, + "loss": 0.337, + "step": 5275 + }, + { + "epoch": 2.145587637250915, + "grad_norm": 17.323373491811186, + "learning_rate": 1.9059017768982075e-05, + "loss": 0.8142, + "step": 5276 + }, + { + "epoch": 2.145994306628711, + "grad_norm": 12.296143234004482, + "learning_rate": 1.905858864957863e-05, + "loss": 0.4352, + "step": 5277 + }, + { + "epoch": 2.1464009760065066, + "grad_norm": 5.2402382326831685, + "learning_rate": 1.9058159437184215e-05, + "loss": 0.1804, + "step": 5278 + }, + { + "epoch": 2.1468076453843024, + "grad_norm": 29.91575804552989, + "learning_rate": 1.905773013180323e-05, + "loss": 0.7871, + "step": 5279 + }, + { + "epoch": 2.1472143147620986, + "grad_norm": 2.159912923831018, + "learning_rate": 1.905730073344009e-05, + "loss": 0.0375, + "step": 5280 + }, + { + "epoch": 2.1476209841398943, + "grad_norm": 6.699527506483511, + "learning_rate": 1.90568712420992e-05, + "loss": 0.1514, + "step": 5281 + }, + { + "epoch": 2.14802765351769, + "grad_norm": 10.011602445709014, + "learning_rate": 1.9056441657784965e-05, + "loss": 0.4516, + "step": 5282 + }, + { + "epoch": 2.148434322895486, + "grad_norm": 7.313954380765694, + "learning_rate": 1.9056011980501804e-05, + "loss": 0.218, + "step": 5283 + }, + { + "epoch": 2.1488409922732816, + "grad_norm": 5.393659014627873, + "learning_rate": 1.9055582210254118e-05, + "loss": 0.1172, + "step": 5284 + }, + { + "epoch": 2.149247661651078, + "grad_norm": 2.788192510034659, + "learning_rate": 1.9055152347046324e-05, + "loss": 0.0236, + "step": 5285 + }, + { + "epoch": 2.1496543310288736, + "grad_norm": 11.971248068614925, + "learning_rate": 1.9054722390882834e-05, + "loss": 0.9616, + "step": 5286 + }, + { + "epoch": 2.1500610004066694, + "grad_norm": 6.387431208550323, + "learning_rate": 1.905429234176806e-05, + "loss": 0.2774, + "step": 5287 + }, + { + "epoch": 2.150467669784465, + "grad_norm": 6.958905665659549, + "learning_rate": 1.905386219970642e-05, + "loss": 0.2294, + "step": 5288 + }, + { + "epoch": 2.150874339162261, + "grad_norm": 7.326630450442408, + "learning_rate": 1.9053431964702328e-05, + "loss": 0.1592, + "step": 5289 + }, + { + "epoch": 2.151281008540057, + "grad_norm": 8.490679276325006, + "learning_rate": 1.90530016367602e-05, + "loss": 0.339, + "step": 5290 + }, + { + "epoch": 2.151687677917853, + "grad_norm": 2.8481472627927342, + "learning_rate": 1.9052571215884455e-05, + "loss": 0.0505, + "step": 5291 + }, + { + "epoch": 2.1520943472956486, + "grad_norm": 13.498154423533201, + "learning_rate": 1.9052140702079507e-05, + "loss": 0.68, + "step": 5292 + }, + { + "epoch": 2.1525010166734444, + "grad_norm": 43.47289738685474, + "learning_rate": 1.905171009534978e-05, + "loss": 1.2984, + "step": 5293 + }, + { + "epoch": 2.15290768605124, + "grad_norm": 0.7244395895729788, + "learning_rate": 1.9051279395699696e-05, + "loss": 0.0099, + "step": 5294 + }, + { + "epoch": 2.1533143554290364, + "grad_norm": 7.6834834281921625, + "learning_rate": 1.905084860313367e-05, + "loss": 0.3983, + "step": 5295 + }, + { + "epoch": 2.153721024806832, + "grad_norm": 18.140265607115424, + "learning_rate": 1.905041771765613e-05, + "loss": 0.5065, + "step": 5296 + }, + { + "epoch": 2.154127694184628, + "grad_norm": 19.74146778289654, + "learning_rate": 1.90499867392715e-05, + "loss": 0.6956, + "step": 5297 + }, + { + "epoch": 2.1545343635624237, + "grad_norm": 1.038100312759539, + "learning_rate": 
1.9049555667984197e-05, + "loss": 0.017, + "step": 5298 + }, + { + "epoch": 2.1549410329402194, + "grad_norm": 8.245311616921335, + "learning_rate": 1.9049124503798652e-05, + "loss": 0.23, + "step": 5299 + }, + { + "epoch": 2.1553477023180156, + "grad_norm": 13.783805185082377, + "learning_rate": 1.9048693246719288e-05, + "loss": 0.4653, + "step": 5300 + }, + { + "epoch": 2.1557543716958114, + "grad_norm": 1.3751343057344625, + "learning_rate": 1.9048261896750534e-05, + "loss": 0.0223, + "step": 5301 + }, + { + "epoch": 2.156161041073607, + "grad_norm": 1.1096569705793382, + "learning_rate": 1.904783045389682e-05, + "loss": 0.0117, + "step": 5302 + }, + { + "epoch": 2.156567710451403, + "grad_norm": 9.83793827871525, + "learning_rate": 1.904739891816257e-05, + "loss": 0.5405, + "step": 5303 + }, + { + "epoch": 2.1569743798291987, + "grad_norm": 2.8557994197723864, + "learning_rate": 1.9046967289552218e-05, + "loss": 0.1424, + "step": 5304 + }, + { + "epoch": 2.157381049206995, + "grad_norm": 15.98052061810791, + "learning_rate": 1.9046535568070195e-05, + "loss": 1.0148, + "step": 5305 + }, + { + "epoch": 2.1577877185847907, + "grad_norm": 8.994088452340067, + "learning_rate": 1.904610375372093e-05, + "loss": 0.615, + "step": 5306 + }, + { + "epoch": 2.1581943879625864, + "grad_norm": 8.161762594570545, + "learning_rate": 1.9045671846508854e-05, + "loss": 0.3544, + "step": 5307 + }, + { + "epoch": 2.158601057340382, + "grad_norm": 7.993747155110343, + "learning_rate": 1.904523984643841e-05, + "loss": 0.227, + "step": 5308 + }, + { + "epoch": 2.159007726718178, + "grad_norm": 5.585595929474771, + "learning_rate": 1.9044807753514022e-05, + "loss": 0.1438, + "step": 5309 + }, + { + "epoch": 2.1594143960959737, + "grad_norm": 3.3715492270073315, + "learning_rate": 1.9044375567740135e-05, + "loss": 0.126, + "step": 5310 + }, + { + "epoch": 2.15982106547377, + "grad_norm": 3.6182696799063803, + "learning_rate": 1.9043943289121175e-05, + "loss": 0.0585, + "step": 5311 + }, + { + "epoch": 2.1602277348515657, + "grad_norm": 12.10522355645373, + "learning_rate": 1.904351091766159e-05, + "loss": 0.5879, + "step": 5312 + }, + { + "epoch": 2.1606344042293615, + "grad_norm": 20.274216603135713, + "learning_rate": 1.904307845336581e-05, + "loss": 0.347, + "step": 5313 + }, + { + "epoch": 2.1610410736071572, + "grad_norm": 2.030278802656357, + "learning_rate": 1.904264589623828e-05, + "loss": 0.044, + "step": 5314 + }, + { + "epoch": 2.1614477429849535, + "grad_norm": 13.505920636402909, + "learning_rate": 1.904221324628344e-05, + "loss": 0.9753, + "step": 5315 + }, + { + "epoch": 2.1618544123627492, + "grad_norm": 12.519205871933782, + "learning_rate": 1.904178050350573e-05, + "loss": 0.558, + "step": 5316 + }, + { + "epoch": 2.162261081740545, + "grad_norm": 9.890143897165325, + "learning_rate": 1.9041347667909594e-05, + "loss": 0.3873, + "step": 5317 + }, + { + "epoch": 2.1626677511183408, + "grad_norm": 1.874249863182932, + "learning_rate": 1.904091473949947e-05, + "loss": 0.0321, + "step": 5318 + }, + { + "epoch": 2.1630744204961365, + "grad_norm": 0.24788659232143478, + "learning_rate": 1.904048171827981e-05, + "loss": 0.0026, + "step": 5319 + }, + { + "epoch": 2.1634810898739323, + "grad_norm": 16.694098368313032, + "learning_rate": 1.904004860425505e-05, + "loss": 0.2729, + "step": 5320 + }, + { + "epoch": 2.1638877592517285, + "grad_norm": 3.88241909730022, + "learning_rate": 1.9039615397429645e-05, + "loss": 0.1423, + "step": 5321 + }, + { + "epoch": 2.1642944286295243, + "grad_norm": 
5.308656580730708, + "learning_rate": 1.903918209780804e-05, + "loss": 0.0668, + "step": 5322 + }, + { + "epoch": 2.16470109800732, + "grad_norm": 13.156519278995136, + "learning_rate": 1.9038748705394675e-05, + "loss": 0.4979, + "step": 5323 + }, + { + "epoch": 2.165107767385116, + "grad_norm": 9.635322628550393, + "learning_rate": 1.903831522019401e-05, + "loss": 0.2108, + "step": 5324 + }, + { + "epoch": 2.1655144367629116, + "grad_norm": 8.491707431389369, + "learning_rate": 1.9037881642210488e-05, + "loss": 0.2447, + "step": 5325 + }, + { + "epoch": 2.1659211061407078, + "grad_norm": 4.37724341089532, + "learning_rate": 1.9037447971448565e-05, + "loss": 0.1075, + "step": 5326 + }, + { + "epoch": 2.1663277755185035, + "grad_norm": 118.92585938379531, + "learning_rate": 1.903701420791269e-05, + "loss": 6.2866, + "step": 5327 + }, + { + "epoch": 2.1667344448962993, + "grad_norm": 4.368101631238791, + "learning_rate": 1.9036580351607315e-05, + "loss": 0.1406, + "step": 5328 + }, + { + "epoch": 2.167141114274095, + "grad_norm": 20.476473293051907, + "learning_rate": 1.9036146402536895e-05, + "loss": 0.7974, + "step": 5329 + }, + { + "epoch": 2.167547783651891, + "grad_norm": 4.247972505852971, + "learning_rate": 1.9035712360705885e-05, + "loss": 0.0628, + "step": 5330 + }, + { + "epoch": 2.167954453029687, + "grad_norm": 2.9402836198938473, + "learning_rate": 1.903527822611874e-05, + "loss": 0.0495, + "step": 5331 + }, + { + "epoch": 2.168361122407483, + "grad_norm": 9.934901160620718, + "learning_rate": 1.9034843998779917e-05, + "loss": 0.3126, + "step": 5332 + }, + { + "epoch": 2.1687677917852786, + "grad_norm": 11.873724758893182, + "learning_rate": 1.903440967869387e-05, + "loss": 0.5126, + "step": 5333 + }, + { + "epoch": 2.1691744611630743, + "grad_norm": 3.8964827800787205, + "learning_rate": 1.9033975265865064e-05, + "loss": 0.0682, + "step": 5334 + }, + { + "epoch": 2.16958113054087, + "grad_norm": 7.859080769938974, + "learning_rate": 1.9033540760297952e-05, + "loss": 0.2631, + "step": 5335 + }, + { + "epoch": 2.1699877999186663, + "grad_norm": 1.0651033400621521, + "learning_rate": 1.9033106161997e-05, + "loss": 0.0201, + "step": 5336 + }, + { + "epoch": 2.170394469296462, + "grad_norm": 11.753269614267168, + "learning_rate": 1.903267147096667e-05, + "loss": 0.2369, + "step": 5337 + }, + { + "epoch": 2.170801138674258, + "grad_norm": 18.235431585778283, + "learning_rate": 1.9032236687211415e-05, + "loss": 0.3872, + "step": 5338 + }, + { + "epoch": 2.1712078080520536, + "grad_norm": 9.024688105467218, + "learning_rate": 1.9031801810735707e-05, + "loss": 0.3956, + "step": 5339 + }, + { + "epoch": 2.1716144774298494, + "grad_norm": 16.351025984806842, + "learning_rate": 1.903136684154401e-05, + "loss": 0.5978, + "step": 5340 + }, + { + "epoch": 2.1720211468076456, + "grad_norm": 20.17628363117017, + "learning_rate": 1.9030931779640786e-05, + "loss": 0.3856, + "step": 5341 + }, + { + "epoch": 2.1724278161854413, + "grad_norm": 6.876402237654625, + "learning_rate": 1.90304966250305e-05, + "loss": 0.1396, + "step": 5342 + }, + { + "epoch": 2.172834485563237, + "grad_norm": 0.6108584391825691, + "learning_rate": 1.9030061377717625e-05, + "loss": 0.0106, + "step": 5343 + }, + { + "epoch": 2.173241154941033, + "grad_norm": 14.976933625821118, + "learning_rate": 1.902962603770662e-05, + "loss": 0.6659, + "step": 5344 + }, + { + "epoch": 2.1736478243188286, + "grad_norm": 9.506906449386333, + "learning_rate": 1.9029190605001963e-05, + "loss": 0.3161, + "step": 5345 + }, + { + "epoch": 
2.174054493696625, + "grad_norm": 12.079632585642564, + "learning_rate": 1.902875507960812e-05, + "loss": 0.5622, + "step": 5346 + }, + { + "epoch": 2.1744611630744206, + "grad_norm": 2.3394645896819957, + "learning_rate": 1.902831946152956e-05, + "loss": 0.0388, + "step": 5347 + }, + { + "epoch": 2.1748678324522164, + "grad_norm": 2.110917675709238, + "learning_rate": 1.9027883750770758e-05, + "loss": 0.0202, + "step": 5348 + }, + { + "epoch": 2.175274501830012, + "grad_norm": 31.969867757374804, + "learning_rate": 1.9027447947336185e-05, + "loss": 1.2176, + "step": 5349 + }, + { + "epoch": 2.175681171207808, + "grad_norm": 4.992327411361451, + "learning_rate": 1.9027012051230317e-05, + "loss": 0.1182, + "step": 5350 + }, + { + "epoch": 2.1760878405856037, + "grad_norm": 22.189638217336718, + "learning_rate": 1.9026576062457627e-05, + "loss": 1.4132, + "step": 5351 + }, + { + "epoch": 2.1764945099634, + "grad_norm": 80.20272664889578, + "learning_rate": 1.902613998102259e-05, + "loss": 0.5447, + "step": 5352 + }, + { + "epoch": 2.1769011793411956, + "grad_norm": 7.112146455156151, + "learning_rate": 1.9025703806929682e-05, + "loss": 0.2249, + "step": 5353 + }, + { + "epoch": 2.1773078487189914, + "grad_norm": 12.251898453460651, + "learning_rate": 1.9025267540183385e-05, + "loss": 0.3226, + "step": 5354 + }, + { + "epoch": 2.177714518096787, + "grad_norm": 17.956924596241155, + "learning_rate": 1.9024831180788174e-05, + "loss": 0.7349, + "step": 5355 + }, + { + "epoch": 2.1781211874745834, + "grad_norm": 7.294293181251317, + "learning_rate": 1.9024394728748524e-05, + "loss": 0.2125, + "step": 5356 + }, + { + "epoch": 2.178527856852379, + "grad_norm": 5.291689302377985, + "learning_rate": 1.9023958184068924e-05, + "loss": 0.2165, + "step": 5357 + }, + { + "epoch": 2.178934526230175, + "grad_norm": 1.993765307716113, + "learning_rate": 1.902352154675385e-05, + "loss": 0.0301, + "step": 5358 + }, + { + "epoch": 2.1793411956079707, + "grad_norm": 10.833154895196294, + "learning_rate": 1.902308481680779e-05, + "loss": 0.2927, + "step": 5359 + }, + { + "epoch": 2.1797478649857664, + "grad_norm": 17.466388819408085, + "learning_rate": 1.902264799423522e-05, + "loss": 0.3682, + "step": 5360 + }, + { + "epoch": 2.180154534363562, + "grad_norm": 16.303334597905028, + "learning_rate": 1.9022211079040628e-05, + "loss": 0.4299, + "step": 5361 + }, + { + "epoch": 2.1805612037413584, + "grad_norm": 1.9422090331380806, + "learning_rate": 1.90217740712285e-05, + "loss": 0.043, + "step": 5362 + }, + { + "epoch": 2.180967873119154, + "grad_norm": 6.878649257133412, + "learning_rate": 1.9021336970803315e-05, + "loss": 0.2643, + "step": 5363 + }, + { + "epoch": 2.18137454249695, + "grad_norm": 8.823473740016137, + "learning_rate": 1.902089977776957e-05, + "loss": 0.4296, + "step": 5364 + }, + { + "epoch": 2.1817812118747457, + "grad_norm": 14.521966484958408, + "learning_rate": 1.9020462492131748e-05, + "loss": 0.9052, + "step": 5365 + }, + { + "epoch": 2.1821878812525415, + "grad_norm": 6.944112171017405, + "learning_rate": 1.902002511389434e-05, + "loss": 0.3943, + "step": 5366 + }, + { + "epoch": 2.1825945506303377, + "grad_norm": 12.220002208537485, + "learning_rate": 1.901958764306183e-05, + "loss": 0.78, + "step": 5367 + }, + { + "epoch": 2.1830012200081335, + "grad_norm": 9.433521935792315, + "learning_rate": 1.9019150079638718e-05, + "loss": 0.3831, + "step": 5368 + }, + { + "epoch": 2.1834078893859292, + "grad_norm": 12.38276561658991, + "learning_rate": 1.901871242362949e-05, + "loss": 0.3372, + 
"step": 5369 + }, + { + "epoch": 2.183814558763725, + "grad_norm": 14.294946913196242, + "learning_rate": 1.9018274675038638e-05, + "loss": 0.5341, + "step": 5370 + }, + { + "epoch": 2.1842212281415208, + "grad_norm": 9.992315113148994, + "learning_rate": 1.9017836833870663e-05, + "loss": 0.4454, + "step": 5371 + }, + { + "epoch": 2.184627897519317, + "grad_norm": 167.69255675071562, + "learning_rate": 1.9017398900130048e-05, + "loss": 0.2161, + "step": 5372 + }, + { + "epoch": 2.1850345668971127, + "grad_norm": 7.873546160608925, + "learning_rate": 1.9016960873821298e-05, + "loss": 0.1833, + "step": 5373 + }, + { + "epoch": 2.1854412362749085, + "grad_norm": 0.5518182257936557, + "learning_rate": 1.9016522754948906e-05, + "loss": 0.0097, + "step": 5374 + }, + { + "epoch": 2.1858479056527043, + "grad_norm": 16.435083458779445, + "learning_rate": 1.9016084543517368e-05, + "loss": 0.7628, + "step": 5375 + }, + { + "epoch": 2.1862545750305, + "grad_norm": 1.5880503980695728, + "learning_rate": 1.9015646239531187e-05, + "loss": 0.0274, + "step": 5376 + }, + { + "epoch": 2.1866612444082962, + "grad_norm": 26.69428365593077, + "learning_rate": 1.901520784299486e-05, + "loss": 1.0257, + "step": 5377 + }, + { + "epoch": 2.187067913786092, + "grad_norm": 9.953794697831524, + "learning_rate": 1.9014769353912888e-05, + "loss": 0.5812, + "step": 5378 + }, + { + "epoch": 2.1874745831638878, + "grad_norm": 3.486022181002821, + "learning_rate": 1.9014330772289768e-05, + "loss": 0.0707, + "step": 5379 + }, + { + "epoch": 2.1878812525416835, + "grad_norm": 18.660440741437192, + "learning_rate": 1.901389209813001e-05, + "loss": 0.9501, + "step": 5380 + }, + { + "epoch": 2.1882879219194793, + "grad_norm": 14.047466059080067, + "learning_rate": 1.901345333143811e-05, + "loss": 0.5181, + "step": 5381 + }, + { + "epoch": 2.1886945912972755, + "grad_norm": 6.9153884492492965, + "learning_rate": 1.9013014472218575e-05, + "loss": 0.3, + "step": 5382 + }, + { + "epoch": 2.1891012606750713, + "grad_norm": 10.949876168772334, + "learning_rate": 1.901257552047591e-05, + "loss": 0.1379, + "step": 5383 + }, + { + "epoch": 2.189507930052867, + "grad_norm": 7.801662994743998, + "learning_rate": 1.9012136476214624e-05, + "loss": 0.1522, + "step": 5384 + }, + { + "epoch": 2.189914599430663, + "grad_norm": 9.208209816588756, + "learning_rate": 1.9011697339439222e-05, + "loss": 0.2944, + "step": 5385 + }, + { + "epoch": 2.1903212688084586, + "grad_norm": 5.880518431908457, + "learning_rate": 1.9011258110154206e-05, + "loss": 0.2065, + "step": 5386 + }, + { + "epoch": 2.1907279381862548, + "grad_norm": 7.687002741012367, + "learning_rate": 1.9010818788364095e-05, + "loss": 0.2156, + "step": 5387 + }, + { + "epoch": 2.1911346075640505, + "grad_norm": 7.59842469286264, + "learning_rate": 1.9010379374073395e-05, + "loss": 0.1761, + "step": 5388 + }, + { + "epoch": 2.1915412769418463, + "grad_norm": 11.266394524078477, + "learning_rate": 1.9009939867286612e-05, + "loss": 0.6367, + "step": 5389 + }, + { + "epoch": 2.191947946319642, + "grad_norm": 1.2872709028808507, + "learning_rate": 1.9009500268008267e-05, + "loss": 0.019, + "step": 5390 + }, + { + "epoch": 2.192354615697438, + "grad_norm": 2.841309195665685, + "learning_rate": 1.9009060576242858e-05, + "loss": 0.0587, + "step": 5391 + }, + { + "epoch": 2.1927612850752336, + "grad_norm": 11.623150819226481, + "learning_rate": 1.9008620791994916e-05, + "loss": 0.3903, + "step": 5392 + }, + { + "epoch": 2.19316795445303, + "grad_norm": 8.261859549625745, + "learning_rate": 
1.9008180915268943e-05, + "loss": 0.3391, + "step": 5393 + }, + { + "epoch": 2.1935746238308256, + "grad_norm": 10.418839060670404, + "learning_rate": 1.9007740946069463e-05, + "loss": 0.1909, + "step": 5394 + }, + { + "epoch": 2.1939812932086213, + "grad_norm": 12.975294133743224, + "learning_rate": 1.9007300884400988e-05, + "loss": 0.6478, + "step": 5395 + }, + { + "epoch": 2.194387962586417, + "grad_norm": 3.2132488150167364, + "learning_rate": 1.9006860730268032e-05, + "loss": 0.0486, + "step": 5396 + }, + { + "epoch": 2.1947946319642133, + "grad_norm": 4.5719037192559195, + "learning_rate": 1.9006420483675125e-05, + "loss": 0.1909, + "step": 5397 + }, + { + "epoch": 2.195201301342009, + "grad_norm": 2.1142414409713073, + "learning_rate": 1.9005980144626775e-05, + "loss": 0.0416, + "step": 5398 + }, + { + "epoch": 2.195607970719805, + "grad_norm": 5.34553524510879, + "learning_rate": 1.9005539713127504e-05, + "loss": 0.1045, + "step": 5399 + }, + { + "epoch": 2.1960146400976006, + "grad_norm": 6.843763834047161, + "learning_rate": 1.900509918918184e-05, + "loss": 0.4482, + "step": 5400 + }, + { + "epoch": 2.1964213094753964, + "grad_norm": 7.001719464077042, + "learning_rate": 1.9004658572794294e-05, + "loss": 0.164, + "step": 5401 + }, + { + "epoch": 2.196827978853192, + "grad_norm": 13.94002060806066, + "learning_rate": 1.90042178639694e-05, + "loss": 0.0761, + "step": 5402 + }, + { + "epoch": 2.1972346482309884, + "grad_norm": 14.853372266367273, + "learning_rate": 1.900377706271168e-05, + "loss": 0.4311, + "step": 5403 + }, + { + "epoch": 2.197641317608784, + "grad_norm": 10.78172839388738, + "learning_rate": 1.9003336169025655e-05, + "loss": 0.358, + "step": 5404 + }, + { + "epoch": 2.19804798698658, + "grad_norm": 0.609151466164763, + "learning_rate": 1.9002895182915854e-05, + "loss": 0.008, + "step": 5405 + }, + { + "epoch": 2.1984546563643756, + "grad_norm": 17.621728186749223, + "learning_rate": 1.90024541043868e-05, + "loss": 1.2339, + "step": 5406 + }, + { + "epoch": 2.1988613257421714, + "grad_norm": 7.12356852634026, + "learning_rate": 1.9002012933443025e-05, + "loss": 0.2358, + "step": 5407 + }, + { + "epoch": 2.1992679951199676, + "grad_norm": 17.07174543972333, + "learning_rate": 1.9001571670089057e-05, + "loss": 0.3309, + "step": 5408 + }, + { + "epoch": 2.1996746644977634, + "grad_norm": 1.774156838102799, + "learning_rate": 1.9001130314329423e-05, + "loss": 0.0158, + "step": 5409 + }, + { + "epoch": 2.200081333875559, + "grad_norm": 31.4278065222669, + "learning_rate": 1.9000688866168658e-05, + "loss": 0.2458, + "step": 5410 + }, + { + "epoch": 2.200488003253355, + "grad_norm": 12.498046000425926, + "learning_rate": 1.9000247325611293e-05, + "loss": 0.2831, + "step": 5411 + }, + { + "epoch": 2.2008946726311507, + "grad_norm": 11.58149974716747, + "learning_rate": 1.899980569266186e-05, + "loss": 0.5373, + "step": 5412 + }, + { + "epoch": 2.201301342008947, + "grad_norm": 3.84786362258619, + "learning_rate": 1.8999363967324892e-05, + "loss": 0.0714, + "step": 5413 + }, + { + "epoch": 2.2017080113867427, + "grad_norm": 8.08147362087012, + "learning_rate": 1.8998922149604923e-05, + "loss": 0.4918, + "step": 5414 + }, + { + "epoch": 2.2021146807645384, + "grad_norm": 4.6688051856745, + "learning_rate": 1.8998480239506487e-05, + "loss": 0.0917, + "step": 5415 + }, + { + "epoch": 2.202521350142334, + "grad_norm": 1.2613108520168321, + "learning_rate": 1.899803823703412e-05, + "loss": 0.0211, + "step": 5416 + }, + { + "epoch": 2.20292801952013, + "grad_norm": 
3.7451847126436193, + "learning_rate": 1.8997596142192368e-05, + "loss": 0.1247, + "step": 5417 + }, + { + "epoch": 2.203334688897926, + "grad_norm": 15.23772035792255, + "learning_rate": 1.8997153954985765e-05, + "loss": 0.1862, + "step": 5418 + }, + { + "epoch": 2.203741358275722, + "grad_norm": 7.72541008800862, + "learning_rate": 1.899671167541884e-05, + "loss": 0.2596, + "step": 5419 + }, + { + "epoch": 2.2041480276535177, + "grad_norm": 13.7621357670797, + "learning_rate": 1.899626930349615e-05, + "loss": 0.6376, + "step": 5420 + }, + { + "epoch": 2.2045546970313135, + "grad_norm": 3.0854385153822803, + "learning_rate": 1.8995826839222222e-05, + "loss": 0.0502, + "step": 5421 + }, + { + "epoch": 2.2049613664091092, + "grad_norm": 15.66906403160471, + "learning_rate": 1.8995384282601607e-05, + "loss": 0.5608, + "step": 5422 + }, + { + "epoch": 2.2053680357869054, + "grad_norm": 0.6210243237183942, + "learning_rate": 1.8994941633638845e-05, + "loss": 0.0096, + "step": 5423 + }, + { + "epoch": 2.205774705164701, + "grad_norm": 13.552762321380683, + "learning_rate": 1.8994498892338477e-05, + "loss": 0.3409, + "step": 5424 + }, + { + "epoch": 2.206181374542497, + "grad_norm": 8.372563359012593, + "learning_rate": 1.8994056058705053e-05, + "loss": 0.2014, + "step": 5425 + }, + { + "epoch": 2.2065880439202927, + "grad_norm": 6.235256829434581, + "learning_rate": 1.8993613132743117e-05, + "loss": 0.1137, + "step": 5426 + }, + { + "epoch": 2.2069947132980885, + "grad_norm": 9.552611391430505, + "learning_rate": 1.899317011445722e-05, + "loss": 0.2704, + "step": 5427 + }, + { + "epoch": 2.2074013826758847, + "grad_norm": 14.866239612729357, + "learning_rate": 1.89927270038519e-05, + "loss": 0.2864, + "step": 5428 + }, + { + "epoch": 2.2078080520536805, + "grad_norm": 1.5289077065508836, + "learning_rate": 1.8992283800931713e-05, + "loss": 0.0197, + "step": 5429 + }, + { + "epoch": 2.2082147214314762, + "grad_norm": 14.80208111934849, + "learning_rate": 1.8991840505701207e-05, + "loss": 0.1914, + "step": 5430 + }, + { + "epoch": 2.208621390809272, + "grad_norm": 8.057465366241734, + "learning_rate": 1.8991397118164932e-05, + "loss": 0.27, + "step": 5431 + }, + { + "epoch": 2.2090280601870678, + "grad_norm": 12.772772091411337, + "learning_rate": 1.899095363832744e-05, + "loss": 0.2453, + "step": 5432 + }, + { + "epoch": 2.2094347295648635, + "grad_norm": 21.556922825164346, + "learning_rate": 1.8990510066193285e-05, + "loss": 1.2308, + "step": 5433 + }, + { + "epoch": 2.2098413989426597, + "grad_norm": 4.834342775448588, + "learning_rate": 1.899006640176702e-05, + "loss": 0.0568, + "step": 5434 + }, + { + "epoch": 2.2102480683204555, + "grad_norm": 1.0544959772261115, + "learning_rate": 1.89896226450532e-05, + "loss": 0.0214, + "step": 5435 + }, + { + "epoch": 2.2106547376982513, + "grad_norm": 10.984400132083188, + "learning_rate": 1.8989178796056374e-05, + "loss": 0.5444, + "step": 5436 + }, + { + "epoch": 2.211061407076047, + "grad_norm": 5.77499550940754, + "learning_rate": 1.8988734854781106e-05, + "loss": 0.1083, + "step": 5437 + }, + { + "epoch": 2.2114680764538432, + "grad_norm": 8.99388598248317, + "learning_rate": 1.8988290821231953e-05, + "loss": 0.2182, + "step": 5438 + }, + { + "epoch": 2.211874745831639, + "grad_norm": 14.418114941252957, + "learning_rate": 1.898784669541347e-05, + "loss": 0.4432, + "step": 5439 + }, + { + "epoch": 2.2122814152094348, + "grad_norm": 11.455466610694263, + "learning_rate": 1.8987402477330218e-05, + "loss": 0.3568, + "step": 5440 + }, + { + "epoch": 
2.2126880845872305, + "grad_norm": 3.868726367368935, + "learning_rate": 1.8986958166986753e-05, + "loss": 0.1218, + "step": 5441 + }, + { + "epoch": 2.2130947539650263, + "grad_norm": 6.931820017275408, + "learning_rate": 1.8986513764387642e-05, + "loss": 0.1829, + "step": 5442 + }, + { + "epoch": 2.213501423342822, + "grad_norm": 6.6937368495286735, + "learning_rate": 1.8986069269537444e-05, + "loss": 0.3088, + "step": 5443 + }, + { + "epoch": 2.2139080927206183, + "grad_norm": 16.300724454973246, + "learning_rate": 1.8985624682440723e-05, + "loss": 0.8337, + "step": 5444 + }, + { + "epoch": 2.214314762098414, + "grad_norm": 19.307606264547054, + "learning_rate": 1.898518000310204e-05, + "loss": 0.0904, + "step": 5445 + }, + { + "epoch": 2.21472143147621, + "grad_norm": 8.475428972176411, + "learning_rate": 1.8984735231525964e-05, + "loss": 0.2305, + "step": 5446 + }, + { + "epoch": 2.2151281008540056, + "grad_norm": 0.03736885590831164, + "learning_rate": 1.8984290367717057e-05, + "loss": 0.0007, + "step": 5447 + }, + { + "epoch": 2.2155347702318013, + "grad_norm": 23.444381832678996, + "learning_rate": 1.898384541167989e-05, + "loss": 0.6815, + "step": 5448 + }, + { + "epoch": 2.2159414396095976, + "grad_norm": 13.663564946855947, + "learning_rate": 1.8983400363419027e-05, + "loss": 0.4381, + "step": 5449 + }, + { + "epoch": 2.2163481089873933, + "grad_norm": 1.958940716444621, + "learning_rate": 1.898295522293904e-05, + "loss": 0.0345, + "step": 5450 + }, + { + "epoch": 2.216754778365189, + "grad_norm": 3.139131967298503, + "learning_rate": 1.8982509990244492e-05, + "loss": 0.1061, + "step": 5451 + }, + { + "epoch": 2.217161447742985, + "grad_norm": 12.489049233385117, + "learning_rate": 1.8982064665339963e-05, + "loss": 0.7085, + "step": 5452 + }, + { + "epoch": 2.2175681171207806, + "grad_norm": 7.221106799465949, + "learning_rate": 1.898161924823002e-05, + "loss": 0.1941, + "step": 5453 + }, + { + "epoch": 2.217974786498577, + "grad_norm": 9.169990368364855, + "learning_rate": 1.898117373891923e-05, + "loss": 0.1932, + "step": 5454 + }, + { + "epoch": 2.2183814558763726, + "grad_norm": 4.811174243423325, + "learning_rate": 1.8980728137412176e-05, + "loss": 0.0282, + "step": 5455 + }, + { + "epoch": 2.2187881252541684, + "grad_norm": 3.745851478683104, + "learning_rate": 1.8980282443713425e-05, + "loss": 0.0816, + "step": 5456 + }, + { + "epoch": 2.219194794631964, + "grad_norm": 1.8662107528261807, + "learning_rate": 1.8979836657827558e-05, + "loss": 0.0359, + "step": 5457 + }, + { + "epoch": 2.21960146400976, + "grad_norm": 0.20574073661143025, + "learning_rate": 1.8979390779759144e-05, + "loss": 0.003, + "step": 5458 + }, + { + "epoch": 2.220008133387556, + "grad_norm": 15.21433108697028, + "learning_rate": 1.8978944809512768e-05, + "loss": 0.533, + "step": 5459 + }, + { + "epoch": 2.220414802765352, + "grad_norm": 1.3469210370052407, + "learning_rate": 1.8978498747093003e-05, + "loss": 0.022, + "step": 5460 + }, + { + "epoch": 2.2208214721431476, + "grad_norm": 15.891324232901649, + "learning_rate": 1.897805259250443e-05, + "loss": 0.5573, + "step": 5461 + }, + { + "epoch": 2.2212281415209434, + "grad_norm": 5.156127299556623, + "learning_rate": 1.8977606345751628e-05, + "loss": 0.0272, + "step": 5462 + }, + { + "epoch": 2.221634810898739, + "grad_norm": 42.83683299491027, + "learning_rate": 1.897716000683918e-05, + "loss": 2.1812, + "step": 5463 + }, + { + "epoch": 2.2220414802765354, + "grad_norm": 6.296750596371426, + "learning_rate": 1.8976713575771665e-05, + "loss": 0.2309, 
+ "step": 5464 + }, + { + "epoch": 2.222448149654331, + "grad_norm": 30.22182620149802, + "learning_rate": 1.8976267052553668e-05, + "loss": 0.3499, + "step": 5465 + }, + { + "epoch": 2.222854819032127, + "grad_norm": 3.8406674813012094, + "learning_rate": 1.897582043718977e-05, + "loss": 0.0702, + "step": 5466 + }, + { + "epoch": 2.2232614884099227, + "grad_norm": 0.649512726095665, + "learning_rate": 1.897537372968456e-05, + "loss": 0.0098, + "step": 5467 + }, + { + "epoch": 2.2236681577877184, + "grad_norm": 1.237793417395373, + "learning_rate": 1.8974926930042625e-05, + "loss": 0.0179, + "step": 5468 + }, + { + "epoch": 2.2240748271655146, + "grad_norm": 14.159256465868664, + "learning_rate": 1.897448003826854e-05, + "loss": 0.1874, + "step": 5469 + }, + { + "epoch": 2.2244814965433104, + "grad_norm": 7.16933371387212, + "learning_rate": 1.8974033054366906e-05, + "loss": 0.2357, + "step": 5470 + }, + { + "epoch": 2.224888165921106, + "grad_norm": 8.729061939302442, + "learning_rate": 1.8973585978342305e-05, + "loss": 0.1642, + "step": 5471 + }, + { + "epoch": 2.225294835298902, + "grad_norm": 1.323123987507907, + "learning_rate": 1.8973138810199327e-05, + "loss": 0.0117, + "step": 5472 + }, + { + "epoch": 2.2257015046766977, + "grad_norm": 10.488487595998219, + "learning_rate": 1.8972691549942562e-05, + "loss": 0.4225, + "step": 5473 + }, + { + "epoch": 2.226108174054494, + "grad_norm": 4.490230184330027, + "learning_rate": 1.8972244197576603e-05, + "loss": 0.0872, + "step": 5474 + }, + { + "epoch": 2.2265148434322897, + "grad_norm": 4.12642768865254, + "learning_rate": 1.8971796753106046e-05, + "loss": 0.0869, + "step": 5475 + }, + { + "epoch": 2.2269215128100854, + "grad_norm": 8.703685293207363, + "learning_rate": 1.8971349216535475e-05, + "loss": 0.2589, + "step": 5476 + }, + { + "epoch": 2.227328182187881, + "grad_norm": 6.375156178161772, + "learning_rate": 1.897090158786949e-05, + "loss": 0.3332, + "step": 5477 + }, + { + "epoch": 2.227734851565677, + "grad_norm": 8.494459984589152, + "learning_rate": 1.8970453867112685e-05, + "loss": 0.3417, + "step": 5478 + }, + { + "epoch": 2.228141520943473, + "grad_norm": 13.601721195118573, + "learning_rate": 1.8970006054269658e-05, + "loss": 0.5406, + "step": 5479 + }, + { + "epoch": 2.228548190321269, + "grad_norm": 12.433425055651393, + "learning_rate": 1.8969558149345004e-05, + "loss": 0.4372, + "step": 5480 + }, + { + "epoch": 2.2289548596990647, + "grad_norm": 0.28685296501010416, + "learning_rate": 1.896911015234332e-05, + "loss": 0.0052, + "step": 5481 + }, + { + "epoch": 2.2293615290768605, + "grad_norm": 10.063453616912462, + "learning_rate": 1.8968662063269207e-05, + "loss": 0.3827, + "step": 5482 + }, + { + "epoch": 2.2297681984546562, + "grad_norm": 39.64897364963271, + "learning_rate": 1.8968213882127265e-05, + "loss": 0.8008, + "step": 5483 + }, + { + "epoch": 2.230174867832452, + "grad_norm": 7.658916008716941, + "learning_rate": 1.896776560892209e-05, + "loss": 0.1926, + "step": 5484 + }, + { + "epoch": 2.230581537210248, + "grad_norm": 12.454574013202429, + "learning_rate": 1.8967317243658294e-05, + "loss": 0.2243, + "step": 5485 + }, + { + "epoch": 2.230988206588044, + "grad_norm": 8.776654090825982, + "learning_rate": 1.8966868786340467e-05, + "loss": 0.2784, + "step": 5486 + }, + { + "epoch": 2.2313948759658397, + "grad_norm": 13.359795373303678, + "learning_rate": 1.8966420236973225e-05, + "loss": 0.3308, + "step": 5487 + }, + { + "epoch": 2.2318015453436355, + "grad_norm": 4.6772585018537765, + "learning_rate": 
1.8965971595561162e-05, + "loss": 0.1038, + "step": 5488 + }, + { + "epoch": 2.2322082147214313, + "grad_norm": 0.4719508289620225, + "learning_rate": 1.896552286210889e-05, + "loss": 0.0089, + "step": 5489 + }, + { + "epoch": 2.2326148840992275, + "grad_norm": 8.514945342234594, + "learning_rate": 1.8965074036621013e-05, + "loss": 0.1864, + "step": 5490 + }, + { + "epoch": 2.2330215534770232, + "grad_norm": 11.212379012017955, + "learning_rate": 1.896462511910214e-05, + "loss": 0.2672, + "step": 5491 + }, + { + "epoch": 2.233428222854819, + "grad_norm": 10.335353952151161, + "learning_rate": 1.896417610955688e-05, + "loss": 0.7253, + "step": 5492 + }, + { + "epoch": 2.2338348922326148, + "grad_norm": 4.269104210559541, + "learning_rate": 1.8963727007989835e-05, + "loss": 0.057, + "step": 5493 + }, + { + "epoch": 2.2342415616104105, + "grad_norm": 6.945028234552902, + "learning_rate": 1.8963277814405627e-05, + "loss": 0.1186, + "step": 5494 + }, + { + "epoch": 2.2346482309882068, + "grad_norm": 2.9559363860521386, + "learning_rate": 1.8962828528808853e-05, + "loss": 0.1494, + "step": 5495 + }, + { + "epoch": 2.2350549003660025, + "grad_norm": 9.2728977922434, + "learning_rate": 1.896237915120414e-05, + "loss": 0.3098, + "step": 5496 + }, + { + "epoch": 2.2354615697437983, + "grad_norm": 7.663714194682838, + "learning_rate": 1.8961929681596094e-05, + "loss": 0.1203, + "step": 5497 + }, + { + "epoch": 2.235868239121594, + "grad_norm": 5.2019229016257995, + "learning_rate": 1.896148011998933e-05, + "loss": 0.1725, + "step": 5498 + }, + { + "epoch": 2.23627490849939, + "grad_norm": 11.363588889646659, + "learning_rate": 1.896103046638846e-05, + "loss": 0.3141, + "step": 5499 + }, + { + "epoch": 2.236681577877186, + "grad_norm": 2.2193959969913415, + "learning_rate": 1.8960580720798104e-05, + "loss": 0.0402, + "step": 5500 + }, + { + "epoch": 2.237088247254982, + "grad_norm": 7.0362366463754835, + "learning_rate": 1.8960130883222874e-05, + "loss": 0.7576, + "step": 5501 + }, + { + "epoch": 2.2374949166327776, + "grad_norm": 12.615189378146987, + "learning_rate": 1.895968095366739e-05, + "loss": 0.4051, + "step": 5502 + }, + { + "epoch": 2.2379015860105733, + "grad_norm": 3.907086796165836, + "learning_rate": 1.8959230932136277e-05, + "loss": 0.1065, + "step": 5503 + }, + { + "epoch": 2.238308255388369, + "grad_norm": 2.5794518994794338, + "learning_rate": 1.8958780818634146e-05, + "loss": 0.0555, + "step": 5504 + }, + { + "epoch": 2.2387149247661653, + "grad_norm": 9.3145076928862, + "learning_rate": 1.8958330613165622e-05, + "loss": 0.6037, + "step": 5505 + }, + { + "epoch": 2.239121594143961, + "grad_norm": 9.017481248470819, + "learning_rate": 1.8957880315735328e-05, + "loss": 0.5058, + "step": 5506 + }, + { + "epoch": 2.239528263521757, + "grad_norm": 10.326112037580561, + "learning_rate": 1.895742992634788e-05, + "loss": 0.4572, + "step": 5507 + }, + { + "epoch": 2.2399349328995526, + "grad_norm": 26.763219508600255, + "learning_rate": 1.895697944500791e-05, + "loss": 1.4033, + "step": 5508 + }, + { + "epoch": 2.2403416022773484, + "grad_norm": 18.503851515421697, + "learning_rate": 1.895652887172003e-05, + "loss": 0.6096, + "step": 5509 + }, + { + "epoch": 2.2407482716551446, + "grad_norm": 1.8607830100488023, + "learning_rate": 1.895607820648888e-05, + "loss": 0.0298, + "step": 5510 + }, + { + "epoch": 2.2411549410329403, + "grad_norm": 11.27274324399376, + "learning_rate": 1.8955627449319082e-05, + "loss": 0.4977, + "step": 5511 + }, + { + "epoch": 2.241561610410736, + "grad_norm": 
4.655992235766663, + "learning_rate": 1.8955176600215257e-05, + "loss": 0.1099, + "step": 5512 + }, + { + "epoch": 2.241968279788532, + "grad_norm": 9.957477170656484, + "learning_rate": 1.8954725659182038e-05, + "loss": 0.2236, + "step": 5513 + }, + { + "epoch": 2.2423749491663276, + "grad_norm": 8.6037977087206, + "learning_rate": 1.8954274626224053e-05, + "loss": 0.2517, + "step": 5514 + }, + { + "epoch": 2.242781618544124, + "grad_norm": 8.933496636070087, + "learning_rate": 1.895382350134593e-05, + "loss": 0.3504, + "step": 5515 + }, + { + "epoch": 2.2431882879219196, + "grad_norm": 15.327399733521325, + "learning_rate": 1.8953372284552306e-05, + "loss": 1.1423, + "step": 5516 + }, + { + "epoch": 2.2435949572997154, + "grad_norm": 10.296871413975012, + "learning_rate": 1.8952920975847808e-05, + "loss": 0.4113, + "step": 5517 + }, + { + "epoch": 2.244001626677511, + "grad_norm": 12.438775743838425, + "learning_rate": 1.895246957523707e-05, + "loss": 0.9177, + "step": 5518 + }, + { + "epoch": 2.244408296055307, + "grad_norm": 8.140034432990177, + "learning_rate": 1.8952018082724724e-05, + "loss": 0.4527, + "step": 5519 + }, + { + "epoch": 2.244814965433103, + "grad_norm": 8.377924274223728, + "learning_rate": 1.8951566498315412e-05, + "loss": 0.2159, + "step": 5520 + }, + { + "epoch": 2.245221634810899, + "grad_norm": 9.3642155012495, + "learning_rate": 1.8951114822013763e-05, + "loss": 0.1946, + "step": 5521 + }, + { + "epoch": 2.2456283041886946, + "grad_norm": 11.329063574902614, + "learning_rate": 1.895066305382441e-05, + "loss": 0.4045, + "step": 5522 + }, + { + "epoch": 2.2460349735664904, + "grad_norm": 9.819314809346812, + "learning_rate": 1.8950211193752004e-05, + "loss": 0.2808, + "step": 5523 + }, + { + "epoch": 2.246441642944286, + "grad_norm": 19.76722967264545, + "learning_rate": 1.894975924180117e-05, + "loss": 0.4791, + "step": 5524 + }, + { + "epoch": 2.246848312322082, + "grad_norm": 10.042397641606259, + "learning_rate": 1.8949307197976558e-05, + "loss": 0.255, + "step": 5525 + }, + { + "epoch": 2.247254981699878, + "grad_norm": 2.3393504014975, + "learning_rate": 1.8948855062282798e-05, + "loss": 0.0366, + "step": 5526 + }, + { + "epoch": 2.247661651077674, + "grad_norm": 5.548145480252637, + "learning_rate": 1.894840283472454e-05, + "loss": 0.1647, + "step": 5527 + }, + { + "epoch": 2.2480683204554697, + "grad_norm": 10.480740084181102, + "learning_rate": 1.8947950515306424e-05, + "loss": 0.2472, + "step": 5528 + }, + { + "epoch": 2.2484749898332654, + "grad_norm": 5.670506341224859, + "learning_rate": 1.894749810403309e-05, + "loss": 0.1331, + "step": 5529 + }, + { + "epoch": 2.248881659211061, + "grad_norm": 10.132998311571463, + "learning_rate": 1.8947045600909188e-05, + "loss": 0.4254, + "step": 5530 + }, + { + "epoch": 2.2492883285888574, + "grad_norm": 9.876737484686323, + "learning_rate": 1.8946593005939354e-05, + "loss": 0.3132, + "step": 5531 + }, + { + "epoch": 2.249694997966653, + "grad_norm": 13.21885126719903, + "learning_rate": 1.894614031912825e-05, + "loss": 0.559, + "step": 5532 + }, + { + "epoch": 2.250101667344449, + "grad_norm": 3.9028232633543993, + "learning_rate": 1.8945687540480504e-05, + "loss": 0.0562, + "step": 5533 + }, + { + "epoch": 2.2505083367222447, + "grad_norm": 15.578899314874738, + "learning_rate": 1.8945234670000777e-05, + "loss": 0.9596, + "step": 5534 + }, + { + "epoch": 2.2509150061000405, + "grad_norm": 12.403813155528223, + "learning_rate": 1.894478170769371e-05, + "loss": 0.1921, + "step": 5535 + }, + { + "epoch": 
2.2513216754778367, + "grad_norm": 5.958238548739688, + "learning_rate": 1.8944328653563962e-05, + "loss": 0.4331, + "step": 5536 + }, + { + "epoch": 2.2517283448556324, + "grad_norm": 1.4308150512842592, + "learning_rate": 1.8943875507616178e-05, + "loss": 0.0254, + "step": 5537 + }, + { + "epoch": 2.252135014233428, + "grad_norm": 5.920049436454068, + "learning_rate": 1.894342226985501e-05, + "loss": 0.1044, + "step": 5538 + }, + { + "epoch": 2.252541683611224, + "grad_norm": 9.065756919171246, + "learning_rate": 1.8942968940285112e-05, + "loss": 0.2578, + "step": 5539 + }, + { + "epoch": 2.2529483529890197, + "grad_norm": 6.943668928678551, + "learning_rate": 1.8942515518911135e-05, + "loss": 0.3076, + "step": 5540 + }, + { + "epoch": 2.253355022366816, + "grad_norm": 11.735228594532279, + "learning_rate": 1.8942062005737736e-05, + "loss": 0.4215, + "step": 5541 + }, + { + "epoch": 2.2537616917446117, + "grad_norm": 5.361863291239889, + "learning_rate": 1.894160840076957e-05, + "loss": 0.1846, + "step": 5542 + }, + { + "epoch": 2.2541683611224075, + "grad_norm": 5.248799036951382, + "learning_rate": 1.8941154704011297e-05, + "loss": 0.1003, + "step": 5543 + }, + { + "epoch": 2.2545750305002032, + "grad_norm": 11.754167232012096, + "learning_rate": 1.8940700915467565e-05, + "loss": 0.4591, + "step": 5544 + }, + { + "epoch": 2.254981699877999, + "grad_norm": 4.5695181134842695, + "learning_rate": 1.8940247035143042e-05, + "loss": 0.0689, + "step": 5545 + }, + { + "epoch": 2.2553883692557952, + "grad_norm": 13.170058745505035, + "learning_rate": 1.893979306304238e-05, + "loss": 0.5251, + "step": 5546 + }, + { + "epoch": 2.255795038633591, + "grad_norm": 7.6399796458265685, + "learning_rate": 1.8939338999170247e-05, + "loss": 0.2775, + "step": 5547 + }, + { + "epoch": 2.2562017080113868, + "grad_norm": 8.112160123099745, + "learning_rate": 1.89388848435313e-05, + "loss": 0.2644, + "step": 5548 + }, + { + "epoch": 2.2566083773891825, + "grad_norm": 14.485146183071935, + "learning_rate": 1.89384305961302e-05, + "loss": 0.5447, + "step": 5549 + }, + { + "epoch": 2.2570150467669783, + "grad_norm": 4.118858282223993, + "learning_rate": 1.893797625697161e-05, + "loss": 0.0503, + "step": 5550 + }, + { + "epoch": 2.2574217161447745, + "grad_norm": 7.513866677868882, + "learning_rate": 1.89375218260602e-05, + "loss": 0.159, + "step": 5551 + }, + { + "epoch": 2.2578283855225703, + "grad_norm": 11.292093185719374, + "learning_rate": 1.8937067303400627e-05, + "loss": 0.3853, + "step": 5552 + }, + { + "epoch": 2.258235054900366, + "grad_norm": 9.254933635342283, + "learning_rate": 1.893661268899756e-05, + "loss": 0.146, + "step": 5553 + }, + { + "epoch": 2.258641724278162, + "grad_norm": 2.113780854587622, + "learning_rate": 1.8936157982855665e-05, + "loss": 0.0331, + "step": 5554 + }, + { + "epoch": 2.2590483936559576, + "grad_norm": 6.560839612165683, + "learning_rate": 1.8935703184979613e-05, + "loss": 0.1573, + "step": 5555 + }, + { + "epoch": 2.2594550630337533, + "grad_norm": 11.91479840589379, + "learning_rate": 1.8935248295374073e-05, + "loss": 0.2931, + "step": 5556 + }, + { + "epoch": 2.2598617324115495, + "grad_norm": 9.425758408029878, + "learning_rate": 1.8934793314043704e-05, + "loss": 0.2044, + "step": 5557 + }, + { + "epoch": 2.2602684017893453, + "grad_norm": 21.302547080496574, + "learning_rate": 1.893433824099319e-05, + "loss": 1.647, + "step": 5558 + }, + { + "epoch": 2.260675071167141, + "grad_norm": 9.20024362694538, + "learning_rate": 1.89338830762272e-05, + "loss": 0.2279, + 
"step": 5559 + }, + { + "epoch": 2.261081740544937, + "grad_norm": 25.293524005053246, + "learning_rate": 1.89334278197504e-05, + "loss": 0.5075, + "step": 5560 + }, + { + "epoch": 2.261488409922733, + "grad_norm": 7.866452276267778, + "learning_rate": 1.8932972471567467e-05, + "loss": 0.1334, + "step": 5561 + }, + { + "epoch": 2.261895079300529, + "grad_norm": 12.88627278226426, + "learning_rate": 1.893251703168308e-05, + "loss": 0.8828, + "step": 5562 + }, + { + "epoch": 2.2623017486783246, + "grad_norm": 15.080986558762397, + "learning_rate": 1.8932061500101907e-05, + "loss": 1.0016, + "step": 5563 + }, + { + "epoch": 2.2627084180561203, + "grad_norm": 7.687053669098584, + "learning_rate": 1.8931605876828626e-05, + "loss": 0.2095, + "step": 5564 + }, + { + "epoch": 2.263115087433916, + "grad_norm": 8.586081884828001, + "learning_rate": 1.8931150161867917e-05, + "loss": 0.2543, + "step": 5565 + }, + { + "epoch": 2.263521756811712, + "grad_norm": 7.815988395444461, + "learning_rate": 1.8930694355224458e-05, + "loss": 0.373, + "step": 5566 + }, + { + "epoch": 2.263928426189508, + "grad_norm": 4.46047167459169, + "learning_rate": 1.8930238456902925e-05, + "loss": 0.1594, + "step": 5567 + }, + { + "epoch": 2.264335095567304, + "grad_norm": 4.98440773797719, + "learning_rate": 1.8929782466908e-05, + "loss": 0.1368, + "step": 5568 + }, + { + "epoch": 2.2647417649450996, + "grad_norm": 4.339376603214818, + "learning_rate": 1.8929326385244364e-05, + "loss": 0.0932, + "step": 5569 + }, + { + "epoch": 2.2651484343228954, + "grad_norm": 18.879746075638533, + "learning_rate": 1.89288702119167e-05, + "loss": 1.1312, + "step": 5570 + }, + { + "epoch": 2.2655551037006916, + "grad_norm": 2.5768357491874454, + "learning_rate": 1.8928413946929684e-05, + "loss": 0.0481, + "step": 5571 + }, + { + "epoch": 2.2659617730784873, + "grad_norm": 10.397178777565667, + "learning_rate": 1.892795759028801e-05, + "loss": 0.2005, + "step": 5572 + }, + { + "epoch": 2.266368442456283, + "grad_norm": 10.82372902703382, + "learning_rate": 1.892750114199636e-05, + "loss": 0.4511, + "step": 5573 + }, + { + "epoch": 2.266775111834079, + "grad_norm": 11.150385706785704, + "learning_rate": 1.8927044602059412e-05, + "loss": 0.4364, + "step": 5574 + }, + { + "epoch": 2.2671817812118746, + "grad_norm": 5.514577139624393, + "learning_rate": 1.8926587970481864e-05, + "loss": 0.1274, + "step": 5575 + }, + { + "epoch": 2.2675884505896704, + "grad_norm": 5.0368239760852775, + "learning_rate": 1.8926131247268392e-05, + "loss": 0.0843, + "step": 5576 + }, + { + "epoch": 2.2679951199674666, + "grad_norm": 10.55078530214409, + "learning_rate": 1.8925674432423692e-05, + "loss": 0.4641, + "step": 5577 + }, + { + "epoch": 2.2684017893452624, + "grad_norm": 1.211177393816651, + "learning_rate": 1.8925217525952453e-05, + "loss": 0.026, + "step": 5578 + }, + { + "epoch": 2.268808458723058, + "grad_norm": 4.200095931312374, + "learning_rate": 1.8924760527859365e-05, + "loss": 0.0786, + "step": 5579 + }, + { + "epoch": 2.269215128100854, + "grad_norm": 15.144552490085665, + "learning_rate": 1.892430343814912e-05, + "loss": 0.318, + "step": 5580 + }, + { + "epoch": 2.2696217974786497, + "grad_norm": 9.38956288174685, + "learning_rate": 1.8923846256826403e-05, + "loss": 0.2622, + "step": 5581 + }, + { + "epoch": 2.270028466856446, + "grad_norm": 1.2697709828934065, + "learning_rate": 1.8923388983895917e-05, + "loss": 0.0285, + "step": 5582 + }, + { + "epoch": 2.2704351362342416, + "grad_norm": 6.077314245820008, + "learning_rate": 
1.8922931619362352e-05, + "loss": 0.2099, + "step": 5583 + }, + { + "epoch": 2.2708418056120374, + "grad_norm": 10.17206230355986, + "learning_rate": 1.89224741632304e-05, + "loss": 0.4211, + "step": 5584 + }, + { + "epoch": 2.271248474989833, + "grad_norm": 19.139322688115286, + "learning_rate": 1.8922016615504765e-05, + "loss": 0.6238, + "step": 5585 + }, + { + "epoch": 2.271655144367629, + "grad_norm": 9.450054565304795, + "learning_rate": 1.8921558976190137e-05, + "loss": 0.4224, + "step": 5586 + }, + { + "epoch": 2.272061813745425, + "grad_norm": 16.846847117735635, + "learning_rate": 1.8921101245291214e-05, + "loss": 0.3462, + "step": 5587 + }, + { + "epoch": 2.272468483123221, + "grad_norm": 6.985311823579483, + "learning_rate": 1.89206434228127e-05, + "loss": 0.254, + "step": 5588 + }, + { + "epoch": 2.2728751525010167, + "grad_norm": 10.680755258078246, + "learning_rate": 1.892018550875929e-05, + "loss": 0.3594, + "step": 5589 + }, + { + "epoch": 2.2732818218788124, + "grad_norm": 12.880857564537033, + "learning_rate": 1.8919727503135685e-05, + "loss": 1.0445, + "step": 5590 + }, + { + "epoch": 2.273688491256608, + "grad_norm": 16.19608438895543, + "learning_rate": 1.891926940594659e-05, + "loss": 0.4451, + "step": 5591 + }, + { + "epoch": 2.2740951606344044, + "grad_norm": 10.49200894626536, + "learning_rate": 1.8918811217196703e-05, + "loss": 0.5994, + "step": 5592 + }, + { + "epoch": 2.2745018300122, + "grad_norm": 9.664571190644194, + "learning_rate": 1.8918352936890733e-05, + "loss": 0.2359, + "step": 5593 + }, + { + "epoch": 2.274908499389996, + "grad_norm": 2.210166535498499, + "learning_rate": 1.8917894565033383e-05, + "loss": 0.0333, + "step": 5594 + }, + { + "epoch": 2.2753151687677917, + "grad_norm": 2.8484272378983766, + "learning_rate": 1.8917436101629352e-05, + "loss": 0.0293, + "step": 5595 + }, + { + "epoch": 2.2757218381455875, + "grad_norm": 1.6851025379708258, + "learning_rate": 1.8916977546683354e-05, + "loss": 0.0262, + "step": 5596 + }, + { + "epoch": 2.2761285075233832, + "grad_norm": 14.337682165426614, + "learning_rate": 1.8916518900200094e-05, + "loss": 0.8511, + "step": 5597 + }, + { + "epoch": 2.2765351769011795, + "grad_norm": 10.097771206091664, + "learning_rate": 1.891606016218428e-05, + "loss": 0.3729, + "step": 5598 + }, + { + "epoch": 2.2769418462789752, + "grad_norm": 10.725347821339751, + "learning_rate": 1.8915601332640618e-05, + "loss": 0.3421, + "step": 5599 + }, + { + "epoch": 2.277348515656771, + "grad_norm": 5.490213366135468, + "learning_rate": 1.8915142411573827e-05, + "loss": 0.1028, + "step": 5600 + }, + { + "epoch": 2.2777551850345668, + "grad_norm": 9.059047828187834, + "learning_rate": 1.8914683398988606e-05, + "loss": 0.4683, + "step": 5601 + }, + { + "epoch": 2.278161854412363, + "grad_norm": 5.97433763843489, + "learning_rate": 1.8914224294889677e-05, + "loss": 0.0817, + "step": 5602 + }, + { + "epoch": 2.2785685237901587, + "grad_norm": 15.597159737227521, + "learning_rate": 1.8913765099281748e-05, + "loss": 0.4098, + "step": 5603 + }, + { + "epoch": 2.2789751931679545, + "grad_norm": 18.432078448449886, + "learning_rate": 1.8913305812169533e-05, + "loss": 0.8728, + "step": 5604 + }, + { + "epoch": 2.2793818625457503, + "grad_norm": 9.692186522814326, + "learning_rate": 1.8912846433557753e-05, + "loss": 0.3346, + "step": 5605 + }, + { + "epoch": 2.279788531923546, + "grad_norm": 13.454234808631435, + "learning_rate": 1.891238696345111e-05, + "loss": 0.6096, + "step": 5606 + }, + { + "epoch": 2.280195201301342, + "grad_norm": 
6.941634872789003, + "learning_rate": 1.8911927401854336e-05, + "loss": 0.14, + "step": 5607 + }, + { + "epoch": 2.280601870679138, + "grad_norm": 17.06768514067799, + "learning_rate": 1.891146774877214e-05, + "loss": 0.9246, + "step": 5608 + }, + { + "epoch": 2.2810085400569338, + "grad_norm": 9.388103403518857, + "learning_rate": 1.8911008004209243e-05, + "loss": 0.5217, + "step": 5609 + }, + { + "epoch": 2.2814152094347295, + "grad_norm": 10.480973441632937, + "learning_rate": 1.8910548168170364e-05, + "loss": 0.3913, + "step": 5610 + }, + { + "epoch": 2.2818218788125253, + "grad_norm": 6.183453200160819, + "learning_rate": 1.8910088240660225e-05, + "loss": 0.0895, + "step": 5611 + }, + { + "epoch": 2.2822285481903215, + "grad_norm": 5.769870668008028, + "learning_rate": 1.890962822168354e-05, + "loss": 0.1623, + "step": 5612 + }, + { + "epoch": 2.2826352175681173, + "grad_norm": 1.8987617840923277, + "learning_rate": 1.8909168111245043e-05, + "loss": 0.0331, + "step": 5613 + }, + { + "epoch": 2.283041886945913, + "grad_norm": 18.93965756779127, + "learning_rate": 1.890870790934945e-05, + "loss": 0.5255, + "step": 5614 + }, + { + "epoch": 2.283448556323709, + "grad_norm": 2.0056850682683156, + "learning_rate": 1.8908247616001487e-05, + "loss": 0.0324, + "step": 5615 + }, + { + "epoch": 2.2838552257015046, + "grad_norm": 10.661623805530326, + "learning_rate": 1.8907787231205876e-05, + "loss": 0.7132, + "step": 5616 + }, + { + "epoch": 2.2842618950793003, + "grad_norm": 2.609470787545904, + "learning_rate": 1.890732675496735e-05, + "loss": 0.0433, + "step": 5617 + }, + { + "epoch": 2.2846685644570965, + "grad_norm": 9.147432661291525, + "learning_rate": 1.890686618729063e-05, + "loss": 0.5687, + "step": 5618 + }, + { + "epoch": 2.2850752338348923, + "grad_norm": 5.175659929652461, + "learning_rate": 1.8906405528180445e-05, + "loss": 0.1421, + "step": 5619 + }, + { + "epoch": 2.285481903212688, + "grad_norm": 2.368454613383635, + "learning_rate": 1.8905944777641526e-05, + "loss": 0.0312, + "step": 5620 + }, + { + "epoch": 2.285888572590484, + "grad_norm": 3.5519839822934784, + "learning_rate": 1.89054839356786e-05, + "loss": 0.0515, + "step": 5621 + }, + { + "epoch": 2.2862952419682796, + "grad_norm": 5.976583476746591, + "learning_rate": 1.89050230022964e-05, + "loss": 0.1233, + "step": 5622 + }, + { + "epoch": 2.286701911346076, + "grad_norm": 3.2963994332897055, + "learning_rate": 1.890456197749966e-05, + "loss": 0.0673, + "step": 5623 + }, + { + "epoch": 2.2871085807238716, + "grad_norm": 14.66370445481469, + "learning_rate": 1.8904100861293107e-05, + "loss": 0.7291, + "step": 5624 + }, + { + "epoch": 2.2875152501016673, + "grad_norm": 12.92811963019814, + "learning_rate": 1.8903639653681475e-05, + "loss": 0.4056, + "step": 5625 + }, + { + "epoch": 2.287921919479463, + "grad_norm": 13.009250704269816, + "learning_rate": 1.8903178354669506e-05, + "loss": 0.5441, + "step": 5626 + }, + { + "epoch": 2.288328588857259, + "grad_norm": 26.31063250180223, + "learning_rate": 1.8902716964261926e-05, + "loss": 0.2109, + "step": 5627 + }, + { + "epoch": 2.288735258235055, + "grad_norm": 31.281569340287714, + "learning_rate": 1.8902255482463476e-05, + "loss": 0.5828, + "step": 5628 + }, + { + "epoch": 2.289141927612851, + "grad_norm": 4.746055034981894, + "learning_rate": 1.8901793909278895e-05, + "loss": 0.1016, + "step": 5629 + }, + { + "epoch": 2.2895485969906466, + "grad_norm": 9.779125924527582, + "learning_rate": 1.890133224471292e-05, + "loss": 0.2546, + "step": 5630 + }, + { + "epoch": 
2.2899552663684424, + "grad_norm": 17.428436663588737, + "learning_rate": 1.8900870488770288e-05, + "loss": 0.5013, + "step": 5631 + }, + { + "epoch": 2.290361935746238, + "grad_norm": 5.205706144830783, + "learning_rate": 1.8900408641455738e-05, + "loss": 0.2413, + "step": 5632 + }, + { + "epoch": 2.2907686051240344, + "grad_norm": 12.539405012354832, + "learning_rate": 1.8899946702774018e-05, + "loss": 0.254, + "step": 5633 + }, + { + "epoch": 2.29117527450183, + "grad_norm": 7.491411606642461, + "learning_rate": 1.8899484672729867e-05, + "loss": 0.1821, + "step": 5634 + }, + { + "epoch": 2.291581943879626, + "grad_norm": 5.059818833628785, + "learning_rate": 1.8899022551328023e-05, + "loss": 0.0955, + "step": 5635 + }, + { + "epoch": 2.2919886132574216, + "grad_norm": 8.161520778576971, + "learning_rate": 1.8898560338573238e-05, + "loss": 0.1589, + "step": 5636 + }, + { + "epoch": 2.2923952826352174, + "grad_norm": 16.54859702051234, + "learning_rate": 1.8898098034470245e-05, + "loss": 1.2089, + "step": 5637 + }, + { + "epoch": 2.292801952013013, + "grad_norm": 11.693028614341818, + "learning_rate": 1.8897635639023803e-05, + "loss": 0.5347, + "step": 5638 + }, + { + "epoch": 2.2932086213908094, + "grad_norm": 6.483273977210378, + "learning_rate": 1.8897173152238653e-05, + "loss": 0.2089, + "step": 5639 + }, + { + "epoch": 2.293615290768605, + "grad_norm": 0.7892090347404254, + "learning_rate": 1.8896710574119544e-05, + "loss": 0.0121, + "step": 5640 + }, + { + "epoch": 2.294021960146401, + "grad_norm": 15.65824928755092, + "learning_rate": 1.889624790467122e-05, + "loss": 0.7396, + "step": 5641 + }, + { + "epoch": 2.2944286295241967, + "grad_norm": 5.116833586183274, + "learning_rate": 1.8895785143898435e-05, + "loss": 0.1518, + "step": 5642 + }, + { + "epoch": 2.294835298901993, + "grad_norm": 10.220723237430285, + "learning_rate": 1.889532229180594e-05, + "loss": 0.2296, + "step": 5643 + }, + { + "epoch": 2.2952419682797887, + "grad_norm": 0.5869472085400729, + "learning_rate": 1.8894859348398484e-05, + "loss": 0.015, + "step": 5644 + }, + { + "epoch": 2.2956486376575844, + "grad_norm": 2.1306310653118876, + "learning_rate": 1.889439631368082e-05, + "loss": 0.0384, + "step": 5645 + }, + { + "epoch": 2.29605530703538, + "grad_norm": 17.556472039100257, + "learning_rate": 1.88939331876577e-05, + "loss": 0.348, + "step": 5646 + }, + { + "epoch": 2.296461976413176, + "grad_norm": 7.6453072453671265, + "learning_rate": 1.8893469970333876e-05, + "loss": 0.3519, + "step": 5647 + }, + { + "epoch": 2.2968686457909717, + "grad_norm": 12.704319003514918, + "learning_rate": 1.8893006661714115e-05, + "loss": 0.7169, + "step": 5648 + }, + { + "epoch": 2.297275315168768, + "grad_norm": 13.41050943153454, + "learning_rate": 1.889254326180316e-05, + "loss": 0.4505, + "step": 5649 + }, + { + "epoch": 2.2976819845465637, + "grad_norm": 10.077910211357212, + "learning_rate": 1.8892079770605775e-05, + "loss": 0.3269, + "step": 5650 + }, + { + "epoch": 2.2980886539243595, + "grad_norm": 5.670651413474236, + "learning_rate": 1.8891616188126713e-05, + "loss": 0.1632, + "step": 5651 + }, + { + "epoch": 2.2984953233021552, + "grad_norm": 0.607930498301671, + "learning_rate": 1.8891152514370737e-05, + "loss": 0.0129, + "step": 5652 + }, + { + "epoch": 2.2989019926799514, + "grad_norm": 11.414987613744344, + "learning_rate": 1.8890688749342607e-05, + "loss": 0.222, + "step": 5653 + }, + { + "epoch": 2.299308662057747, + "grad_norm": 10.517700509149169, + "learning_rate": 1.8890224893047084e-05, + "loss": 0.3287, 
+ "step": 5654 + }, + { + "epoch": 2.299715331435543, + "grad_norm": 19.283512392348836, + "learning_rate": 1.8889760945488924e-05, + "loss": 0.3916, + "step": 5655 + }, + { + "epoch": 2.3001220008133387, + "grad_norm": 5.7879033239083, + "learning_rate": 1.8889296906672894e-05, + "loss": 0.1278, + "step": 5656 + }, + { + "epoch": 2.3005286701911345, + "grad_norm": 42.408747433110456, + "learning_rate": 1.888883277660376e-05, + "loss": 0.3838, + "step": 5657 + }, + { + "epoch": 2.3009353395689303, + "grad_norm": 0.3195583582130772, + "learning_rate": 1.8888368555286282e-05, + "loss": 0.0038, + "step": 5658 + }, + { + "epoch": 2.3013420089467265, + "grad_norm": 4.657571066778122, + "learning_rate": 1.888790424272523e-05, + "loss": 0.0791, + "step": 5659 + }, + { + "epoch": 2.3017486783245222, + "grad_norm": 14.251237168272418, + "learning_rate": 1.8887439838925366e-05, + "loss": 0.8457, + "step": 5660 + }, + { + "epoch": 2.302155347702318, + "grad_norm": 19.432273652753317, + "learning_rate": 1.888697534389146e-05, + "loss": 0.3724, + "step": 5661 + }, + { + "epoch": 2.3025620170801138, + "grad_norm": 4.145881126919334, + "learning_rate": 1.8886510757628274e-05, + "loss": 0.0845, + "step": 5662 + }, + { + "epoch": 2.3029686864579095, + "grad_norm": 9.233648024362473, + "learning_rate": 1.8886046080140587e-05, + "loss": 0.644, + "step": 5663 + }, + { + "epoch": 2.3033753558357057, + "grad_norm": 9.630115519072092, + "learning_rate": 1.8885581311433166e-05, + "loss": 0.3493, + "step": 5664 + }, + { + "epoch": 2.3037820252135015, + "grad_norm": 19.274678598836292, + "learning_rate": 1.8885116451510777e-05, + "loss": 0.9098, + "step": 5665 + }, + { + "epoch": 2.3041886945912973, + "grad_norm": 5.792097570347076, + "learning_rate": 1.8884651500378203e-05, + "loss": 0.1087, + "step": 5666 + }, + { + "epoch": 2.304595363969093, + "grad_norm": 8.603922041571792, + "learning_rate": 1.8884186458040205e-05, + "loss": 0.2636, + "step": 5667 + }, + { + "epoch": 2.305002033346889, + "grad_norm": 4.715062795043, + "learning_rate": 1.8883721324501562e-05, + "loss": 0.1545, + "step": 5668 + }, + { + "epoch": 2.305408702724685, + "grad_norm": 17.347765832437645, + "learning_rate": 1.8883256099767046e-05, + "loss": 0.5421, + "step": 5669 + }, + { + "epoch": 2.3058153721024808, + "grad_norm": 49.566703435340756, + "learning_rate": 1.8882790783841437e-05, + "loss": 0.7955, + "step": 5670 + }, + { + "epoch": 2.3062220414802765, + "grad_norm": 1.9027531500410024, + "learning_rate": 1.888232537672951e-05, + "loss": 0.0302, + "step": 5671 + }, + { + "epoch": 2.3066287108580723, + "grad_norm": 7.3225465116919715, + "learning_rate": 1.8881859878436044e-05, + "loss": 0.2262, + "step": 5672 + }, + { + "epoch": 2.307035380235868, + "grad_norm": 10.314051562598772, + "learning_rate": 1.8881394288965817e-05, + "loss": 0.2849, + "step": 5673 + }, + { + "epoch": 2.3074420496136643, + "grad_norm": 7.230673601338917, + "learning_rate": 1.8880928608323607e-05, + "loss": 0.1482, + "step": 5674 + }, + { + "epoch": 2.30784871899146, + "grad_norm": 14.857610451208764, + "learning_rate": 1.8880462836514193e-05, + "loss": 0.5851, + "step": 5675 + }, + { + "epoch": 2.308255388369256, + "grad_norm": 4.266572612034269, + "learning_rate": 1.887999697354236e-05, + "loss": 0.068, + "step": 5676 + }, + { + "epoch": 2.3086620577470516, + "grad_norm": 10.4785142891004, + "learning_rate": 1.887953101941289e-05, + "loss": 0.4298, + "step": 5677 + }, + { + "epoch": 2.3090687271248473, + "grad_norm": 1.723865890067309, + "learning_rate": 
1.887906497413056e-05, + "loss": 0.0268, + "step": 5678 + }, + { + "epoch": 2.309475396502643, + "grad_norm": 12.809099020768727, + "learning_rate": 1.8878598837700166e-05, + "loss": 0.4383, + "step": 5679 + }, + { + "epoch": 2.3098820658804393, + "grad_norm": 1.1339428507409446, + "learning_rate": 1.887813261012648e-05, + "loss": 0.013, + "step": 5680 + }, + { + "epoch": 2.310288735258235, + "grad_norm": 6.578049088870301, + "learning_rate": 1.8877666291414297e-05, + "loss": 0.192, + "step": 5681 + }, + { + "epoch": 2.310695404636031, + "grad_norm": 9.010557918001869, + "learning_rate": 1.8877199881568405e-05, + "loss": 0.4878, + "step": 5682 + }, + { + "epoch": 2.3111020740138266, + "grad_norm": 9.92378951200133, + "learning_rate": 1.8876733380593586e-05, + "loss": 0.4403, + "step": 5683 + }, + { + "epoch": 2.311508743391623, + "grad_norm": 1.139561862155218, + "learning_rate": 1.887626678849463e-05, + "loss": 0.0151, + "step": 5684 + }, + { + "epoch": 2.3119154127694186, + "grad_norm": 0.46057442410527744, + "learning_rate": 1.887580010527633e-05, + "loss": 0.003, + "step": 5685 + }, + { + "epoch": 2.3123220821472144, + "grad_norm": 10.832472575552112, + "learning_rate": 1.8875333330943476e-05, + "loss": 0.7049, + "step": 5686 + }, + { + "epoch": 2.31272875152501, + "grad_norm": 2.360830334121049, + "learning_rate": 1.8874866465500856e-05, + "loss": 0.0496, + "step": 5687 + }, + { + "epoch": 2.313135420902806, + "grad_norm": 8.446062907169322, + "learning_rate": 1.8874399508953266e-05, + "loss": 0.1847, + "step": 5688 + }, + { + "epoch": 2.3135420902806016, + "grad_norm": 13.316816038548618, + "learning_rate": 1.8873932461305497e-05, + "loss": 0.6392, + "step": 5689 + }, + { + "epoch": 2.313948759658398, + "grad_norm": 8.096661824066745, + "learning_rate": 1.8873465322562346e-05, + "loss": 0.1874, + "step": 5690 + }, + { + "epoch": 2.3143554290361936, + "grad_norm": 14.042554078061228, + "learning_rate": 1.8872998092728608e-05, + "loss": 1.3143, + "step": 5691 + }, + { + "epoch": 2.3147620984139894, + "grad_norm": 11.259763777617728, + "learning_rate": 1.8872530771809077e-05, + "loss": 0.507, + "step": 5692 + }, + { + "epoch": 2.315168767791785, + "grad_norm": 5.911056693244928, + "learning_rate": 1.8872063359808558e-05, + "loss": 0.2137, + "step": 5693 + }, + { + "epoch": 2.3155754371695814, + "grad_norm": 0.21265784048451938, + "learning_rate": 1.887159585673184e-05, + "loss": 0.0033, + "step": 5694 + }, + { + "epoch": 2.315982106547377, + "grad_norm": 0.8107181183621305, + "learning_rate": 1.8871128262583723e-05, + "loss": 0.0169, + "step": 5695 + }, + { + "epoch": 2.316388775925173, + "grad_norm": 9.847950872132099, + "learning_rate": 1.887066057736901e-05, + "loss": 0.7048, + "step": 5696 + }, + { + "epoch": 2.3167954453029687, + "grad_norm": 52.05803805381196, + "learning_rate": 1.8870192801092505e-05, + "loss": 0.8844, + "step": 5697 + }, + { + "epoch": 2.3172021146807644, + "grad_norm": 7.64650927027429, + "learning_rate": 1.8869724933759003e-05, + "loss": 0.1368, + "step": 5698 + }, + { + "epoch": 2.31760878405856, + "grad_norm": 2.9180652977537997, + "learning_rate": 1.8869256975373315e-05, + "loss": 0.0687, + "step": 5699 + }, + { + "epoch": 2.3180154534363564, + "grad_norm": 1.6171265357664142, + "learning_rate": 1.8868788925940235e-05, + "loss": 0.024, + "step": 5700 + }, + { + "epoch": 2.318422122814152, + "grad_norm": 11.088999816799724, + "learning_rate": 1.8868320785464577e-05, + "loss": 0.6201, + "step": 5701 + }, + { + "epoch": 2.318828792191948, + "grad_norm": 
9.648786750063225, + "learning_rate": 1.8867852553951143e-05, + "loss": 0.6348, + "step": 5702 + }, + { + "epoch": 2.3192354615697437, + "grad_norm": 20.28220248447069, + "learning_rate": 1.8867384231404737e-05, + "loss": 0.7869, + "step": 5703 + }, + { + "epoch": 2.3196421309475395, + "grad_norm": 7.416355750477801, + "learning_rate": 1.886691581783017e-05, + "loss": 0.2363, + "step": 5704 + }, + { + "epoch": 2.3200488003253357, + "grad_norm": 7.012401194579128, + "learning_rate": 1.886644731323225e-05, + "loss": 0.3509, + "step": 5705 + }, + { + "epoch": 2.3204554697031314, + "grad_norm": 1.2507428789051276, + "learning_rate": 1.8865978717615788e-05, + "loss": 0.0193, + "step": 5706 + }, + { + "epoch": 2.320862139080927, + "grad_norm": 3.8779214223938365, + "learning_rate": 1.8865510030985588e-05, + "loss": 0.0266, + "step": 5707 + }, + { + "epoch": 2.321268808458723, + "grad_norm": 8.732725677256035, + "learning_rate": 1.886504125334647e-05, + "loss": 0.427, + "step": 5708 + }, + { + "epoch": 2.3216754778365187, + "grad_norm": 19.476323522401493, + "learning_rate": 1.8864572384703238e-05, + "loss": 0.3215, + "step": 5709 + }, + { + "epoch": 2.322082147214315, + "grad_norm": 5.111224705863717, + "learning_rate": 1.886410342506071e-05, + "loss": 0.1846, + "step": 5710 + }, + { + "epoch": 2.3224888165921107, + "grad_norm": 13.292684970780988, + "learning_rate": 1.8863634374423697e-05, + "loss": 0.5697, + "step": 5711 + }, + { + "epoch": 2.3228954859699065, + "grad_norm": 9.790960379839795, + "learning_rate": 1.886316523279702e-05, + "loss": 0.3256, + "step": 5712 + }, + { + "epoch": 2.3233021553477022, + "grad_norm": 6.529698906275175, + "learning_rate": 1.886269600018549e-05, + "loss": 0.2216, + "step": 5713 + }, + { + "epoch": 2.323708824725498, + "grad_norm": 2.8283929179266005, + "learning_rate": 1.8862226676593924e-05, + "loss": 0.0441, + "step": 5714 + }, + { + "epoch": 2.324115494103294, + "grad_norm": 15.89217001512987, + "learning_rate": 1.8861757262027144e-05, + "loss": 0.4507, + "step": 5715 + }, + { + "epoch": 2.32452216348109, + "grad_norm": 10.020448354061926, + "learning_rate": 1.886128775648996e-05, + "loss": 0.6579, + "step": 5716 + }, + { + "epoch": 2.3249288328588857, + "grad_norm": 8.233733846499332, + "learning_rate": 1.8860818159987204e-05, + "loss": 0.261, + "step": 5717 + }, + { + "epoch": 2.3253355022366815, + "grad_norm": 8.098392923272968, + "learning_rate": 1.8860348472523687e-05, + "loss": 0.2545, + "step": 5718 + }, + { + "epoch": 2.3257421716144773, + "grad_norm": 0.21957833201464305, + "learning_rate": 1.8859878694104234e-05, + "loss": 0.0051, + "step": 5719 + }, + { + "epoch": 2.326148840992273, + "grad_norm": 15.054810561916002, + "learning_rate": 1.8859408824733666e-05, + "loss": 0.4319, + "step": 5720 + }, + { + "epoch": 2.3265555103700692, + "grad_norm": 12.822056457603159, + "learning_rate": 1.885893886441681e-05, + "loss": 0.6075, + "step": 5721 + }, + { + "epoch": 2.326962179747865, + "grad_norm": 2.3373403926976573, + "learning_rate": 1.8858468813158485e-05, + "loss": 0.0214, + "step": 5722 + }, + { + "epoch": 2.3273688491256608, + "grad_norm": 15.244775231279437, + "learning_rate": 1.8857998670963518e-05, + "loss": 0.3917, + "step": 5723 + }, + { + "epoch": 2.3277755185034565, + "grad_norm": 26.42727083517571, + "learning_rate": 1.8857528437836738e-05, + "loss": 0.2563, + "step": 5724 + }, + { + "epoch": 2.3281821878812528, + "grad_norm": 9.1789500049831, + "learning_rate": 1.8857058113782974e-05, + "loss": 0.2909, + "step": 5725 + }, + { + 
"epoch": 2.3285888572590485, + "grad_norm": 11.066176620734105, + "learning_rate": 1.8856587698807046e-05, + "loss": 0.3222, + "step": 5726 + }, + { + "epoch": 2.3289955266368443, + "grad_norm": 1.545703525865285, + "learning_rate": 1.885611719291379e-05, + "loss": 0.0078, + "step": 5727 + }, + { + "epoch": 2.32940219601464, + "grad_norm": 14.347039293772134, + "learning_rate": 1.8855646596108034e-05, + "loss": 0.6391, + "step": 5728 + }, + { + "epoch": 2.329808865392436, + "grad_norm": 2.858293904232988, + "learning_rate": 1.885517590839461e-05, + "loss": 0.0471, + "step": 5729 + }, + { + "epoch": 2.3302155347702316, + "grad_norm": 4.117988831717042, + "learning_rate": 1.8854705129778347e-05, + "loss": 0.085, + "step": 5730 + }, + { + "epoch": 2.330622204148028, + "grad_norm": 5.953192081364454, + "learning_rate": 1.885423426026408e-05, + "loss": 0.1417, + "step": 5731 + }, + { + "epoch": 2.3310288735258236, + "grad_norm": 8.030544635619963, + "learning_rate": 1.885376329985664e-05, + "loss": 0.4476, + "step": 5732 + }, + { + "epoch": 2.3314355429036193, + "grad_norm": 2.7745193832483626, + "learning_rate": 1.885329224856087e-05, + "loss": 0.0522, + "step": 5733 + }, + { + "epoch": 2.331842212281415, + "grad_norm": 26.57214555751784, + "learning_rate": 1.8852821106381593e-05, + "loss": 0.8525, + "step": 5734 + }, + { + "epoch": 2.3322488816592113, + "grad_norm": 9.168796916568922, + "learning_rate": 1.8852349873323654e-05, + "loss": 0.3132, + "step": 5735 + }, + { + "epoch": 2.332655551037007, + "grad_norm": 9.400024680182277, + "learning_rate": 1.8851878549391886e-05, + "loss": 0.3491, + "step": 5736 + }, + { + "epoch": 2.333062220414803, + "grad_norm": 5.159789803153941, + "learning_rate": 1.885140713459113e-05, + "loss": 0.2784, + "step": 5737 + }, + { + "epoch": 2.3334688897925986, + "grad_norm": 10.955817947640218, + "learning_rate": 1.8850935628926227e-05, + "loss": 0.3907, + "step": 5738 + }, + { + "epoch": 2.3338755591703944, + "grad_norm": 4.295354935393116, + "learning_rate": 1.8850464032402016e-05, + "loss": 0.0928, + "step": 5739 + }, + { + "epoch": 2.33428222854819, + "grad_norm": 4.984063406273416, + "learning_rate": 1.884999234502334e-05, + "loss": 0.122, + "step": 5740 + }, + { + "epoch": 2.3346888979259863, + "grad_norm": 19.026797830874838, + "learning_rate": 1.8849520566795032e-05, + "loss": 0.3533, + "step": 5741 + }, + { + "epoch": 2.335095567303782, + "grad_norm": 14.338813779430486, + "learning_rate": 1.8849048697721943e-05, + "loss": 0.3518, + "step": 5742 + }, + { + "epoch": 2.335502236681578, + "grad_norm": 10.12916661914273, + "learning_rate": 1.8848576737808918e-05, + "loss": 0.3628, + "step": 5743 + }, + { + "epoch": 2.3359089060593736, + "grad_norm": 3.679421127198509, + "learning_rate": 1.8848104687060798e-05, + "loss": 0.0642, + "step": 5744 + }, + { + "epoch": 2.3363155754371694, + "grad_norm": 22.139426882380125, + "learning_rate": 1.8847632545482428e-05, + "loss": 0.6165, + "step": 5745 + }, + { + "epoch": 2.3367222448149656, + "grad_norm": 23.7178634386691, + "learning_rate": 1.884716031307866e-05, + "loss": 1.3724, + "step": 5746 + }, + { + "epoch": 2.3371289141927614, + "grad_norm": 4.284357843716141, + "learning_rate": 1.884668798985434e-05, + "loss": 0.1468, + "step": 5747 + }, + { + "epoch": 2.337535583570557, + "grad_norm": 4.314249891795574, + "learning_rate": 1.884621557581431e-05, + "loss": 0.1102, + "step": 5748 + }, + { + "epoch": 2.337942252948353, + "grad_norm": 35.54989178379672, + "learning_rate": 1.884574307096343e-05, + "loss": 0.2959, 
+ "step": 5749 + }, + { + "epoch": 2.3383489223261487, + "grad_norm": 6.259328190327469, + "learning_rate": 1.884527047530654e-05, + "loss": 0.2726, + "step": 5750 + }, + { + "epoch": 2.338755591703945, + "grad_norm": 7.413443982803756, + "learning_rate": 1.8844797788848504e-05, + "loss": 0.0793, + "step": 5751 + }, + { + "epoch": 2.3391622610817406, + "grad_norm": 17.85784272757013, + "learning_rate": 1.8844325011594163e-05, + "loss": 0.9854, + "step": 5752 + }, + { + "epoch": 2.3395689304595364, + "grad_norm": 7.1533721182020535, + "learning_rate": 1.8843852143548374e-05, + "loss": 0.2696, + "step": 5753 + }, + { + "epoch": 2.339975599837332, + "grad_norm": 19.591512033199983, + "learning_rate": 1.8843379184715994e-05, + "loss": 1.1889, + "step": 5754 + }, + { + "epoch": 2.340382269215128, + "grad_norm": 15.267442189265743, + "learning_rate": 1.8842906135101872e-05, + "loss": 0.3395, + "step": 5755 + }, + { + "epoch": 2.340788938592924, + "grad_norm": 6.674780214232735, + "learning_rate": 1.884243299471087e-05, + "loss": 0.0806, + "step": 5756 + }, + { + "epoch": 2.34119560797072, + "grad_norm": 13.236729671522399, + "learning_rate": 1.8841959763547843e-05, + "loss": 0.277, + "step": 5757 + }, + { + "epoch": 2.3416022773485157, + "grad_norm": 5.161262014331828, + "learning_rate": 1.884148644161765e-05, + "loss": 0.2819, + "step": 5758 + }, + { + "epoch": 2.3420089467263114, + "grad_norm": 9.449737814976954, + "learning_rate": 1.884101302892515e-05, + "loss": 0.4132, + "step": 5759 + }, + { + "epoch": 2.342415616104107, + "grad_norm": 15.433882091617082, + "learning_rate": 1.88405395254752e-05, + "loss": 1.332, + "step": 5760 + }, + { + "epoch": 2.342822285481903, + "grad_norm": 0.20403046249762907, + "learning_rate": 1.884006593127266e-05, + "loss": 0.0033, + "step": 5761 + }, + { + "epoch": 2.343228954859699, + "grad_norm": 7.949487629591247, + "learning_rate": 1.88395922463224e-05, + "loss": 0.2001, + "step": 5762 + }, + { + "epoch": 2.343635624237495, + "grad_norm": 6.385888593825424, + "learning_rate": 1.8839118470629273e-05, + "loss": 0.1727, + "step": 5763 + }, + { + "epoch": 2.3440422936152907, + "grad_norm": 13.252556833474527, + "learning_rate": 1.8838644604198146e-05, + "loss": 0.3357, + "step": 5764 + }, + { + "epoch": 2.3444489629930865, + "grad_norm": 9.590260200079506, + "learning_rate": 1.8838170647033886e-05, + "loss": 0.3165, + "step": 5765 + }, + { + "epoch": 2.3448556323708827, + "grad_norm": 10.758656998788382, + "learning_rate": 1.8837696599141352e-05, + "loss": 0.1672, + "step": 5766 + }, + { + "epoch": 2.3452623017486784, + "grad_norm": 3.2402624268579316, + "learning_rate": 1.8837222460525417e-05, + "loss": 0.0502, + "step": 5767 + }, + { + "epoch": 2.345668971126474, + "grad_norm": 3.716759434649472, + "learning_rate": 1.8836748231190948e-05, + "loss": 0.0614, + "step": 5768 + }, + { + "epoch": 2.34607564050427, + "grad_norm": 8.687511114061488, + "learning_rate": 1.883627391114281e-05, + "loss": 0.3208, + "step": 5769 + }, + { + "epoch": 2.3464823098820657, + "grad_norm": 1.5507006748415169, + "learning_rate": 1.8835799500385866e-05, + "loss": 0.0324, + "step": 5770 + }, + { + "epoch": 2.3468889792598615, + "grad_norm": 4.208579674191887, + "learning_rate": 1.8835324998925e-05, + "loss": 0.091, + "step": 5771 + }, + { + "epoch": 2.3472956486376577, + "grad_norm": 9.739950010486986, + "learning_rate": 1.8834850406765074e-05, + "loss": 0.2975, + "step": 5772 + }, + { + "epoch": 2.3477023180154535, + "grad_norm": 16.728648983545746, + "learning_rate": 
1.8834375723910968e-05, + "loss": 0.2751, + "step": 5773 + }, + { + "epoch": 2.3481089873932492, + "grad_norm": 6.4205527845689945, + "learning_rate": 1.883390095036754e-05, + "loss": 0.1923, + "step": 5774 + }, + { + "epoch": 2.348515656771045, + "grad_norm": 15.091738564497232, + "learning_rate": 1.883342608613968e-05, + "loss": 0.332, + "step": 5775 + }, + { + "epoch": 2.348922326148841, + "grad_norm": 32.10414926126763, + "learning_rate": 1.883295113123225e-05, + "loss": 0.5868, + "step": 5776 + }, + { + "epoch": 2.349328995526637, + "grad_norm": 12.937681231116228, + "learning_rate": 1.8832476085650136e-05, + "loss": 0.2548, + "step": 5777 + }, + { + "epoch": 2.3497356649044328, + "grad_norm": 8.998117779898894, + "learning_rate": 1.8832000949398206e-05, + "loss": 0.1986, + "step": 5778 + }, + { + "epoch": 2.3501423342822285, + "grad_norm": 11.153585064912688, + "learning_rate": 1.8831525722481343e-05, + "loss": 0.4148, + "step": 5779 + }, + { + "epoch": 2.3505490036600243, + "grad_norm": 9.39493039833692, + "learning_rate": 1.8831050404904422e-05, + "loss": 0.2426, + "step": 5780 + }, + { + "epoch": 2.35095567303782, + "grad_norm": 11.056798430503333, + "learning_rate": 1.8830574996672327e-05, + "loss": 0.5272, + "step": 5781 + }, + { + "epoch": 2.3513623424156163, + "grad_norm": 8.016114287597956, + "learning_rate": 1.883009949778993e-05, + "loss": 0.3352, + "step": 5782 + }, + { + "epoch": 2.351769011793412, + "grad_norm": 1.7842766112769137, + "learning_rate": 1.882962390826212e-05, + "loss": 0.029, + "step": 5783 + }, + { + "epoch": 2.352175681171208, + "grad_norm": 1.335747231613395, + "learning_rate": 1.8829148228093783e-05, + "loss": 0.0227, + "step": 5784 + }, + { + "epoch": 2.3525823505490036, + "grad_norm": 4.031154600424336, + "learning_rate": 1.8828672457289786e-05, + "loss": 0.0628, + "step": 5785 + }, + { + "epoch": 2.3529890199267993, + "grad_norm": 21.179592626825976, + "learning_rate": 1.882819659585503e-05, + "loss": 0.7919, + "step": 5786 + }, + { + "epoch": 2.3533956893045955, + "grad_norm": 30.847848338031085, + "learning_rate": 1.882772064379439e-05, + "loss": 0.8873, + "step": 5787 + }, + { + "epoch": 2.3538023586823913, + "grad_norm": 13.42487277018329, + "learning_rate": 1.8827244601112752e-05, + "loss": 0.9573, + "step": 5788 + }, + { + "epoch": 2.354209028060187, + "grad_norm": 16.026921966352486, + "learning_rate": 1.8826768467815013e-05, + "loss": 0.7445, + "step": 5789 + }, + { + "epoch": 2.354615697437983, + "grad_norm": 4.521765460068406, + "learning_rate": 1.8826292243906045e-05, + "loss": 0.2409, + "step": 5790 + }, + { + "epoch": 2.3550223668157786, + "grad_norm": 8.053140152186113, + "learning_rate": 1.882581592939075e-05, + "loss": 0.381, + "step": 5791 + }, + { + "epoch": 2.355429036193575, + "grad_norm": 8.277147675118597, + "learning_rate": 1.8825339524274008e-05, + "loss": 0.3371, + "step": 5792 + }, + { + "epoch": 2.3558357055713706, + "grad_norm": 9.452767509999195, + "learning_rate": 1.8824863028560716e-05, + "loss": 0.4938, + "step": 5793 + }, + { + "epoch": 2.3562423749491663, + "grad_norm": 2.8856974138277622, + "learning_rate": 1.8824386442255767e-05, + "loss": 0.0532, + "step": 5794 + }, + { + "epoch": 2.356649044326962, + "grad_norm": 8.695140639527784, + "learning_rate": 1.882390976536405e-05, + "loss": 0.3434, + "step": 5795 + }, + { + "epoch": 2.357055713704758, + "grad_norm": 7.162881179202543, + "learning_rate": 1.8823432997890455e-05, + "loss": 0.1505, + "step": 5796 + }, + { + "epoch": 2.357462383082554, + "grad_norm": 
5.9313018391475705, + "learning_rate": 1.882295613983988e-05, + "loss": 0.4319, + "step": 5797 + }, + { + "epoch": 2.35786905246035, + "grad_norm": 5.11253017697476, + "learning_rate": 1.882247919121722e-05, + "loss": 0.1069, + "step": 5798 + }, + { + "epoch": 2.3582757218381456, + "grad_norm": 6.618857179182189, + "learning_rate": 1.882200215202737e-05, + "loss": 0.1889, + "step": 5799 + }, + { + "epoch": 2.3586823912159414, + "grad_norm": 9.956588214509367, + "learning_rate": 1.882152502227523e-05, + "loss": 0.6791, + "step": 5800 + }, + { + "epoch": 2.359089060593737, + "grad_norm": 1.368350285496294, + "learning_rate": 1.8821047801965692e-05, + "loss": 0.0301, + "step": 5801 + }, + { + "epoch": 2.359495729971533, + "grad_norm": 25.815133840771605, + "learning_rate": 1.8820570491103664e-05, + "loss": 0.9103, + "step": 5802 + }, + { + "epoch": 2.359902399349329, + "grad_norm": 24.789878341277028, + "learning_rate": 1.8820093089694037e-05, + "loss": 0.636, + "step": 5803 + }, + { + "epoch": 2.360309068727125, + "grad_norm": 10.835839216694874, + "learning_rate": 1.8819615597741714e-05, + "loss": 0.284, + "step": 5804 + }, + { + "epoch": 2.3607157381049206, + "grad_norm": 11.07129247123112, + "learning_rate": 1.8819138015251602e-05, + "loss": 0.3056, + "step": 5805 + }, + { + "epoch": 2.3611224074827164, + "grad_norm": 15.59974978526934, + "learning_rate": 1.88186603422286e-05, + "loss": 0.7364, + "step": 5806 + }, + { + "epoch": 2.3615290768605126, + "grad_norm": 9.64091111775638, + "learning_rate": 1.881818257867761e-05, + "loss": 0.6115, + "step": 5807 + }, + { + "epoch": 2.3619357462383084, + "grad_norm": 7.111407217517332, + "learning_rate": 1.8817704724603536e-05, + "loss": 0.209, + "step": 5808 + }, + { + "epoch": 2.362342415616104, + "grad_norm": 12.972485938184734, + "learning_rate": 1.8817226780011287e-05, + "loss": 0.5312, + "step": 5809 + }, + { + "epoch": 2.3627490849939, + "grad_norm": 9.235428754761841, + "learning_rate": 1.8816748744905765e-05, + "loss": 0.558, + "step": 5810 + }, + { + "epoch": 2.3631557543716957, + "grad_norm": 10.896183263366455, + "learning_rate": 1.8816270619291882e-05, + "loss": 0.6473, + "step": 5811 + }, + { + "epoch": 2.3635624237494914, + "grad_norm": 15.844688433350003, + "learning_rate": 1.8815792403174546e-05, + "loss": 0.7268, + "step": 5812 + }, + { + "epoch": 2.3639690931272876, + "grad_norm": 12.527395165266244, + "learning_rate": 1.881531409655866e-05, + "loss": 0.5214, + "step": 5813 + }, + { + "epoch": 2.3643757625050834, + "grad_norm": 8.071169174882451, + "learning_rate": 1.881483569944914e-05, + "loss": 0.2245, + "step": 5814 + }, + { + "epoch": 2.364782431882879, + "grad_norm": 9.853908336902384, + "learning_rate": 1.8814357211850892e-05, + "loss": 0.2943, + "step": 5815 + }, + { + "epoch": 2.365189101260675, + "grad_norm": 4.925145845247614, + "learning_rate": 1.8813878633768838e-05, + "loss": 0.1034, + "step": 5816 + }, + { + "epoch": 2.365595770638471, + "grad_norm": 12.385151635113749, + "learning_rate": 1.881339996520788e-05, + "loss": 0.4505, + "step": 5817 + }, + { + "epoch": 2.366002440016267, + "grad_norm": 23.931700271560757, + "learning_rate": 1.8812921206172934e-05, + "loss": 0.7845, + "step": 5818 + }, + { + "epoch": 2.3664091093940627, + "grad_norm": 13.484196484927452, + "learning_rate": 1.8812442356668916e-05, + "loss": 0.478, + "step": 5819 + }, + { + "epoch": 2.3668157787718584, + "grad_norm": 5.588995877836172, + "learning_rate": 1.8811963416700746e-05, + "loss": 0.182, + "step": 5820 + }, + { + "epoch": 
2.367222448149654, + "grad_norm": 2.409459104424916, + "learning_rate": 1.8811484386273333e-05, + "loss": 0.0447, + "step": 5821 + }, + { + "epoch": 2.36762911752745, + "grad_norm": 10.183189604838516, + "learning_rate": 1.88110052653916e-05, + "loss": 0.6854, + "step": 5822 + }, + { + "epoch": 2.368035786905246, + "grad_norm": 7.709473555487696, + "learning_rate": 1.881052605406046e-05, + "loss": 0.1628, + "step": 5823 + }, + { + "epoch": 2.368442456283042, + "grad_norm": 8.987314418913156, + "learning_rate": 1.881004675228484e-05, + "loss": 0.596, + "step": 5824 + }, + { + "epoch": 2.3688491256608377, + "grad_norm": 31.087436178760797, + "learning_rate": 1.8809567360069657e-05, + "loss": 0.442, + "step": 5825 + }, + { + "epoch": 2.3692557950386335, + "grad_norm": 24.101046448319092, + "learning_rate": 1.8809087877419828e-05, + "loss": 1.1665, + "step": 5826 + }, + { + "epoch": 2.3696624644164297, + "grad_norm": 3.8435684590153762, + "learning_rate": 1.880860830434028e-05, + "loss": 0.1839, + "step": 5827 + }, + { + "epoch": 2.3700691337942255, + "grad_norm": 14.40746481394693, + "learning_rate": 1.8808128640835933e-05, + "loss": 1.0, + "step": 5828 + }, + { + "epoch": 2.3704758031720212, + "grad_norm": 3.3503697123723515, + "learning_rate": 1.8807648886911716e-05, + "loss": 0.0996, + "step": 5829 + }, + { + "epoch": 2.370882472549817, + "grad_norm": 0.29560357249017677, + "learning_rate": 1.8807169042572547e-05, + "loss": 0.005, + "step": 5830 + }, + { + "epoch": 2.3712891419276128, + "grad_norm": 6.122069147903665, + "learning_rate": 1.8806689107823354e-05, + "loss": 0.241, + "step": 5831 + }, + { + "epoch": 2.3716958113054085, + "grad_norm": 20.12032555539501, + "learning_rate": 1.8806209082669068e-05, + "loss": 1.3545, + "step": 5832 + }, + { + "epoch": 2.3721024806832047, + "grad_norm": 8.317829250379361, + "learning_rate": 1.8805728967114615e-05, + "loss": 0.334, + "step": 5833 + }, + { + "epoch": 2.3725091500610005, + "grad_norm": 4.542223927317676, + "learning_rate": 1.8805248761164917e-05, + "loss": 0.054, + "step": 5834 + }, + { + "epoch": 2.3729158194387963, + "grad_norm": 10.633026740102384, + "learning_rate": 1.8804768464824914e-05, + "loss": 0.3652, + "step": 5835 + }, + { + "epoch": 2.373322488816592, + "grad_norm": 1.9158608138099267, + "learning_rate": 1.880428807809953e-05, + "loss": 0.0407, + "step": 5836 + }, + { + "epoch": 2.373729158194388, + "grad_norm": 9.778424159681192, + "learning_rate": 1.8803807600993695e-05, + "loss": 0.4095, + "step": 5837 + }, + { + "epoch": 2.374135827572184, + "grad_norm": 4.9116855810607705, + "learning_rate": 1.8803327033512348e-05, + "loss": 0.1438, + "step": 5838 + }, + { + "epoch": 2.3745424969499798, + "grad_norm": 11.993052255992886, + "learning_rate": 1.8802846375660415e-05, + "loss": 0.406, + "step": 5839 + }, + { + "epoch": 2.3749491663277755, + "grad_norm": 9.938544904832412, + "learning_rate": 1.880236562744284e-05, + "loss": 0.3324, + "step": 5840 + }, + { + "epoch": 2.3753558357055713, + "grad_norm": 16.95269866266649, + "learning_rate": 1.8801884788864545e-05, + "loss": 0.6317, + "step": 5841 + }, + { + "epoch": 2.375762505083367, + "grad_norm": 5.785200118208292, + "learning_rate": 1.8801403859930475e-05, + "loss": 0.11, + "step": 5842 + }, + { + "epoch": 2.376169174461163, + "grad_norm": 10.400977507182482, + "learning_rate": 1.8800922840645565e-05, + "loss": 0.4982, + "step": 5843 + }, + { + "epoch": 2.376575843838959, + "grad_norm": 8.41028720341649, + "learning_rate": 1.880044173101475e-05, + "loss": 0.3682, + "step": 
5844 + }, + { + "epoch": 2.376982513216755, + "grad_norm": 9.890155174437785, + "learning_rate": 1.8799960531042973e-05, + "loss": 0.3335, + "step": 5845 + }, + { + "epoch": 2.3773891825945506, + "grad_norm": 3.5953679932911524, + "learning_rate": 1.879947924073517e-05, + "loss": 0.0815, + "step": 5846 + }, + { + "epoch": 2.3777958519723463, + "grad_norm": 16.91286295718436, + "learning_rate": 1.879899786009629e-05, + "loss": 0.7447, + "step": 5847 + }, + { + "epoch": 2.3782025213501425, + "grad_norm": 5.363624357929917, + "learning_rate": 1.8798516389131266e-05, + "loss": 0.2494, + "step": 5848 + }, + { + "epoch": 2.3786091907279383, + "grad_norm": 1.0308745355463207, + "learning_rate": 1.879803482784504e-05, + "loss": 0.0239, + "step": 5849 + }, + { + "epoch": 2.379015860105734, + "grad_norm": 2.96946345257128, + "learning_rate": 1.879755317624256e-05, + "loss": 0.0433, + "step": 5850 + }, + { + "epoch": 2.37942252948353, + "grad_norm": 0.8048691412777599, + "learning_rate": 1.8797071434328772e-05, + "loss": 0.0174, + "step": 5851 + }, + { + "epoch": 2.3798291988613256, + "grad_norm": 14.075425057933742, + "learning_rate": 1.8796589602108612e-05, + "loss": 0.5803, + "step": 5852 + }, + { + "epoch": 2.3802358682391214, + "grad_norm": 8.599317298129355, + "learning_rate": 1.879610767958704e-05, + "loss": 0.3291, + "step": 5853 + }, + { + "epoch": 2.3806425376169176, + "grad_norm": 8.468635688791931, + "learning_rate": 1.879562566676899e-05, + "loss": 0.3515, + "step": 5854 + }, + { + "epoch": 2.3810492069947133, + "grad_norm": 11.804557721000858, + "learning_rate": 1.8795143563659416e-05, + "loss": 0.6037, + "step": 5855 + }, + { + "epoch": 2.381455876372509, + "grad_norm": 9.79487402763487, + "learning_rate": 1.8794661370263273e-05, + "loss": 0.4046, + "step": 5856 + }, + { + "epoch": 2.381862545750305, + "grad_norm": 5.281882812639852, + "learning_rate": 1.8794179086585496e-05, + "loss": 0.0856, + "step": 5857 + }, + { + "epoch": 2.382269215128101, + "grad_norm": 13.953955172474274, + "learning_rate": 1.8793696712631053e-05, + "loss": 0.6392, + "step": 5858 + }, + { + "epoch": 2.382675884505897, + "grad_norm": 1.6733010360237046, + "learning_rate": 1.8793214248404883e-05, + "loss": 0.0288, + "step": 5859 + }, + { + "epoch": 2.3830825538836926, + "grad_norm": 3.2643692937894833, + "learning_rate": 1.8792731693911944e-05, + "loss": 0.0459, + "step": 5860 + }, + { + "epoch": 2.3834892232614884, + "grad_norm": 8.670770817357315, + "learning_rate": 1.8792249049157188e-05, + "loss": 0.3106, + "step": 5861 + }, + { + "epoch": 2.383895892639284, + "grad_norm": 4.131985047457676, + "learning_rate": 1.879176631414557e-05, + "loss": 0.0564, + "step": 5862 + }, + { + "epoch": 2.38430256201708, + "grad_norm": 9.505082878452209, + "learning_rate": 1.8791283488882048e-05, + "loss": 0.2686, + "step": 5863 + }, + { + "epoch": 2.384709231394876, + "grad_norm": 9.26351399400758, + "learning_rate": 1.8790800573371574e-05, + "loss": 0.1848, + "step": 5864 + }, + { + "epoch": 2.385115900772672, + "grad_norm": 3.9795549936253023, + "learning_rate": 1.879031756761911e-05, + "loss": 0.0826, + "step": 5865 + }, + { + "epoch": 2.3855225701504676, + "grad_norm": 6.351936533866193, + "learning_rate": 1.8789834471629613e-05, + "loss": 0.2057, + "step": 5866 + }, + { + "epoch": 2.3859292395282634, + "grad_norm": 22.71833392285156, + "learning_rate": 1.878935128540804e-05, + "loss": 2.1742, + "step": 5867 + }, + { + "epoch": 2.3863359089060596, + "grad_norm": 0.8595299479345117, + "learning_rate": 
1.878886800895935e-05, + "loss": 0.0136, + "step": 5868 + }, + { + "epoch": 2.3867425782838554, + "grad_norm": 9.450328354156728, + "learning_rate": 1.878838464228851e-05, + "loss": 0.7224, + "step": 5869 + }, + { + "epoch": 2.387149247661651, + "grad_norm": 3.6064893768649293, + "learning_rate": 1.8787901185400475e-05, + "loss": 0.0379, + "step": 5870 + }, + { + "epoch": 2.387555917039447, + "grad_norm": 8.343735269826395, + "learning_rate": 1.8787417638300216e-05, + "loss": 0.1751, + "step": 5871 + }, + { + "epoch": 2.3879625864172427, + "grad_norm": 1.1554563512112312, + "learning_rate": 1.878693400099269e-05, + "loss": 0.0186, + "step": 5872 + }, + { + "epoch": 2.3883692557950384, + "grad_norm": 5.518880213808401, + "learning_rate": 1.8786450273482863e-05, + "loss": 0.1637, + "step": 5873 + }, + { + "epoch": 2.3887759251728347, + "grad_norm": 2.9750760122798794, + "learning_rate": 1.8785966455775703e-05, + "loss": 0.0596, + "step": 5874 + }, + { + "epoch": 2.3891825945506304, + "grad_norm": 9.868107976192528, + "learning_rate": 1.8785482547876176e-05, + "loss": 0.4299, + "step": 5875 + }, + { + "epoch": 2.389589263928426, + "grad_norm": 7.0374597854874095, + "learning_rate": 1.8784998549789247e-05, + "loss": 0.2014, + "step": 5876 + }, + { + "epoch": 2.389995933306222, + "grad_norm": 6.335728059560243, + "learning_rate": 1.878451446151989e-05, + "loss": 0.1332, + "step": 5877 + }, + { + "epoch": 2.3904026026840177, + "grad_norm": 4.602099162411541, + "learning_rate": 1.878403028307307e-05, + "loss": 0.1068, + "step": 5878 + }, + { + "epoch": 2.390809272061814, + "grad_norm": 5.818909782183589, + "learning_rate": 1.8783546014453756e-05, + "loss": 0.1741, + "step": 5879 + }, + { + "epoch": 2.3912159414396097, + "grad_norm": 5.0324072833602145, + "learning_rate": 1.8783061655666923e-05, + "loss": 0.1083, + "step": 5880 + }, + { + "epoch": 2.3916226108174055, + "grad_norm": 5.701072398952182, + "learning_rate": 1.878257720671754e-05, + "loss": 0.1205, + "step": 5881 + }, + { + "epoch": 2.3920292801952012, + "grad_norm": 14.98327970777828, + "learning_rate": 1.8782092667610582e-05, + "loss": 0.5564, + "step": 5882 + }, + { + "epoch": 2.392435949572997, + "grad_norm": 30.918690810745474, + "learning_rate": 1.8781608038351024e-05, + "loss": 1.159, + "step": 5883 + }, + { + "epoch": 2.392842618950793, + "grad_norm": 5.573628900037378, + "learning_rate": 1.8781123318943838e-05, + "loss": 0.1513, + "step": 5884 + }, + { + "epoch": 2.393249288328589, + "grad_norm": 8.843688971952126, + "learning_rate": 1.8780638509394005e-05, + "loss": 0.5176, + "step": 5885 + }, + { + "epoch": 2.3936559577063847, + "grad_norm": 23.16623195890991, + "learning_rate": 1.87801536097065e-05, + "loss": 0.7243, + "step": 5886 + }, + { + "epoch": 2.3940626270841805, + "grad_norm": 9.653938298566967, + "learning_rate": 1.8779668619886292e-05, + "loss": 0.1644, + "step": 5887 + }, + { + "epoch": 2.3944692964619763, + "grad_norm": 11.922910154156348, + "learning_rate": 1.8779183539938372e-05, + "loss": 0.4095, + "step": 5888 + }, + { + "epoch": 2.3948759658397725, + "grad_norm": 12.574376382021914, + "learning_rate": 1.8778698369867712e-05, + "loss": 0.7986, + "step": 5889 + }, + { + "epoch": 2.3952826352175682, + "grad_norm": 11.99225334256577, + "learning_rate": 1.8778213109679296e-05, + "loss": 0.4964, + "step": 5890 + }, + { + "epoch": 2.395689304595364, + "grad_norm": 3.7542015073319908, + "learning_rate": 1.8777727759378107e-05, + "loss": 0.0523, + "step": 5891 + }, + { + "epoch": 2.3960959739731598, + "grad_norm": 
13.46921622112183, + "learning_rate": 1.8777242318969123e-05, + "loss": 0.2847, + "step": 5892 + }, + { + "epoch": 2.3965026433509555, + "grad_norm": 6.088913300174418, + "learning_rate": 1.8776756788457328e-05, + "loss": 0.114, + "step": 5893 + }, + { + "epoch": 2.3969093127287513, + "grad_norm": 13.590704025340647, + "learning_rate": 1.877627116784771e-05, + "loss": 0.8821, + "step": 5894 + }, + { + "epoch": 2.3973159821065475, + "grad_norm": 3.495025932646753, + "learning_rate": 1.877578545714525e-05, + "loss": 0.0402, + "step": 5895 + }, + { + "epoch": 2.3977226514843433, + "grad_norm": 6.212532220646325, + "learning_rate": 1.8775299656354935e-05, + "loss": 0.3002, + "step": 5896 + }, + { + "epoch": 2.398129320862139, + "grad_norm": 10.877316823657655, + "learning_rate": 1.877481376548175e-05, + "loss": 0.3075, + "step": 5897 + }, + { + "epoch": 2.398535990239935, + "grad_norm": 0.6426785897548452, + "learning_rate": 1.877432778453069e-05, + "loss": 0.0094, + "step": 5898 + }, + { + "epoch": 2.398942659617731, + "grad_norm": 4.615094579887169, + "learning_rate": 1.8773841713506736e-05, + "loss": 0.0517, + "step": 5899 + }, + { + "epoch": 2.3993493289955268, + "grad_norm": 18.384608319228686, + "learning_rate": 1.8773355552414884e-05, + "loss": 1.14, + "step": 5900 + }, + { + "epoch": 2.3997559983733225, + "grad_norm": 14.208593369151812, + "learning_rate": 1.877286930126012e-05, + "loss": 0.723, + "step": 5901 + }, + { + "epoch": 2.4001626677511183, + "grad_norm": 17.830969592064303, + "learning_rate": 1.8772382960047438e-05, + "loss": 0.727, + "step": 5902 + }, + { + "epoch": 2.400569337128914, + "grad_norm": 2.564276067126322, + "learning_rate": 1.8771896528781827e-05, + "loss": 0.0544, + "step": 5903 + }, + { + "epoch": 2.40097600650671, + "grad_norm": 15.540933187145521, + "learning_rate": 1.8771410007468288e-05, + "loss": 0.635, + "step": 5904 + }, + { + "epoch": 2.401382675884506, + "grad_norm": 12.081585584114466, + "learning_rate": 1.8770923396111807e-05, + "loss": 0.572, + "step": 5905 + }, + { + "epoch": 2.401789345262302, + "grad_norm": 5.6179560320871795, + "learning_rate": 1.8770436694717383e-05, + "loss": 0.1892, + "step": 5906 + }, + { + "epoch": 2.4021960146400976, + "grad_norm": 17.969466178760964, + "learning_rate": 1.8769949903290015e-05, + "loss": 1.2259, + "step": 5907 + }, + { + "epoch": 2.4026026840178933, + "grad_norm": 5.3037958327993255, + "learning_rate": 1.8769463021834696e-05, + "loss": 0.1416, + "step": 5908 + }, + { + "epoch": 2.4030093533956896, + "grad_norm": 9.50758116399239, + "learning_rate": 1.8768976050356428e-05, + "loss": 0.2961, + "step": 5909 + }, + { + "epoch": 2.4034160227734853, + "grad_norm": 6.5304264138144985, + "learning_rate": 1.8768488988860204e-05, + "loss": 0.1835, + "step": 5910 + }, + { + "epoch": 2.403822692151281, + "grad_norm": 6.0289432675189865, + "learning_rate": 1.8768001837351025e-05, + "loss": 0.1194, + "step": 5911 + }, + { + "epoch": 2.404229361529077, + "grad_norm": 14.084703811253629, + "learning_rate": 1.87675145958339e-05, + "loss": 0.7322, + "step": 5912 + }, + { + "epoch": 2.4046360309068726, + "grad_norm": 21.52245939233898, + "learning_rate": 1.876702726431382e-05, + "loss": 0.9439, + "step": 5913 + }, + { + "epoch": 2.4050427002846684, + "grad_norm": 9.683268154190154, + "learning_rate": 1.8766539842795798e-05, + "loss": 0.2387, + "step": 5914 + }, + { + "epoch": 2.4054493696624646, + "grad_norm": 9.619088935422711, + "learning_rate": 1.876605233128483e-05, + "loss": 0.475, + "step": 5915 + }, + { + "epoch": 
2.4058560390402604, + "grad_norm": 1.111670153095876, + "learning_rate": 1.8765564729785922e-05, + "loss": 0.0175, + "step": 5916 + }, + { + "epoch": 2.406262708418056, + "grad_norm": 5.130404850006511, + "learning_rate": 1.8765077038304085e-05, + "loss": 0.0877, + "step": 5917 + }, + { + "epoch": 2.406669377795852, + "grad_norm": 13.616388728440596, + "learning_rate": 1.8764589256844313e-05, + "loss": 0.5204, + "step": 5918 + }, + { + "epoch": 2.4070760471736476, + "grad_norm": 13.854699904604978, + "learning_rate": 1.8764101385411627e-05, + "loss": 0.9176, + "step": 5919 + }, + { + "epoch": 2.407482716551444, + "grad_norm": 1.0929227310244773, + "learning_rate": 1.876361342401103e-05, + "loss": 0.0323, + "step": 5920 + }, + { + "epoch": 2.4078893859292396, + "grad_norm": 9.605169777441821, + "learning_rate": 1.876312537264753e-05, + "loss": 0.4412, + "step": 5921 + }, + { + "epoch": 2.4082960553070354, + "grad_norm": 0.5738315890373938, + "learning_rate": 1.8762637231326135e-05, + "loss": 0.0075, + "step": 5922 + }, + { + "epoch": 2.408702724684831, + "grad_norm": 0.9237682564636365, + "learning_rate": 1.8762149000051857e-05, + "loss": 0.0158, + "step": 5923 + }, + { + "epoch": 2.409109394062627, + "grad_norm": 8.319324734422, + "learning_rate": 1.8761660678829715e-05, + "loss": 0.1341, + "step": 5924 + }, + { + "epoch": 2.409516063440423, + "grad_norm": 10.985310233539701, + "learning_rate": 1.876117226766471e-05, + "loss": 0.324, + "step": 5925 + }, + { + "epoch": 2.409922732818219, + "grad_norm": 7.9514359158833186, + "learning_rate": 1.8760683766561868e-05, + "loss": 0.1812, + "step": 5926 + }, + { + "epoch": 2.4103294021960147, + "grad_norm": 4.767975032595463, + "learning_rate": 1.8760195175526196e-05, + "loss": 0.1662, + "step": 5927 + }, + { + "epoch": 2.4107360715738104, + "grad_norm": 7.493668522142777, + "learning_rate": 1.8759706494562713e-05, + "loss": 0.2816, + "step": 5928 + }, + { + "epoch": 2.411142740951606, + "grad_norm": 12.379084540822909, + "learning_rate": 1.875921772367643e-05, + "loss": 0.4611, + "step": 5929 + }, + { + "epoch": 2.4115494103294024, + "grad_norm": 7.305561442802407, + "learning_rate": 1.8758728862872372e-05, + "loss": 0.185, + "step": 5930 + }, + { + "epoch": 2.411956079707198, + "grad_norm": 9.304451329036993, + "learning_rate": 1.8758239912155554e-05, + "loss": 0.2112, + "step": 5931 + }, + { + "epoch": 2.412362749084994, + "grad_norm": 8.777569117600411, + "learning_rate": 1.8757750871530995e-05, + "loss": 0.2158, + "step": 5932 + }, + { + "epoch": 2.4127694184627897, + "grad_norm": 2.369898961073032, + "learning_rate": 1.8757261741003715e-05, + "loss": 0.0315, + "step": 5933 + }, + { + "epoch": 2.4131760878405855, + "grad_norm": 3.284982528697218, + "learning_rate": 1.8756772520578738e-05, + "loss": 0.0379, + "step": 5934 + }, + { + "epoch": 2.4135827572183812, + "grad_norm": 9.64350506711821, + "learning_rate": 1.875628321026108e-05, + "loss": 0.4566, + "step": 5935 + }, + { + "epoch": 2.4139894265961774, + "grad_norm": 8.600880998698234, + "learning_rate": 1.875579381005577e-05, + "loss": 0.4226, + "step": 5936 + }, + { + "epoch": 2.414396095973973, + "grad_norm": 0.8237314086576512, + "learning_rate": 1.875530431996783e-05, + "loss": 0.0106, + "step": 5937 + }, + { + "epoch": 2.414802765351769, + "grad_norm": 2.7300165028088164, + "learning_rate": 1.8754814740002288e-05, + "loss": 0.105, + "step": 5938 + }, + { + "epoch": 2.4152094347295647, + "grad_norm": 2.559306747328149, + "learning_rate": 1.875432507016416e-05, + "loss": 0.0369, + 
"step": 5939 + }, + { + "epoch": 2.415616104107361, + "grad_norm": 8.492832807513615, + "learning_rate": 1.8753835310458482e-05, + "loss": 0.2874, + "step": 5940 + }, + { + "epoch": 2.4160227734851567, + "grad_norm": 7.9115588142400695, + "learning_rate": 1.875334546089028e-05, + "loss": 0.5002, + "step": 5941 + }, + { + "epoch": 2.4164294428629525, + "grad_norm": 10.591559489384563, + "learning_rate": 1.875285552146458e-05, + "loss": 0.474, + "step": 5942 + }, + { + "epoch": 2.4168361122407482, + "grad_norm": 5.055873748051808, + "learning_rate": 1.8752365492186415e-05, + "loss": 0.0825, + "step": 5943 + }, + { + "epoch": 2.417242781618544, + "grad_norm": 5.0926474851561805, + "learning_rate": 1.875187537306081e-05, + "loss": 0.077, + "step": 5944 + }, + { + "epoch": 2.4176494509963398, + "grad_norm": 18.22513058832544, + "learning_rate": 1.87513851640928e-05, + "loss": 1.1184, + "step": 5945 + }, + { + "epoch": 2.418056120374136, + "grad_norm": 6.348432539715073, + "learning_rate": 1.875089486528742e-05, + "loss": 0.1445, + "step": 5946 + }, + { + "epoch": 2.4184627897519317, + "grad_norm": 3.316380255814081, + "learning_rate": 1.87504044766497e-05, + "loss": 0.0476, + "step": 5947 + }, + { + "epoch": 2.4188694591297275, + "grad_norm": 1.012260645602572, + "learning_rate": 1.874991399818467e-05, + "loss": 0.0143, + "step": 5948 + }, + { + "epoch": 2.4192761285075233, + "grad_norm": 18.46236747468933, + "learning_rate": 1.874942342989737e-05, + "loss": 0.9, + "step": 5949 + }, + { + "epoch": 2.4196827978853195, + "grad_norm": 0.7049784557310131, + "learning_rate": 1.8748932771792837e-05, + "loss": 0.0122, + "step": 5950 + }, + { + "epoch": 2.4200894672631152, + "grad_norm": 9.058403520729806, + "learning_rate": 1.8748442023876107e-05, + "loss": 0.3503, + "step": 5951 + }, + { + "epoch": 2.420496136640911, + "grad_norm": 5.4584935711597105, + "learning_rate": 1.874795118615221e-05, + "loss": 0.1834, + "step": 5952 + }, + { + "epoch": 2.4209028060187068, + "grad_norm": 1.561404344555728, + "learning_rate": 1.8747460258626196e-05, + "loss": 0.014, + "step": 5953 + }, + { + "epoch": 2.4213094753965025, + "grad_norm": 8.985811535103986, + "learning_rate": 1.8746969241303102e-05, + "loss": 0.3582, + "step": 5954 + }, + { + "epoch": 2.4217161447742983, + "grad_norm": 13.45210084165956, + "learning_rate": 1.8746478134187963e-05, + "loss": 0.9501, + "step": 5955 + }, + { + "epoch": 2.4221228141520945, + "grad_norm": 6.328477022562648, + "learning_rate": 1.8745986937285827e-05, + "loss": 0.3844, + "step": 5956 + }, + { + "epoch": 2.4225294835298903, + "grad_norm": 0.8561227857485392, + "learning_rate": 1.874549565060173e-05, + "loss": 0.009, + "step": 5957 + }, + { + "epoch": 2.422936152907686, + "grad_norm": 22.95533382351581, + "learning_rate": 1.874500427414072e-05, + "loss": 0.4409, + "step": 5958 + }, + { + "epoch": 2.423342822285482, + "grad_norm": 0.31928811283434444, + "learning_rate": 1.874451280790784e-05, + "loss": 0.0034, + "step": 5959 + }, + { + "epoch": 2.4237494916632776, + "grad_norm": 4.798158160785885, + "learning_rate": 1.8744021251908137e-05, + "loss": 0.1455, + "step": 5960 + }, + { + "epoch": 2.424156161041074, + "grad_norm": 9.84680339286165, + "learning_rate": 1.8743529606146654e-05, + "loss": 0.3199, + "step": 5961 + }, + { + "epoch": 2.4245628304188696, + "grad_norm": 6.6449859255153205, + "learning_rate": 1.8743037870628437e-05, + "loss": 0.3859, + "step": 5962 + }, + { + "epoch": 2.4249694997966653, + "grad_norm": 7.889033995054785, + "learning_rate": 
1.8742546045358538e-05, + "loss": 0.1312, + "step": 5963 + }, + { + "epoch": 2.425376169174461, + "grad_norm": 0.13825508843494408, + "learning_rate": 1.8742054130342e-05, + "loss": 0.0023, + "step": 5964 + }, + { + "epoch": 2.425782838552257, + "grad_norm": 11.179728497424017, + "learning_rate": 1.874156212558388e-05, + "loss": 0.7031, + "step": 5965 + }, + { + "epoch": 2.426189507930053, + "grad_norm": 11.02712091041045, + "learning_rate": 1.8741070031089227e-05, + "loss": 0.6039, + "step": 5966 + }, + { + "epoch": 2.426596177307849, + "grad_norm": 11.11201112663496, + "learning_rate": 1.874057784686309e-05, + "loss": 0.285, + "step": 5967 + }, + { + "epoch": 2.4270028466856446, + "grad_norm": 10.268990598197425, + "learning_rate": 1.874008557291052e-05, + "loss": 0.4736, + "step": 5968 + }, + { + "epoch": 2.4274095160634404, + "grad_norm": 16.528643121539332, + "learning_rate": 1.8739593209236572e-05, + "loss": 0.7867, + "step": 5969 + }, + { + "epoch": 2.427816185441236, + "grad_norm": 3.791771040443543, + "learning_rate": 1.8739100755846305e-05, + "loss": 0.0607, + "step": 5970 + }, + { + "epoch": 2.4282228548190323, + "grad_norm": 15.028197514640052, + "learning_rate": 1.8738608212744765e-05, + "loss": 0.1083, + "step": 5971 + }, + { + "epoch": 2.428629524196828, + "grad_norm": 8.538244801182353, + "learning_rate": 1.873811557993702e-05, + "loss": 0.3599, + "step": 5972 + }, + { + "epoch": 2.429036193574624, + "grad_norm": 8.667235793383965, + "learning_rate": 1.8737622857428117e-05, + "loss": 0.1626, + "step": 5973 + }, + { + "epoch": 2.4294428629524196, + "grad_norm": 8.95459730192311, + "learning_rate": 1.873713004522312e-05, + "loss": 0.2567, + "step": 5974 + }, + { + "epoch": 2.4298495323302154, + "grad_norm": 9.921199632922688, + "learning_rate": 1.8736637143327083e-05, + "loss": 0.239, + "step": 5975 + }, + { + "epoch": 2.430256201708011, + "grad_norm": 4.257993450360828, + "learning_rate": 1.873614415174507e-05, + "loss": 0.077, + "step": 5976 + }, + { + "epoch": 2.4306628710858074, + "grad_norm": 1.3521237201437464, + "learning_rate": 1.873565107048214e-05, + "loss": 0.0241, + "step": 5977 + }, + { + "epoch": 2.431069540463603, + "grad_norm": 8.971647522136728, + "learning_rate": 1.8735157899543358e-05, + "loss": 0.5416, + "step": 5978 + }, + { + "epoch": 2.431476209841399, + "grad_norm": 0.31743388967374864, + "learning_rate": 1.873466463893378e-05, + "loss": 0.0053, + "step": 5979 + }, + { + "epoch": 2.4318828792191947, + "grad_norm": 4.397024262158273, + "learning_rate": 1.8734171288658476e-05, + "loss": 0.0617, + "step": 5980 + }, + { + "epoch": 2.432289548596991, + "grad_norm": 23.764324335725924, + "learning_rate": 1.8733677848722507e-05, + "loss": 1.0325, + "step": 5981 + }, + { + "epoch": 2.4326962179747866, + "grad_norm": 6.251508403937078, + "learning_rate": 1.8733184319130937e-05, + "loss": 0.1229, + "step": 5982 + }, + { + "epoch": 2.4331028873525824, + "grad_norm": 10.11321529168559, + "learning_rate": 1.8732690699888837e-05, + "loss": 0.3777, + "step": 5983 + }, + { + "epoch": 2.433509556730378, + "grad_norm": 7.512826724476236, + "learning_rate": 1.8732196991001276e-05, + "loss": 0.163, + "step": 5984 + }, + { + "epoch": 2.433916226108174, + "grad_norm": 3.2857804868533487, + "learning_rate": 1.8731703192473313e-05, + "loss": 0.0534, + "step": 5985 + }, + { + "epoch": 2.4343228954859697, + "grad_norm": 11.96940667489486, + "learning_rate": 1.8731209304310023e-05, + "loss": 0.4108, + "step": 5986 + }, + { + "epoch": 2.434729564863766, + "grad_norm": 
3.7567849993304723, + "learning_rate": 1.8730715326516474e-05, + "loss": 0.0687, + "step": 5987 + }, + { + "epoch": 2.4351362342415617, + "grad_norm": 8.531068521030479, + "learning_rate": 1.873022125909774e-05, + "loss": 0.4153, + "step": 5988 + }, + { + "epoch": 2.4355429036193574, + "grad_norm": 7.930779939336204, + "learning_rate": 1.872972710205889e-05, + "loss": 0.59, + "step": 5989 + }, + { + "epoch": 2.435949572997153, + "grad_norm": 6.801293607932157, + "learning_rate": 1.8729232855404998e-05, + "loss": 0.1577, + "step": 5990 + }, + { + "epoch": 2.4363562423749494, + "grad_norm": 7.852772573436894, + "learning_rate": 1.8728738519141136e-05, + "loss": 0.2625, + "step": 5991 + }, + { + "epoch": 2.436762911752745, + "grad_norm": 7.312910725250514, + "learning_rate": 1.872824409327238e-05, + "loss": 0.5, + "step": 5992 + }, + { + "epoch": 2.437169581130541, + "grad_norm": 10.349997481044776, + "learning_rate": 1.8727749577803807e-05, + "loss": 0.2994, + "step": 5993 + }, + { + "epoch": 2.4375762505083367, + "grad_norm": 26.86713525816793, + "learning_rate": 1.8727254972740494e-05, + "loss": 0.5024, + "step": 5994 + }, + { + "epoch": 2.4379829198861325, + "grad_norm": 9.537066510216668, + "learning_rate": 1.8726760278087512e-05, + "loss": 0.3035, + "step": 5995 + }, + { + "epoch": 2.4383895892639282, + "grad_norm": 6.7126730727588155, + "learning_rate": 1.8726265493849947e-05, + "loss": 0.154, + "step": 5996 + }, + { + "epoch": 2.4387962586417244, + "grad_norm": 7.5483283196676, + "learning_rate": 1.8725770620032875e-05, + "loss": 0.1989, + "step": 5997 + }, + { + "epoch": 2.43920292801952, + "grad_norm": 9.963897702788117, + "learning_rate": 1.8725275656641374e-05, + "loss": 0.4975, + "step": 5998 + }, + { + "epoch": 2.439609597397316, + "grad_norm": 10.057739883031516, + "learning_rate": 1.872478060368053e-05, + "loss": 0.1577, + "step": 5999 + }, + { + "epoch": 2.4400162667751117, + "grad_norm": 3.790535004125371, + "learning_rate": 1.8724285461155418e-05, + "loss": 0.0677, + "step": 6000 + }, + { + "epoch": 2.4404229361529075, + "grad_norm": 11.273241113748487, + "learning_rate": 1.8723790229071127e-05, + "loss": 0.4566, + "step": 6001 + }, + { + "epoch": 2.4408296055307037, + "grad_norm": 2.331436663282983, + "learning_rate": 1.872329490743274e-05, + "loss": 0.0517, + "step": 6002 + }, + { + "epoch": 2.4412362749084995, + "grad_norm": 6.595751267268202, + "learning_rate": 1.8722799496245335e-05, + "loss": 0.1692, + "step": 6003 + }, + { + "epoch": 2.4416429442862952, + "grad_norm": 12.70681886513074, + "learning_rate": 1.872230399551401e-05, + "loss": 0.4842, + "step": 6004 + }, + { + "epoch": 2.442049613664091, + "grad_norm": 12.750929231056242, + "learning_rate": 1.872180840524384e-05, + "loss": 0.7537, + "step": 6005 + }, + { + "epoch": 2.4424562830418868, + "grad_norm": 3.093240393183667, + "learning_rate": 1.872131272543992e-05, + "loss": 0.0579, + "step": 6006 + }, + { + "epoch": 2.442862952419683, + "grad_norm": 10.561577369099876, + "learning_rate": 1.8720816956107334e-05, + "loss": 0.2603, + "step": 6007 + }, + { + "epoch": 2.4432696217974788, + "grad_norm": 7.96810997099812, + "learning_rate": 1.8720321097251175e-05, + "loss": 0.3194, + "step": 6008 + }, + { + "epoch": 2.4436762911752745, + "grad_norm": 12.52116783279288, + "learning_rate": 1.871982514887653e-05, + "loss": 0.6063, + "step": 6009 + }, + { + "epoch": 2.4440829605530703, + "grad_norm": 13.325345650422763, + "learning_rate": 1.8719329110988487e-05, + "loss": 0.2184, + "step": 6010 + }, + { + "epoch": 
2.444489629930866, + "grad_norm": 11.99597592111181, + "learning_rate": 1.8718832983592146e-05, + "loss": 0.4336, + "step": 6011 + }, + { + "epoch": 2.4448962993086623, + "grad_norm": 1.2354690651483624, + "learning_rate": 1.87183367666926e-05, + "loss": 0.0153, + "step": 6012 + }, + { + "epoch": 2.445302968686458, + "grad_norm": 4.18132493786505, + "learning_rate": 1.871784046029493e-05, + "loss": 0.0611, + "step": 6013 + }, + { + "epoch": 2.445709638064254, + "grad_norm": 16.56421025843335, + "learning_rate": 1.8717344064404247e-05, + "loss": 0.7442, + "step": 6014 + }, + { + "epoch": 2.4461163074420496, + "grad_norm": 7.515581567919245, + "learning_rate": 1.871684757902564e-05, + "loss": 0.1552, + "step": 6015 + }, + { + "epoch": 2.4465229768198453, + "grad_norm": 16.261079909215233, + "learning_rate": 1.8716351004164202e-05, + "loss": 0.3603, + "step": 6016 + }, + { + "epoch": 2.446929646197641, + "grad_norm": 9.428175939675047, + "learning_rate": 1.8715854339825037e-05, + "loss": 0.6354, + "step": 6017 + }, + { + "epoch": 2.4473363155754373, + "grad_norm": 9.998503141579684, + "learning_rate": 1.871535758601324e-05, + "loss": 0.1907, + "step": 6018 + }, + { + "epoch": 2.447742984953233, + "grad_norm": 3.3112483376479784, + "learning_rate": 1.8714860742733908e-05, + "loss": 0.0617, + "step": 6019 + }, + { + "epoch": 2.448149654331029, + "grad_norm": 7.362096350292951, + "learning_rate": 1.871436380999215e-05, + "loss": 0.4092, + "step": 6020 + }, + { + "epoch": 2.4485563237088246, + "grad_norm": 8.781594616091555, + "learning_rate": 1.8713866787793056e-05, + "loss": 0.4053, + "step": 6021 + }, + { + "epoch": 2.448962993086621, + "grad_norm": 2.9171403629786927, + "learning_rate": 1.8713369676141737e-05, + "loss": 0.0595, + "step": 6022 + }, + { + "epoch": 2.4493696624644166, + "grad_norm": 16.885822533556478, + "learning_rate": 1.8712872475043292e-05, + "loss": 1.3444, + "step": 6023 + }, + { + "epoch": 2.4497763318422123, + "grad_norm": 8.542038475849724, + "learning_rate": 1.8712375184502826e-05, + "loss": 0.234, + "step": 6024 + }, + { + "epoch": 2.450183001220008, + "grad_norm": 51.644221124366375, + "learning_rate": 1.8711877804525446e-05, + "loss": 0.4801, + "step": 6025 + }, + { + "epoch": 2.450589670597804, + "grad_norm": 23.05653267446967, + "learning_rate": 1.8711380335116253e-05, + "loss": 0.8777, + "step": 6026 + }, + { + "epoch": 2.4509963399755996, + "grad_norm": 3.902736583426932, + "learning_rate": 1.8710882776280358e-05, + "loss": 0.0826, + "step": 6027 + }, + { + "epoch": 2.451403009353396, + "grad_norm": 2.722373119373008, + "learning_rate": 1.8710385128022866e-05, + "loss": 0.0392, + "step": 6028 + }, + { + "epoch": 2.4518096787311916, + "grad_norm": 10.98111177993182, + "learning_rate": 1.8709887390348886e-05, + "loss": 0.3432, + "step": 6029 + }, + { + "epoch": 2.4522163481089874, + "grad_norm": 3.1182341552064026, + "learning_rate": 1.870938956326353e-05, + "loss": 0.0432, + "step": 6030 + }, + { + "epoch": 2.452623017486783, + "grad_norm": 0.34330246761641775, + "learning_rate": 1.8708891646771907e-05, + "loss": 0.0057, + "step": 6031 + }, + { + "epoch": 2.4530296868645793, + "grad_norm": 4.837120402808435, + "learning_rate": 1.8708393640879127e-05, + "loss": 0.0619, + "step": 6032 + }, + { + "epoch": 2.453436356242375, + "grad_norm": 5.933681640449257, + "learning_rate": 1.8707895545590307e-05, + "loss": 0.1523, + "step": 6033 + }, + { + "epoch": 2.453843025620171, + "grad_norm": 7.144537371123911, + "learning_rate": 1.8707397360910552e-05, + "loss": 0.2154, + 
"step": 6034 + }, + { + "epoch": 2.4542496949979666, + "grad_norm": 1.5901718907888223, + "learning_rate": 1.8706899086844983e-05, + "loss": 0.0313, + "step": 6035 + }, + { + "epoch": 2.4546563643757624, + "grad_norm": 3.978867487074008, + "learning_rate": 1.870640072339871e-05, + "loss": 0.1266, + "step": 6036 + }, + { + "epoch": 2.455063033753558, + "grad_norm": 6.527500639845729, + "learning_rate": 1.8705902270576857e-05, + "loss": 0.1528, + "step": 6037 + }, + { + "epoch": 2.4554697031313544, + "grad_norm": 2.754187744255439, + "learning_rate": 1.8705403728384533e-05, + "loss": 0.042, + "step": 6038 + }, + { + "epoch": 2.45587637250915, + "grad_norm": 9.61326704301292, + "learning_rate": 1.870490509682686e-05, + "loss": 0.3224, + "step": 6039 + }, + { + "epoch": 2.456283041886946, + "grad_norm": 5.305483915423056, + "learning_rate": 1.870440637590895e-05, + "loss": 0.1225, + "step": 6040 + }, + { + "epoch": 2.4566897112647417, + "grad_norm": 12.478329827290802, + "learning_rate": 1.8703907565635932e-05, + "loss": 0.5186, + "step": 6041 + }, + { + "epoch": 2.4570963806425374, + "grad_norm": 3.311470799641557, + "learning_rate": 1.870340866601292e-05, + "loss": 0.0872, + "step": 6042 + }, + { + "epoch": 2.4575030500203336, + "grad_norm": 13.061766294556774, + "learning_rate": 1.8702909677045037e-05, + "loss": 0.7411, + "step": 6043 + }, + { + "epoch": 2.4579097193981294, + "grad_norm": 21.86771241125676, + "learning_rate": 1.870241059873741e-05, + "loss": 1.2638, + "step": 6044 + }, + { + "epoch": 2.458316388775925, + "grad_norm": 14.504936117133239, + "learning_rate": 1.8701911431095157e-05, + "loss": 0.9392, + "step": 6045 + }, + { + "epoch": 2.458723058153721, + "grad_norm": 4.47609596976951, + "learning_rate": 1.87014121741234e-05, + "loss": 0.1674, + "step": 6046 + }, + { + "epoch": 2.4591297275315167, + "grad_norm": 2.636049685328911, + "learning_rate": 1.870091282782727e-05, + "loss": 0.0395, + "step": 6047 + }, + { + "epoch": 2.459536396909313, + "grad_norm": 16.4446884693802, + "learning_rate": 1.870041339221189e-05, + "loss": 0.5201, + "step": 6048 + }, + { + "epoch": 2.4599430662871087, + "grad_norm": 21.6260020028725, + "learning_rate": 1.869991386728239e-05, + "loss": 0.5152, + "step": 6049 + }, + { + "epoch": 2.4603497356649044, + "grad_norm": 14.123760889173262, + "learning_rate": 1.8699414253043894e-05, + "loss": 0.6276, + "step": 6050 + }, + { + "epoch": 2.4607564050427, + "grad_norm": 6.4135489304482896, + "learning_rate": 1.8698914549501535e-05, + "loss": 0.2085, + "step": 6051 + }, + { + "epoch": 2.461163074420496, + "grad_norm": 8.028913426742852, + "learning_rate": 1.8698414756660436e-05, + "loss": 0.2172, + "step": 6052 + }, + { + "epoch": 2.461569743798292, + "grad_norm": 11.556088031783748, + "learning_rate": 1.8697914874525737e-05, + "loss": 0.5172, + "step": 6053 + }, + { + "epoch": 2.461976413176088, + "grad_norm": 14.946663112876193, + "learning_rate": 1.869741490310256e-05, + "loss": 1.022, + "step": 6054 + }, + { + "epoch": 2.4623830825538837, + "grad_norm": 9.934755717506459, + "learning_rate": 1.869691484239604e-05, + "loss": 0.3364, + "step": 6055 + }, + { + "epoch": 2.4627897519316795, + "grad_norm": 12.019463431616728, + "learning_rate": 1.8696414692411316e-05, + "loss": 0.7425, + "step": 6056 + }, + { + "epoch": 2.4631964213094752, + "grad_norm": 11.45685925723045, + "learning_rate": 1.8695914453153516e-05, + "loss": 0.3024, + "step": 6057 + }, + { + "epoch": 2.463603090687271, + "grad_norm": 10.592837920880191, + "learning_rate": 
1.869541412462778e-05, + "loss": 0.3651, + "step": 6058 + }, + { + "epoch": 2.4640097600650672, + "grad_norm": 3.98957436471766, + "learning_rate": 1.8694913706839242e-05, + "loss": 0.0688, + "step": 6059 + }, + { + "epoch": 2.464416429442863, + "grad_norm": 7.1188856809692345, + "learning_rate": 1.8694413199793037e-05, + "loss": 0.2834, + "step": 6060 + }, + { + "epoch": 2.4648230988206588, + "grad_norm": 1.4416399059229323, + "learning_rate": 1.8693912603494306e-05, + "loss": 0.0218, + "step": 6061 + }, + { + "epoch": 2.4652297681984545, + "grad_norm": 13.26754124738919, + "learning_rate": 1.8693411917948182e-05, + "loss": 0.7863, + "step": 6062 + }, + { + "epoch": 2.4656364375762507, + "grad_norm": 13.590171958713345, + "learning_rate": 1.8692911143159812e-05, + "loss": 0.1886, + "step": 6063 + }, + { + "epoch": 2.4660431069540465, + "grad_norm": 3.533459320362766, + "learning_rate": 1.8692410279134336e-05, + "loss": 0.0671, + "step": 6064 + }, + { + "epoch": 2.4664497763318423, + "grad_norm": 10.760277511210559, + "learning_rate": 1.8691909325876893e-05, + "loss": 0.3909, + "step": 6065 + }, + { + "epoch": 2.466856445709638, + "grad_norm": 12.96593108542524, + "learning_rate": 1.8691408283392623e-05, + "loss": 0.6191, + "step": 6066 + }, + { + "epoch": 2.467263115087434, + "grad_norm": 7.354972992820867, + "learning_rate": 1.869090715168668e-05, + "loss": 0.1581, + "step": 6067 + }, + { + "epoch": 2.4676697844652296, + "grad_norm": 6.365371221916041, + "learning_rate": 1.8690405930764194e-05, + "loss": 0.1387, + "step": 6068 + }, + { + "epoch": 2.4680764538430258, + "grad_norm": 13.292934106607326, + "learning_rate": 1.868990462063032e-05, + "loss": 0.5702, + "step": 6069 + }, + { + "epoch": 2.4684831232208215, + "grad_norm": 0.7807586176621603, + "learning_rate": 1.8689403221290203e-05, + "loss": 0.009, + "step": 6070 + }, + { + "epoch": 2.4688897925986173, + "grad_norm": 6.700290857204283, + "learning_rate": 1.8688901732748984e-05, + "loss": 0.2169, + "step": 6071 + }, + { + "epoch": 2.469296461976413, + "grad_norm": 0.8616902466303537, + "learning_rate": 1.868840015501182e-05, + "loss": 0.017, + "step": 6072 + }, + { + "epoch": 2.4697031313542093, + "grad_norm": 12.999073136797563, + "learning_rate": 1.8687898488083856e-05, + "loss": 0.1856, + "step": 6073 + }, + { + "epoch": 2.470109800732005, + "grad_norm": 7.336092729673018, + "learning_rate": 1.868739673197024e-05, + "loss": 0.1084, + "step": 6074 + }, + { + "epoch": 2.470516470109801, + "grad_norm": 0.9948973317378311, + "learning_rate": 1.868689488667612e-05, + "loss": 0.0181, + "step": 6075 + }, + { + "epoch": 2.4709231394875966, + "grad_norm": 7.5017396342323375, + "learning_rate": 1.868639295220666e-05, + "loss": 0.1211, + "step": 6076 + }, + { + "epoch": 2.4713298088653923, + "grad_norm": 1.8482658560377982, + "learning_rate": 1.8685890928567e-05, + "loss": 0.0278, + "step": 6077 + }, + { + "epoch": 2.471736478243188, + "grad_norm": 5.157119979484572, + "learning_rate": 1.8685388815762296e-05, + "loss": 0.1398, + "step": 6078 + }, + { + "epoch": 2.4721431476209843, + "grad_norm": 8.094635767657035, + "learning_rate": 1.868488661379771e-05, + "loss": 0.1768, + "step": 6079 + }, + { + "epoch": 2.47254981699878, + "grad_norm": 8.14043643677772, + "learning_rate": 1.8684384322678387e-05, + "loss": 0.1802, + "step": 6080 + }, + { + "epoch": 2.472956486376576, + "grad_norm": 10.190769185623102, + "learning_rate": 1.868388194240949e-05, + "loss": 0.4892, + "step": 6081 + }, + { + "epoch": 2.4733631557543716, + "grad_norm": 
0.7372937724271834, + "learning_rate": 1.8683379472996175e-05, + "loss": 0.0152, + "step": 6082 + }, + { + "epoch": 2.4737698251321674, + "grad_norm": 3.123359255980713, + "learning_rate": 1.8682876914443598e-05, + "loss": 0.079, + "step": 6083 + }, + { + "epoch": 2.4741764945099636, + "grad_norm": 16.820096568233378, + "learning_rate": 1.868237426675692e-05, + "loss": 0.1999, + "step": 6084 + }, + { + "epoch": 2.4745831638877593, + "grad_norm": 1.3836993541496165, + "learning_rate": 1.8681871529941304e-05, + "loss": 0.0172, + "step": 6085 + }, + { + "epoch": 2.474989833265555, + "grad_norm": 8.523488079751884, + "learning_rate": 1.8681368704001903e-05, + "loss": 0.4546, + "step": 6086 + }, + { + "epoch": 2.475396502643351, + "grad_norm": 12.417477025906864, + "learning_rate": 1.8680865788943885e-05, + "loss": 0.5086, + "step": 6087 + }, + { + "epoch": 2.4758031720211466, + "grad_norm": 15.355668834589904, + "learning_rate": 1.868036278477241e-05, + "loss": 0.367, + "step": 6088 + }, + { + "epoch": 2.476209841398943, + "grad_norm": 10.059825415061974, + "learning_rate": 1.867985969149264e-05, + "loss": 0.3344, + "step": 6089 + }, + { + "epoch": 2.4766165107767386, + "grad_norm": 0.573061655995141, + "learning_rate": 1.8679356509109745e-05, + "loss": 0.007, + "step": 6090 + }, + { + "epoch": 2.4770231801545344, + "grad_norm": 2.5098288399878, + "learning_rate": 1.8678853237628886e-05, + "loss": 0.0304, + "step": 6091 + }, + { + "epoch": 2.47742984953233, + "grad_norm": 4.49684701868953, + "learning_rate": 1.867834987705523e-05, + "loss": 0.0894, + "step": 6092 + }, + { + "epoch": 2.477836518910126, + "grad_norm": 16.59772676384236, + "learning_rate": 1.8677846427393945e-05, + "loss": 1.4857, + "step": 6093 + }, + { + "epoch": 2.478243188287922, + "grad_norm": 5.342574102775775, + "learning_rate": 1.86773428886502e-05, + "loss": 0.0947, + "step": 6094 + }, + { + "epoch": 2.478649857665718, + "grad_norm": 1.2585497454308927, + "learning_rate": 1.8676839260829163e-05, + "loss": 0.0171, + "step": 6095 + }, + { + "epoch": 2.4790565270435136, + "grad_norm": 12.752360424995913, + "learning_rate": 1.8676335543936005e-05, + "loss": 0.789, + "step": 6096 + }, + { + "epoch": 2.4794631964213094, + "grad_norm": 5.184283304228117, + "learning_rate": 1.8675831737975893e-05, + "loss": 0.2165, + "step": 6097 + }, + { + "epoch": 2.479869865799105, + "grad_norm": 7.981434203464194, + "learning_rate": 1.8675327842954004e-05, + "loss": 0.3348, + "step": 6098 + }, + { + "epoch": 2.480276535176901, + "grad_norm": 1.110684390876011, + "learning_rate": 1.8674823858875507e-05, + "loss": 0.0172, + "step": 6099 + }, + { + "epoch": 2.480683204554697, + "grad_norm": 12.682971735553048, + "learning_rate": 1.8674319785745577e-05, + "loss": 0.4173, + "step": 6100 + }, + { + "epoch": 2.481089873932493, + "grad_norm": 9.301409594208042, + "learning_rate": 1.8673815623569388e-05, + "loss": 0.1432, + "step": 6101 + }, + { + "epoch": 2.4814965433102887, + "grad_norm": 5.034333556938199, + "learning_rate": 1.867331137235212e-05, + "loss": 0.0622, + "step": 6102 + }, + { + "epoch": 2.4819032126880844, + "grad_norm": 1.1436238780755577, + "learning_rate": 1.867280703209894e-05, + "loss": 0.0186, + "step": 6103 + }, + { + "epoch": 2.4823098820658807, + "grad_norm": 0.6449811007492834, + "learning_rate": 1.867230260281504e-05, + "loss": 0.0085, + "step": 6104 + }, + { + "epoch": 2.4827165514436764, + "grad_norm": 1.0897661845644153, + "learning_rate": 1.8671798084505578e-05, + "loss": 0.0143, + "step": 6105 + }, + { + "epoch": 
2.483123220821472, + "grad_norm": 0.3970710749577133, + "learning_rate": 1.8671293477175753e-05, + "loss": 0.0065, + "step": 6106 + }, + { + "epoch": 2.483529890199268, + "grad_norm": 6.184219932185752, + "learning_rate": 1.8670788780830734e-05, + "loss": 0.2037, + "step": 6107 + }, + { + "epoch": 2.4839365595770637, + "grad_norm": 1.0969499219119236, + "learning_rate": 1.86702839954757e-05, + "loss": 0.0149, + "step": 6108 + }, + { + "epoch": 2.4843432289548595, + "grad_norm": 10.148301773741206, + "learning_rate": 1.866977912111584e-05, + "loss": 0.6008, + "step": 6109 + }, + { + "epoch": 2.4847498983326557, + "grad_norm": 6.010598532443764, + "learning_rate": 1.8669274157756335e-05, + "loss": 0.2592, + "step": 6110 + }, + { + "epoch": 2.4851565677104515, + "grad_norm": 8.032610020035587, + "learning_rate": 1.8668769105402366e-05, + "loss": 0.4068, + "step": 6111 + }, + { + "epoch": 2.4855632370882472, + "grad_norm": 10.076086350995098, + "learning_rate": 1.866826396405912e-05, + "loss": 0.3318, + "step": 6112 + }, + { + "epoch": 2.485969906466043, + "grad_norm": 4.944032650959654, + "learning_rate": 1.8667758733731783e-05, + "loss": 0.116, + "step": 6113 + }, + { + "epoch": 2.486376575843839, + "grad_norm": 12.272392755857675, + "learning_rate": 1.8667253414425538e-05, + "loss": 0.8527, + "step": 6114 + }, + { + "epoch": 2.486783245221635, + "grad_norm": 13.193066937941934, + "learning_rate": 1.8666748006145576e-05, + "loss": 0.5468, + "step": 6115 + }, + { + "epoch": 2.4871899145994307, + "grad_norm": 8.098627562632158, + "learning_rate": 1.8666242508897082e-05, + "loss": 0.4958, + "step": 6116 + }, + { + "epoch": 2.4875965839772265, + "grad_norm": 0.4395436467182742, + "learning_rate": 1.866573692268525e-05, + "loss": 0.0086, + "step": 6117 + }, + { + "epoch": 2.4880032533550223, + "grad_norm": 5.288948336415803, + "learning_rate": 1.8665231247515265e-05, + "loss": 0.0537, + "step": 6118 + }, + { + "epoch": 2.488409922732818, + "grad_norm": 6.176554635479514, + "learning_rate": 1.8664725483392317e-05, + "loss": 0.0851, + "step": 6119 + }, + { + "epoch": 2.4888165921106142, + "grad_norm": 4.741688962627649, + "learning_rate": 1.8664219630321606e-05, + "loss": 0.1214, + "step": 6120 + }, + { + "epoch": 2.48922326148841, + "grad_norm": 11.820418950658585, + "learning_rate": 1.8663713688308322e-05, + "loss": 0.4288, + "step": 6121 + }, + { + "epoch": 2.4896299308662058, + "grad_norm": 6.5831304521628855, + "learning_rate": 1.8663207657357652e-05, + "loss": 0.3068, + "step": 6122 + }, + { + "epoch": 2.4900366002440015, + "grad_norm": 2.0667494084388562, + "learning_rate": 1.8662701537474795e-05, + "loss": 0.0425, + "step": 6123 + }, + { + "epoch": 2.4904432696217973, + "grad_norm": 11.546693865467748, + "learning_rate": 1.866219532866495e-05, + "loss": 0.5885, + "step": 6124 + }, + { + "epoch": 2.4908499389995935, + "grad_norm": 13.660504104584335, + "learning_rate": 1.866168903093331e-05, + "loss": 0.5329, + "step": 6125 + }, + { + "epoch": 2.4912566083773893, + "grad_norm": 9.908235216803138, + "learning_rate": 1.866118264428507e-05, + "loss": 0.4765, + "step": 6126 + }, + { + "epoch": 2.491663277755185, + "grad_norm": 14.182575755698071, + "learning_rate": 1.8660676168725433e-05, + "loss": 0.4735, + "step": 6127 + }, + { + "epoch": 2.492069947132981, + "grad_norm": 1.170315391820415, + "learning_rate": 1.8660169604259596e-05, + "loss": 0.0164, + "step": 6128 + }, + { + "epoch": 2.4924766165107766, + "grad_norm": 10.296690570678514, + "learning_rate": 1.8659662950892756e-05, + "loss": 
0.4883, + "step": 6129 + }, + { + "epoch": 2.4928832858885728, + "grad_norm": 8.340140880310182, + "learning_rate": 1.865915620863012e-05, + "loss": 0.3555, + "step": 6130 + }, + { + "epoch": 2.4932899552663685, + "grad_norm": 0.5678716027083498, + "learning_rate": 1.865864937747689e-05, + "loss": 0.0099, + "step": 6131 + }, + { + "epoch": 2.4936966246441643, + "grad_norm": 7.173868579708892, + "learning_rate": 1.8658142457438268e-05, + "loss": 0.144, + "step": 6132 + }, + { + "epoch": 2.49410329402196, + "grad_norm": 4.573524169526562, + "learning_rate": 1.8657635448519448e-05, + "loss": 0.2035, + "step": 6133 + }, + { + "epoch": 2.494509963399756, + "grad_norm": 13.04860416456258, + "learning_rate": 1.8657128350725648e-05, + "loss": 0.2794, + "step": 6134 + }, + { + "epoch": 2.494916632777552, + "grad_norm": 9.419563475871117, + "learning_rate": 1.865662116406207e-05, + "loss": 0.332, + "step": 6135 + }, + { + "epoch": 2.495323302155348, + "grad_norm": 11.373261758492982, + "learning_rate": 1.8656113888533914e-05, + "loss": 0.5112, + "step": 6136 + }, + { + "epoch": 2.4957299715331436, + "grad_norm": 7.66475807836614, + "learning_rate": 1.8655606524146398e-05, + "loss": 0.1145, + "step": 6137 + }, + { + "epoch": 2.4961366409109393, + "grad_norm": 5.641878524546803, + "learning_rate": 1.8655099070904722e-05, + "loss": 0.2132, + "step": 6138 + }, + { + "epoch": 2.496543310288735, + "grad_norm": 8.805567346363052, + "learning_rate": 1.8654591528814095e-05, + "loss": 0.2708, + "step": 6139 + }, + { + "epoch": 2.496949979666531, + "grad_norm": 9.398710798137117, + "learning_rate": 1.8654083897879737e-05, + "loss": 0.5104, + "step": 6140 + }, + { + "epoch": 2.497356649044327, + "grad_norm": 0.2530880570353216, + "learning_rate": 1.8653576178106846e-05, + "loss": 0.0045, + "step": 6141 + }, + { + "epoch": 2.497763318422123, + "grad_norm": 3.8115831543195173, + "learning_rate": 1.8653068369500646e-05, + "loss": 0.0605, + "step": 6142 + }, + { + "epoch": 2.4981699877999186, + "grad_norm": 7.209957091606755, + "learning_rate": 1.8652560472066338e-05, + "loss": 0.3047, + "step": 6143 + }, + { + "epoch": 2.4985766571777144, + "grad_norm": 15.843901640044797, + "learning_rate": 1.8652052485809146e-05, + "loss": 0.6144, + "step": 6144 + }, + { + "epoch": 2.4989833265555106, + "grad_norm": 0.5036951319598427, + "learning_rate": 1.865154441073428e-05, + "loss": 0.009, + "step": 6145 + }, + { + "epoch": 2.4993899959333064, + "grad_norm": 5.722503878237914, + "learning_rate": 1.8651036246846953e-05, + "loss": 0.1163, + "step": 6146 + }, + { + "epoch": 2.499796665311102, + "grad_norm": 3.455797030157364, + "learning_rate": 1.865052799415239e-05, + "loss": 0.0422, + "step": 6147 + }, + { + "epoch": 2.500203334688898, + "grad_norm": 5.163095402623234, + "learning_rate": 1.86500196526558e-05, + "loss": 0.0739, + "step": 6148 + }, + { + "epoch": 2.5006100040666936, + "grad_norm": 12.496383057870686, + "learning_rate": 1.864951122236241e-05, + "loss": 0.7278, + "step": 6149 + }, + { + "epoch": 2.5010166734444894, + "grad_norm": 10.147894754499037, + "learning_rate": 1.864900270327743e-05, + "loss": 0.4794, + "step": 6150 + }, + { + "epoch": 2.5014233428222856, + "grad_norm": 22.16072261288453, + "learning_rate": 1.864849409540608e-05, + "loss": 0.9725, + "step": 6151 + }, + { + "epoch": 2.5018300122000814, + "grad_norm": 10.428182837357598, + "learning_rate": 1.864798539875359e-05, + "loss": 0.4364, + "step": 6152 + }, + { + "epoch": 2.502236681577877, + "grad_norm": 0.009714321033002281, + "learning_rate": 
1.8647476613325178e-05, + "loss": 0.0002, + "step": 6153 + }, + { + "epoch": 2.502643350955673, + "grad_norm": 5.687823804846064, + "learning_rate": 1.864696773912607e-05, + "loss": 0.07, + "step": 6154 + }, + { + "epoch": 2.503050020333469, + "grad_norm": 0.053884362959098694, + "learning_rate": 1.864645877616148e-05, + "loss": 0.0009, + "step": 6155 + }, + { + "epoch": 2.503456689711265, + "grad_norm": 11.84292537502173, + "learning_rate": 1.8645949724436642e-05, + "loss": 0.6268, + "step": 6156 + }, + { + "epoch": 2.5038633590890607, + "grad_norm": 0.1787127633120817, + "learning_rate": 1.8645440583956778e-05, + "loss": 0.0029, + "step": 6157 + }, + { + "epoch": 2.5042700284668564, + "grad_norm": 4.895393820610003, + "learning_rate": 1.8644931354727115e-05, + "loss": 0.4064, + "step": 6158 + }, + { + "epoch": 2.504676697844652, + "grad_norm": 1.40819527082869, + "learning_rate": 1.864442203675288e-05, + "loss": 0.0299, + "step": 6159 + }, + { + "epoch": 2.505083367222448, + "grad_norm": 9.300234125551711, + "learning_rate": 1.8643912630039304e-05, + "loss": 0.1567, + "step": 6160 + }, + { + "epoch": 2.505490036600244, + "grad_norm": 7.460978096457077, + "learning_rate": 1.8643403134591614e-05, + "loss": 0.1017, + "step": 6161 + }, + { + "epoch": 2.50589670597804, + "grad_norm": 9.921273287414714, + "learning_rate": 1.8642893550415038e-05, + "loss": 0.471, + "step": 6162 + }, + { + "epoch": 2.5063033753558357, + "grad_norm": 8.082672313157227, + "learning_rate": 1.864238387751481e-05, + "loss": 0.2236, + "step": 6163 + }, + { + "epoch": 2.5067100447336315, + "grad_norm": 12.822715979799977, + "learning_rate": 1.8641874115896165e-05, + "loss": 0.6996, + "step": 6164 + }, + { + "epoch": 2.5071167141114277, + "grad_norm": 5.378724051898852, + "learning_rate": 1.8641364265564333e-05, + "loss": 0.0955, + "step": 6165 + }, + { + "epoch": 2.5075233834892234, + "grad_norm": 10.635780146051834, + "learning_rate": 1.8640854326524547e-05, + "loss": 0.4982, + "step": 6166 + }, + { + "epoch": 2.507930052867019, + "grad_norm": 5.05214194972692, + "learning_rate": 1.864034429878204e-05, + "loss": 0.0699, + "step": 6167 + }, + { + "epoch": 2.508336722244815, + "grad_norm": 0.9543656611385957, + "learning_rate": 1.8639834182342053e-05, + "loss": 0.016, + "step": 6168 + }, + { + "epoch": 2.5087433916226107, + "grad_norm": 17.207077677530965, + "learning_rate": 1.8639323977209818e-05, + "loss": 0.1063, + "step": 6169 + }, + { + "epoch": 2.5091500610004065, + "grad_norm": 10.410925138185243, + "learning_rate": 1.8638813683390574e-05, + "loss": 0.6759, + "step": 6170 + }, + { + "epoch": 2.5095567303782023, + "grad_norm": 6.132391497421538, + "learning_rate": 1.863830330088956e-05, + "loss": 0.3012, + "step": 6171 + }, + { + "epoch": 2.5099633997559985, + "grad_norm": 9.753388102155082, + "learning_rate": 1.863779282971202e-05, + "loss": 0.4289, + "step": 6172 + }, + { + "epoch": 2.5103700691337942, + "grad_norm": 10.569910822917013, + "learning_rate": 1.8637282269863185e-05, + "loss": 0.2686, + "step": 6173 + }, + { + "epoch": 2.51077673851159, + "grad_norm": 10.471364498820144, + "learning_rate": 1.8636771621348298e-05, + "loss": 0.448, + "step": 6174 + }, + { + "epoch": 2.5111834078893858, + "grad_norm": 3.6433857330588855, + "learning_rate": 1.8636260884172605e-05, + "loss": 0.0945, + "step": 6175 + }, + { + "epoch": 2.511590077267182, + "grad_norm": 6.175268625471347, + "learning_rate": 1.863575005834135e-05, + "loss": 0.1437, + "step": 6176 + }, + { + "epoch": 2.5119967466449777, + "grad_norm": 
0.15492961659883947, + "learning_rate": 1.863523914385977e-05, + "loss": 0.0027, + "step": 6177 + }, + { + "epoch": 2.5124034160227735, + "grad_norm": 7.625966068228361, + "learning_rate": 1.863472814073312e-05, + "loss": 0.3311, + "step": 6178 + }, + { + "epoch": 2.5128100854005693, + "grad_norm": 10.942575037009156, + "learning_rate": 1.8634217048966638e-05, + "loss": 0.4193, + "step": 6179 + }, + { + "epoch": 2.513216754778365, + "grad_norm": 17.043991103121993, + "learning_rate": 1.863370586856557e-05, + "loss": 0.9952, + "step": 6180 + }, + { + "epoch": 2.513623424156161, + "grad_norm": 12.582387974356386, + "learning_rate": 1.8633194599535168e-05, + "loss": 0.6064, + "step": 6181 + }, + { + "epoch": 2.514030093533957, + "grad_norm": 0.12629102020964206, + "learning_rate": 1.863268324188068e-05, + "loss": 0.0018, + "step": 6182 + }, + { + "epoch": 2.5144367629117528, + "grad_norm": 2.357545993047345, + "learning_rate": 1.863217179560735e-05, + "loss": 0.0406, + "step": 6183 + }, + { + "epoch": 2.5148434322895485, + "grad_norm": 7.8653900744082, + "learning_rate": 1.8631660260720435e-05, + "loss": 0.2038, + "step": 6184 + }, + { + "epoch": 2.5152501016673443, + "grad_norm": 4.772334045586283, + "learning_rate": 1.863114863722518e-05, + "loss": 0.2395, + "step": 6185 + }, + { + "epoch": 2.5156567710451405, + "grad_norm": 0.194006380675544, + "learning_rate": 1.8630636925126843e-05, + "loss": 0.0028, + "step": 6186 + }, + { + "epoch": 2.5160634404229363, + "grad_norm": 4.291448961090226, + "learning_rate": 1.8630125124430673e-05, + "loss": 0.0555, + "step": 6187 + }, + { + "epoch": 2.516470109800732, + "grad_norm": 4.233478914568949, + "learning_rate": 1.8629613235141924e-05, + "loss": 0.1181, + "step": 6188 + }, + { + "epoch": 2.516876779178528, + "grad_norm": 11.039249430099296, + "learning_rate": 1.8629101257265855e-05, + "loss": 0.3322, + "step": 6189 + }, + { + "epoch": 2.5172834485563236, + "grad_norm": 22.371416228474445, + "learning_rate": 1.8628589190807717e-05, + "loss": 0.8656, + "step": 6190 + }, + { + "epoch": 2.5176901179341193, + "grad_norm": 13.478257043422982, + "learning_rate": 1.8628077035772765e-05, + "loss": 0.8468, + "step": 6191 + }, + { + "epoch": 2.5180967873119156, + "grad_norm": 11.791055843148193, + "learning_rate": 1.8627564792166265e-05, + "loss": 0.4713, + "step": 6192 + }, + { + "epoch": 2.5185034566897113, + "grad_norm": 4.63850914817783, + "learning_rate": 1.862705245999347e-05, + "loss": 0.1767, + "step": 6193 + }, + { + "epoch": 2.518910126067507, + "grad_norm": 0.781947737901798, + "learning_rate": 1.8626540039259636e-05, + "loss": 0.011, + "step": 6194 + }, + { + "epoch": 2.519316795445303, + "grad_norm": 8.839185085233874, + "learning_rate": 1.862602752997003e-05, + "loss": 0.1449, + "step": 6195 + }, + { + "epoch": 2.519723464823099, + "grad_norm": 13.265736127304807, + "learning_rate": 1.862551493212991e-05, + "loss": 0.2618, + "step": 6196 + }, + { + "epoch": 2.520130134200895, + "grad_norm": 12.981083615446165, + "learning_rate": 1.8625002245744537e-05, + "loss": 0.2635, + "step": 6197 + }, + { + "epoch": 2.5205368035786906, + "grad_norm": 17.155147071360375, + "learning_rate": 1.8624489470819175e-05, + "loss": 1.4269, + "step": 6198 + }, + { + "epoch": 2.5209434729564864, + "grad_norm": 25.83154024712852, + "learning_rate": 1.8623976607359087e-05, + "loss": 0.4261, + "step": 6199 + }, + { + "epoch": 2.521350142334282, + "grad_norm": 4.0023494034855105, + "learning_rate": 1.862346365536954e-05, + "loss": 0.0425, + "step": 6200 + }, + { + 
"epoch": 2.521756811712078, + "grad_norm": 4.213661159982551, + "learning_rate": 1.86229506148558e-05, + "loss": 0.1212, + "step": 6201 + }, + { + "epoch": 2.522163481089874, + "grad_norm": 7.894956526769422, + "learning_rate": 1.862243748582313e-05, + "loss": 0.2164, + "step": 6202 + }, + { + "epoch": 2.52257015046767, + "grad_norm": 13.42521576776834, + "learning_rate": 1.86219242682768e-05, + "loss": 0.5386, + "step": 6203 + }, + { + "epoch": 2.5229768198454656, + "grad_norm": 10.782101919025383, + "learning_rate": 1.8621410962222078e-05, + "loss": 0.4531, + "step": 6204 + }, + { + "epoch": 2.5233834892232614, + "grad_norm": 0.3177005062467815, + "learning_rate": 1.8620897567664234e-05, + "loss": 0.0014, + "step": 6205 + }, + { + "epoch": 2.5237901586010576, + "grad_norm": 2.4637127977028124, + "learning_rate": 1.8620384084608537e-05, + "loss": 0.0246, + "step": 6206 + }, + { + "epoch": 2.5241968279788534, + "grad_norm": 6.496381687273255, + "learning_rate": 1.861987051306026e-05, + "loss": 0.3515, + "step": 6207 + }, + { + "epoch": 2.524603497356649, + "grad_norm": 7.316717054284917, + "learning_rate": 1.861935685302467e-05, + "loss": 0.3229, + "step": 6208 + }, + { + "epoch": 2.525010166734445, + "grad_norm": 8.53582286498396, + "learning_rate": 1.8618843104507047e-05, + "loss": 0.2593, + "step": 6209 + }, + { + "epoch": 2.5254168361122407, + "grad_norm": 5.698270065642711, + "learning_rate": 1.8618329267512663e-05, + "loss": 0.1077, + "step": 6210 + }, + { + "epoch": 2.5258235054900364, + "grad_norm": 3.8520422710550326, + "learning_rate": 1.861781534204679e-05, + "loss": 0.1099, + "step": 6211 + }, + { + "epoch": 2.526230174867832, + "grad_norm": 0.9177740061758923, + "learning_rate": 1.8617301328114704e-05, + "loss": 0.0199, + "step": 6212 + }, + { + "epoch": 2.5266368442456284, + "grad_norm": 1.3222303891088405, + "learning_rate": 1.8616787225721686e-05, + "loss": 0.0185, + "step": 6213 + }, + { + "epoch": 2.527043513623424, + "grad_norm": 8.739862076219769, + "learning_rate": 1.8616273034873007e-05, + "loss": 0.4474, + "step": 6214 + }, + { + "epoch": 2.52745018300122, + "grad_norm": 5.7350808953996895, + "learning_rate": 1.8615758755573952e-05, + "loss": 0.297, + "step": 6215 + }, + { + "epoch": 2.5278568523790157, + "grad_norm": 6.6564921917747215, + "learning_rate": 1.8615244387829798e-05, + "loss": 0.1672, + "step": 6216 + }, + { + "epoch": 2.528263521756812, + "grad_norm": 9.984739621235335, + "learning_rate": 1.861472993164582e-05, + "loss": 0.3851, + "step": 6217 + }, + { + "epoch": 2.5286701911346077, + "grad_norm": 3.8074725650514374, + "learning_rate": 1.8614215387027307e-05, + "loss": 0.0838, + "step": 6218 + }, + { + "epoch": 2.5290768605124034, + "grad_norm": 5.39919203538741, + "learning_rate": 1.8613700753979535e-05, + "loss": 0.1366, + "step": 6219 + }, + { + "epoch": 2.529483529890199, + "grad_norm": 7.677851409480754, + "learning_rate": 1.861318603250779e-05, + "loss": 0.3282, + "step": 6220 + }, + { + "epoch": 2.529890199267995, + "grad_norm": 0.16304791697136264, + "learning_rate": 1.8612671222617355e-05, + "loss": 0.0021, + "step": 6221 + }, + { + "epoch": 2.5302968686457907, + "grad_norm": 0.2811244493211787, + "learning_rate": 1.8612156324313514e-05, + "loss": 0.0071, + "step": 6222 + }, + { + "epoch": 2.530703538023587, + "grad_norm": 1.100530165450058, + "learning_rate": 1.861164133760156e-05, + "loss": 0.012, + "step": 6223 + }, + { + "epoch": 2.5311102074013827, + "grad_norm": 10.11861367317891, + "learning_rate": 1.861112626248677e-05, + "loss": 
0.3234, + "step": 6224 + }, + { + "epoch": 2.5315168767791785, + "grad_norm": 1.1351888851611318, + "learning_rate": 1.8610611098974432e-05, + "loss": 0.0167, + "step": 6225 + }, + { + "epoch": 2.5319235461569742, + "grad_norm": 6.436149254147769, + "learning_rate": 1.8610095847069844e-05, + "loss": 0.2137, + "step": 6226 + }, + { + "epoch": 2.5323302155347704, + "grad_norm": 1.9762306088310364, + "learning_rate": 1.8609580506778285e-05, + "loss": 0.0273, + "step": 6227 + }, + { + "epoch": 2.532736884912566, + "grad_norm": 0.5046337346354842, + "learning_rate": 1.8609065078105046e-05, + "loss": 0.0069, + "step": 6228 + }, + { + "epoch": 2.533143554290362, + "grad_norm": 0.1998279192788448, + "learning_rate": 1.8608549561055428e-05, + "loss": 0.0033, + "step": 6229 + }, + { + "epoch": 2.5335502236681577, + "grad_norm": 16.556815870643923, + "learning_rate": 1.8608033955634706e-05, + "loss": 0.7194, + "step": 6230 + }, + { + "epoch": 2.5339568930459535, + "grad_norm": 16.71391523505027, + "learning_rate": 1.8607518261848194e-05, + "loss": 0.4598, + "step": 6231 + }, + { + "epoch": 2.5343635624237493, + "grad_norm": 2.3052421628746655, + "learning_rate": 1.860700247970117e-05, + "loss": 0.0335, + "step": 6232 + }, + { + "epoch": 2.5347702318015455, + "grad_norm": 0.3920741506784019, + "learning_rate": 1.8606486609198934e-05, + "loss": 0.005, + "step": 6233 + }, + { + "epoch": 2.5351769011793412, + "grad_norm": 3.073091108986435, + "learning_rate": 1.860597065034678e-05, + "loss": 0.0896, + "step": 6234 + }, + { + "epoch": 2.535583570557137, + "grad_norm": 1.3914453812845715, + "learning_rate": 1.860545460315001e-05, + "loss": 0.0208, + "step": 6235 + }, + { + "epoch": 2.5359902399349328, + "grad_norm": 10.499138657685705, + "learning_rate": 1.8604938467613913e-05, + "loss": 0.2363, + "step": 6236 + }, + { + "epoch": 2.536396909312729, + "grad_norm": 5.75495910731009, + "learning_rate": 1.8604422243743794e-05, + "loss": 0.1505, + "step": 6237 + }, + { + "epoch": 2.5368035786905248, + "grad_norm": 1.3742258014783928, + "learning_rate": 1.860390593154495e-05, + "loss": 0.0236, + "step": 6238 + }, + { + "epoch": 2.5372102480683205, + "grad_norm": 8.179637459043516, + "learning_rate": 1.8603389531022683e-05, + "loss": 0.2886, + "step": 6239 + }, + { + "epoch": 2.5376169174461163, + "grad_norm": 14.331245662839944, + "learning_rate": 1.860287304218229e-05, + "loss": 0.463, + "step": 6240 + }, + { + "epoch": 2.538023586823912, + "grad_norm": 11.76573266069389, + "learning_rate": 1.8602356465029077e-05, + "loss": 0.5366, + "step": 6241 + }, + { + "epoch": 2.538430256201708, + "grad_norm": 9.036921391614724, + "learning_rate": 1.8601839799568343e-05, + "loss": 0.2172, + "step": 6242 + }, + { + "epoch": 2.538836925579504, + "grad_norm": 11.089809506522343, + "learning_rate": 1.8601323045805398e-05, + "loss": 0.2657, + "step": 6243 + }, + { + "epoch": 2.5392435949573, + "grad_norm": 8.222518734531704, + "learning_rate": 1.8600806203745542e-05, + "loss": 0.3175, + "step": 6244 + }, + { + "epoch": 2.5396502643350956, + "grad_norm": 13.842962345502816, + "learning_rate": 1.860028927339408e-05, + "loss": 0.6705, + "step": 6245 + }, + { + "epoch": 2.5400569337128913, + "grad_norm": 12.08461625517, + "learning_rate": 1.8599772254756317e-05, + "loss": 0.7704, + "step": 6246 + }, + { + "epoch": 2.5404636030906875, + "grad_norm": 12.648428618278942, + "learning_rate": 1.859925514783757e-05, + "loss": 0.1206, + "step": 6247 + }, + { + "epoch": 2.5408702724684833, + "grad_norm": 6.562498603858614, + 
"learning_rate": 1.859873795264314e-05, + "loss": 0.3416, + "step": 6248 + }, + { + "epoch": 2.541276941846279, + "grad_norm": 4.101925609784866, + "learning_rate": 1.859822066917833e-05, + "loss": 0.1613, + "step": 6249 + }, + { + "epoch": 2.541683611224075, + "grad_norm": 11.828974426492204, + "learning_rate": 1.8597703297448465e-05, + "loss": 0.868, + "step": 6250 + }, + { + "epoch": 2.5420902806018706, + "grad_norm": 4.614033058199255, + "learning_rate": 1.8597185837458847e-05, + "loss": 0.2296, + "step": 6251 + }, + { + "epoch": 2.5424969499796664, + "grad_norm": 6.202471659111523, + "learning_rate": 1.859666828921479e-05, + "loss": 0.252, + "step": 6252 + }, + { + "epoch": 2.542903619357462, + "grad_norm": 6.48260726420995, + "learning_rate": 1.8596150652721604e-05, + "loss": 0.1672, + "step": 6253 + }, + { + "epoch": 2.5433102887352583, + "grad_norm": 0.26440558628409083, + "learning_rate": 1.8595632927984607e-05, + "loss": 0.0064, + "step": 6254 + }, + { + "epoch": 2.543716958113054, + "grad_norm": 8.341487190873922, + "learning_rate": 1.8595115115009112e-05, + "loss": 0.3071, + "step": 6255 + }, + { + "epoch": 2.54412362749085, + "grad_norm": 14.026799035630232, + "learning_rate": 1.8594597213800435e-05, + "loss": 0.5962, + "step": 6256 + }, + { + "epoch": 2.5445302968686456, + "grad_norm": 3.2252108058096436, + "learning_rate": 1.859407922436389e-05, + "loss": 0.0548, + "step": 6257 + }, + { + "epoch": 2.544936966246442, + "grad_norm": 8.374941830001589, + "learning_rate": 1.85935611467048e-05, + "loss": 0.2875, + "step": 6258 + }, + { + "epoch": 2.5453436356242376, + "grad_norm": 15.45725170336911, + "learning_rate": 1.8593042980828477e-05, + "loss": 1.1566, + "step": 6259 + }, + { + "epoch": 2.5457503050020334, + "grad_norm": 11.004494696683986, + "learning_rate": 1.8592524726740243e-05, + "loss": 0.5977, + "step": 6260 + }, + { + "epoch": 2.546156974379829, + "grad_norm": 6.211706333738116, + "learning_rate": 1.859200638444542e-05, + "loss": 0.2894, + "step": 6261 + }, + { + "epoch": 2.546563643757625, + "grad_norm": 11.078582466501613, + "learning_rate": 1.8591487953949323e-05, + "loss": 0.3097, + "step": 6262 + }, + { + "epoch": 2.5469703131354207, + "grad_norm": 12.873128935584619, + "learning_rate": 1.859096943525728e-05, + "loss": 0.4684, + "step": 6263 + }, + { + "epoch": 2.547376982513217, + "grad_norm": 4.625818402312764, + "learning_rate": 1.8590450828374615e-05, + "loss": 0.0658, + "step": 6264 + }, + { + "epoch": 2.5477836518910126, + "grad_norm": 8.954420142872316, + "learning_rate": 1.8589932133306645e-05, + "loss": 0.2479, + "step": 6265 + }, + { + "epoch": 2.5481903212688084, + "grad_norm": 11.718342893383603, + "learning_rate": 1.85894133500587e-05, + "loss": 0.4288, + "step": 6266 + }, + { + "epoch": 2.548596990646604, + "grad_norm": 6.229409962597489, + "learning_rate": 1.8588894478636104e-05, + "loss": 0.1515, + "step": 6267 + }, + { + "epoch": 2.5490036600244004, + "grad_norm": 14.291341953845098, + "learning_rate": 1.8588375519044187e-05, + "loss": 0.1042, + "step": 6268 + }, + { + "epoch": 2.549410329402196, + "grad_norm": 14.516335969183238, + "learning_rate": 1.858785647128827e-05, + "loss": 0.7876, + "step": 6269 + }, + { + "epoch": 2.549816998779992, + "grad_norm": 4.177828345179885, + "learning_rate": 1.8587337335373678e-05, + "loss": 0.1023, + "step": 6270 + }, + { + "epoch": 2.5502236681577877, + "grad_norm": 6.234951154224216, + "learning_rate": 1.8586818111305756e-05, + "loss": 0.198, + "step": 6271 + }, + { + "epoch": 2.5506303375355834, + 
"grad_norm": 0.9962407619184724, + "learning_rate": 1.858629879908982e-05, + "loss": 0.0213, + "step": 6272 + }, + { + "epoch": 2.551037006913379, + "grad_norm": 7.945314626455491, + "learning_rate": 1.8585779398731202e-05, + "loss": 0.2603, + "step": 6273 + }, + { + "epoch": 2.5514436762911754, + "grad_norm": 14.48221769645537, + "learning_rate": 1.8585259910235243e-05, + "loss": 0.3725, + "step": 6274 + }, + { + "epoch": 2.551850345668971, + "grad_norm": 7.373385066871903, + "learning_rate": 1.8584740333607268e-05, + "loss": 0.1584, + "step": 6275 + }, + { + "epoch": 2.552257015046767, + "grad_norm": 2.8984950750850387, + "learning_rate": 1.8584220668852616e-05, + "loss": 0.0972, + "step": 6276 + }, + { + "epoch": 2.5526636844245627, + "grad_norm": 4.729065757293076, + "learning_rate": 1.858370091597661e-05, + "loss": 0.1059, + "step": 6277 + }, + { + "epoch": 2.553070353802359, + "grad_norm": 8.131208980009779, + "learning_rate": 1.85831810749846e-05, + "loss": 0.1747, + "step": 6278 + }, + { + "epoch": 2.5534770231801547, + "grad_norm": 2.2531109458004335, + "learning_rate": 1.858266114588192e-05, + "loss": 0.0427, + "step": 6279 + }, + { + "epoch": 2.5538836925579504, + "grad_norm": 4.766759559058115, + "learning_rate": 1.8582141128673898e-05, + "loss": 0.2878, + "step": 6280 + }, + { + "epoch": 2.554290361935746, + "grad_norm": 14.889744239209206, + "learning_rate": 1.8581621023365877e-05, + "loss": 0.4987, + "step": 6281 + }, + { + "epoch": 2.554697031313542, + "grad_norm": 0.23909858928222597, + "learning_rate": 1.8581100829963205e-05, + "loss": 0.0046, + "step": 6282 + }, + { + "epoch": 2.5551037006913377, + "grad_norm": 1.7228459535701408, + "learning_rate": 1.858058054847121e-05, + "loss": 0.0246, + "step": 6283 + }, + { + "epoch": 2.555510370069134, + "grad_norm": 10.809071759633794, + "learning_rate": 1.8580060178895235e-05, + "loss": 0.4863, + "step": 6284 + }, + { + "epoch": 2.5559170394469297, + "grad_norm": 3.9004182489240233, + "learning_rate": 1.8579539721240624e-05, + "loss": 0.0747, + "step": 6285 + }, + { + "epoch": 2.5563237088247255, + "grad_norm": 7.931040498535199, + "learning_rate": 1.857901917551272e-05, + "loss": 0.1442, + "step": 6286 + }, + { + "epoch": 2.5567303782025212, + "grad_norm": 16.630908871015368, + "learning_rate": 1.857849854171687e-05, + "loss": 0.6159, + "step": 6287 + }, + { + "epoch": 2.5571370475803175, + "grad_norm": 4.316457700045151, + "learning_rate": 1.8577977819858414e-05, + "loss": 0.0796, + "step": 6288 + }, + { + "epoch": 2.5575437169581132, + "grad_norm": 25.427438942722294, + "learning_rate": 1.8577457009942695e-05, + "loss": 1.1509, + "step": 6289 + }, + { + "epoch": 2.557950386335909, + "grad_norm": 4.609976408000378, + "learning_rate": 1.8576936111975062e-05, + "loss": 0.0575, + "step": 6290 + }, + { + "epoch": 2.5583570557137048, + "grad_norm": 8.503436750007369, + "learning_rate": 1.857641512596087e-05, + "loss": 0.2609, + "step": 6291 + }, + { + "epoch": 2.5587637250915005, + "grad_norm": 5.066498766455501, + "learning_rate": 1.8575894051905454e-05, + "loss": 0.1134, + "step": 6292 + }, + { + "epoch": 2.5591703944692963, + "grad_norm": 3.4457760913247313, + "learning_rate": 1.857537288981417e-05, + "loss": 0.0618, + "step": 6293 + }, + { + "epoch": 2.559577063847092, + "grad_norm": 5.610614766411839, + "learning_rate": 1.857485163969237e-05, + "loss": 0.1925, + "step": 6294 + }, + { + "epoch": 2.5599837332248883, + "grad_norm": 8.362854923551792, + "learning_rate": 1.85743303015454e-05, + "loss": 0.1501, + "step": 6295 + }, + 
{ + "epoch": 2.560390402602684, + "grad_norm": 0.3048100285691114, + "learning_rate": 1.8573808875378617e-05, + "loss": 0.0051, + "step": 6296 + }, + { + "epoch": 2.56079707198048, + "grad_norm": 7.18688349071571, + "learning_rate": 1.857328736119737e-05, + "loss": 0.184, + "step": 6297 + }, + { + "epoch": 2.561203741358276, + "grad_norm": 0.7226902858091655, + "learning_rate": 1.8572765759007014e-05, + "loss": 0.0124, + "step": 6298 + }, + { + "epoch": 2.5616104107360718, + "grad_norm": 6.692394944595658, + "learning_rate": 1.85722440688129e-05, + "loss": 0.3652, + "step": 6299 + }, + { + "epoch": 2.5620170801138675, + "grad_norm": 11.872454989163819, + "learning_rate": 1.857172229062039e-05, + "loss": 0.6049, + "step": 6300 + }, + { + "epoch": 2.5624237494916633, + "grad_norm": 0.0641819887186461, + "learning_rate": 1.8571200424434833e-05, + "loss": 0.0011, + "step": 6301 + }, + { + "epoch": 2.562830418869459, + "grad_norm": 2.237575924330003, + "learning_rate": 1.857067847026159e-05, + "loss": 0.0991, + "step": 6302 + }, + { + "epoch": 2.563237088247255, + "grad_norm": 15.398226620407364, + "learning_rate": 1.857015642810602e-05, + "loss": 0.743, + "step": 6303 + }, + { + "epoch": 2.5636437576250506, + "grad_norm": 1.278950485351883, + "learning_rate": 1.856963429797348e-05, + "loss": 0.0271, + "step": 6304 + }, + { + "epoch": 2.564050427002847, + "grad_norm": 1.4983254766442449, + "learning_rate": 1.8569112079869333e-05, + "loss": 0.0229, + "step": 6305 + }, + { + "epoch": 2.5644570963806426, + "grad_norm": 19.615138646869998, + "learning_rate": 1.856858977379894e-05, + "loss": 0.4871, + "step": 6306 + }, + { + "epoch": 2.5648637657584383, + "grad_norm": 12.618256491957368, + "learning_rate": 1.8568067379767658e-05, + "loss": 0.378, + "step": 6307 + }, + { + "epoch": 2.565270435136234, + "grad_norm": 3.270021000067609, + "learning_rate": 1.8567544897780848e-05, + "loss": 0.0761, + "step": 6308 + }, + { + "epoch": 2.5656771045140303, + "grad_norm": 3.032920580753202, + "learning_rate": 1.8567022327843882e-05, + "loss": 0.0636, + "step": 6309 + }, + { + "epoch": 2.566083773891826, + "grad_norm": 1.6767910341513863, + "learning_rate": 1.8566499669962122e-05, + "loss": 0.02, + "step": 6310 + }, + { + "epoch": 2.566490443269622, + "grad_norm": 22.229792666908754, + "learning_rate": 1.8565976924140927e-05, + "loss": 1.0068, + "step": 6311 + }, + { + "epoch": 2.5668971126474176, + "grad_norm": 4.46167167740512, + "learning_rate": 1.8565454090385667e-05, + "loss": 0.2301, + "step": 6312 + }, + { + "epoch": 2.5673037820252134, + "grad_norm": 15.282280194382645, + "learning_rate": 1.8564931168701713e-05, + "loss": 0.5569, + "step": 6313 + }, + { + "epoch": 2.567710451403009, + "grad_norm": 1.1792079165565623, + "learning_rate": 1.856440815909443e-05, + "loss": 0.0104, + "step": 6314 + }, + { + "epoch": 2.5681171207808053, + "grad_norm": 9.503788822011982, + "learning_rate": 1.8563885061569188e-05, + "loss": 0.5235, + "step": 6315 + }, + { + "epoch": 2.568523790158601, + "grad_norm": 20.67930633372342, + "learning_rate": 1.8563361876131354e-05, + "loss": 0.8232, + "step": 6316 + }, + { + "epoch": 2.568930459536397, + "grad_norm": 8.452791724663172, + "learning_rate": 1.8562838602786303e-05, + "loss": 0.3901, + "step": 6317 + }, + { + "epoch": 2.5693371289141926, + "grad_norm": 6.038285164189252, + "learning_rate": 1.8562315241539398e-05, + "loss": 0.1971, + "step": 6318 + }, + { + "epoch": 2.569743798291989, + "grad_norm": 6.806592532758327, + "learning_rate": 1.8561791792396023e-05, + "loss": 
0.243, + "step": 6319 + }, + { + "epoch": 2.5701504676697846, + "grad_norm": 0.4905055510296035, + "learning_rate": 1.8561268255361545e-05, + "loss": 0.0081, + "step": 6320 + }, + { + "epoch": 2.5705571370475804, + "grad_norm": 18.96948199336655, + "learning_rate": 1.8560744630441338e-05, + "loss": 0.6415, + "step": 6321 + }, + { + "epoch": 2.570963806425376, + "grad_norm": 10.83721134707552, + "learning_rate": 1.8560220917640777e-05, + "loss": 0.3089, + "step": 6322 + }, + { + "epoch": 2.571370475803172, + "grad_norm": 13.97051357477745, + "learning_rate": 1.8559697116965245e-05, + "loss": 0.7603, + "step": 6323 + }, + { + "epoch": 2.5717771451809677, + "grad_norm": 8.665700540817776, + "learning_rate": 1.8559173228420113e-05, + "loss": 0.3337, + "step": 6324 + }, + { + "epoch": 2.572183814558764, + "grad_norm": 11.844781249035805, + "learning_rate": 1.855864925201076e-05, + "loss": 0.7062, + "step": 6325 + }, + { + "epoch": 2.5725904839365596, + "grad_norm": 5.2542587318690375, + "learning_rate": 1.8558125187742565e-05, + "loss": 0.0894, + "step": 6326 + }, + { + "epoch": 2.5729971533143554, + "grad_norm": 8.770813626873565, + "learning_rate": 1.8557601035620903e-05, + "loss": 0.2322, + "step": 6327 + }, + { + "epoch": 2.573403822692151, + "grad_norm": 1.9992186550307893, + "learning_rate": 1.8557076795651163e-05, + "loss": 0.0365, + "step": 6328 + }, + { + "epoch": 2.5738104920699474, + "grad_norm": 6.316319754921874, + "learning_rate": 1.8556552467838723e-05, + "loss": 0.3714, + "step": 6329 + }, + { + "epoch": 2.574217161447743, + "grad_norm": 17.726652817224952, + "learning_rate": 1.855602805218897e-05, + "loss": 0.8079, + "step": 6330 + }, + { + "epoch": 2.574623830825539, + "grad_norm": 3.2948483079806183, + "learning_rate": 1.8555503548707276e-05, + "loss": 0.0862, + "step": 6331 + }, + { + "epoch": 2.5750305002033347, + "grad_norm": 10.084666865100507, + "learning_rate": 1.8554978957399034e-05, + "loss": 0.3137, + "step": 6332 + }, + { + "epoch": 2.5754371695811304, + "grad_norm": 10.521248329195776, + "learning_rate": 1.855445427826963e-05, + "loss": 0.2402, + "step": 6333 + }, + { + "epoch": 2.575843838958926, + "grad_norm": 4.888176374003317, + "learning_rate": 1.8553929511324444e-05, + "loss": 0.1275, + "step": 6334 + }, + { + "epoch": 2.576250508336722, + "grad_norm": 6.832723220316719, + "learning_rate": 1.855340465656887e-05, + "loss": 0.0772, + "step": 6335 + }, + { + "epoch": 2.576657177714518, + "grad_norm": 0.3408580571607449, + "learning_rate": 1.855287971400829e-05, + "loss": 0.0068, + "step": 6336 + }, + { + "epoch": 2.577063847092314, + "grad_norm": 5.965589202511855, + "learning_rate": 1.8552354683648094e-05, + "loss": 0.3148, + "step": 6337 + }, + { + "epoch": 2.5774705164701097, + "grad_norm": 5.7920034603745, + "learning_rate": 1.8551829565493677e-05, + "loss": 0.3209, + "step": 6338 + }, + { + "epoch": 2.577877185847906, + "grad_norm": 12.553423467317362, + "learning_rate": 1.855130435955042e-05, + "loss": 0.5683, + "step": 6339 + }, + { + "epoch": 2.5782838552257017, + "grad_norm": 11.17321483673995, + "learning_rate": 1.8550779065823724e-05, + "loss": 0.4082, + "step": 6340 + }, + { + "epoch": 2.5786905246034975, + "grad_norm": 6.978001322393126, + "learning_rate": 1.8550253684318974e-05, + "loss": 0.1725, + "step": 6341 + }, + { + "epoch": 2.5790971939812932, + "grad_norm": 12.947374332212302, + "learning_rate": 1.854972821504157e-05, + "loss": 0.4817, + "step": 6342 + }, + { + "epoch": 2.579503863359089, + "grad_norm": 1.5746002094128733, + 
"learning_rate": 1.8549202657996902e-05, + "loss": 0.0265, + "step": 6343 + }, + { + "epoch": 2.5799105327368848, + "grad_norm": 15.249884052891128, + "learning_rate": 1.8548677013190364e-05, + "loss": 0.4585, + "step": 6344 + }, + { + "epoch": 2.5803172021146805, + "grad_norm": 2.6672835301667948, + "learning_rate": 1.8548151280627354e-05, + "loss": 0.0251, + "step": 6345 + }, + { + "epoch": 2.5807238714924767, + "grad_norm": 4.191842538431198, + "learning_rate": 1.8547625460313276e-05, + "loss": 0.198, + "step": 6346 + }, + { + "epoch": 2.5811305408702725, + "grad_norm": 8.80109985775295, + "learning_rate": 1.8547099552253513e-05, + "loss": 0.287, + "step": 6347 + }, + { + "epoch": 2.5815372102480683, + "grad_norm": 8.027673878952715, + "learning_rate": 1.8546573556453476e-05, + "loss": 0.2144, + "step": 6348 + }, + { + "epoch": 2.581943879625864, + "grad_norm": 1.2432937612033852, + "learning_rate": 1.8546047472918556e-05, + "loss": 0.0177, + "step": 6349 + }, + { + "epoch": 2.5823505490036602, + "grad_norm": 8.797022243665683, + "learning_rate": 1.854552130165416e-05, + "loss": 0.6878, + "step": 6350 + }, + { + "epoch": 2.582757218381456, + "grad_norm": 3.2022586071833143, + "learning_rate": 1.8544995042665686e-05, + "loss": 0.0604, + "step": 6351 + }, + { + "epoch": 2.5831638877592518, + "grad_norm": 11.335942399626031, + "learning_rate": 1.8544468695958537e-05, + "loss": 0.5401, + "step": 6352 + }, + { + "epoch": 2.5835705571370475, + "grad_norm": 19.633130926302396, + "learning_rate": 1.8543942261538118e-05, + "loss": 1.5175, + "step": 6353 + }, + { + "epoch": 2.5839772265148433, + "grad_norm": 1.2114816837099605, + "learning_rate": 1.8543415739409832e-05, + "loss": 0.0243, + "step": 6354 + }, + { + "epoch": 2.584383895892639, + "grad_norm": 10.900645723913877, + "learning_rate": 1.854288912957908e-05, + "loss": 0.5245, + "step": 6355 + }, + { + "epoch": 2.5847905652704353, + "grad_norm": 6.019770446916023, + "learning_rate": 1.8542362432051276e-05, + "loss": 0.1352, + "step": 6356 + }, + { + "epoch": 2.585197234648231, + "grad_norm": 11.13579099214977, + "learning_rate": 1.854183564683182e-05, + "loss": 0.5636, + "step": 6357 + }, + { + "epoch": 2.585603904026027, + "grad_norm": 6.614887154876482, + "learning_rate": 1.854130877392612e-05, + "loss": 0.1717, + "step": 6358 + }, + { + "epoch": 2.5860105734038226, + "grad_norm": 4.530224485003344, + "learning_rate": 1.854078181333959e-05, + "loss": 0.0771, + "step": 6359 + }, + { + "epoch": 2.5864172427816188, + "grad_norm": 8.559472768416, + "learning_rate": 1.8540254765077638e-05, + "loss": 0.2399, + "step": 6360 + }, + { + "epoch": 2.5868239121594145, + "grad_norm": 11.895292092619693, + "learning_rate": 1.8539727629145667e-05, + "loss": 0.5733, + "step": 6361 + }, + { + "epoch": 2.5872305815372103, + "grad_norm": 10.2307215588734, + "learning_rate": 1.85392004055491e-05, + "loss": 0.3153, + "step": 6362 + }, + { + "epoch": 2.587637250915006, + "grad_norm": 8.781932560296456, + "learning_rate": 1.8538673094293336e-05, + "loss": 0.4222, + "step": 6363 + }, + { + "epoch": 2.588043920292802, + "grad_norm": 10.633951484097228, + "learning_rate": 1.85381456953838e-05, + "loss": 0.2904, + "step": 6364 + }, + { + "epoch": 2.5884505896705976, + "grad_norm": 7.438741921385809, + "learning_rate": 1.85376182088259e-05, + "loss": 0.2654, + "step": 6365 + }, + { + "epoch": 2.588857259048394, + "grad_norm": 8.068010572997316, + "learning_rate": 1.8537090634625055e-05, + "loss": 0.3626, + "step": 6366 + }, + { + "epoch": 2.5892639284261896, + 
"grad_norm": 11.755984448745016, + "learning_rate": 1.8536562972786675e-05, + "loss": 0.6616, + "step": 6367 + }, + { + "epoch": 2.5896705978039853, + "grad_norm": 4.054277924221699, + "learning_rate": 1.8536035223316177e-05, + "loss": 0.0806, + "step": 6368 + }, + { + "epoch": 2.590077267181781, + "grad_norm": 9.717321134731227, + "learning_rate": 1.853550738621899e-05, + "loss": 0.4159, + "step": 6369 + }, + { + "epoch": 2.5904839365595773, + "grad_norm": 12.369232383511463, + "learning_rate": 1.8534979461500515e-05, + "loss": 0.5547, + "step": 6370 + }, + { + "epoch": 2.590890605937373, + "grad_norm": 5.630423391493949, + "learning_rate": 1.8534451449166186e-05, + "loss": 0.226, + "step": 6371 + }, + { + "epoch": 2.591297275315169, + "grad_norm": 2.402762712095213, + "learning_rate": 1.8533923349221413e-05, + "loss": 0.0438, + "step": 6372 + }, + { + "epoch": 2.5917039446929646, + "grad_norm": 0.024893295527580718, + "learning_rate": 1.8533395161671624e-05, + "loss": 0.0004, + "step": 6373 + }, + { + "epoch": 2.5921106140707604, + "grad_norm": 5.9640045390296486, + "learning_rate": 1.853286688652224e-05, + "loss": 0.413, + "step": 6374 + }, + { + "epoch": 2.592517283448556, + "grad_norm": 1.4060549893812655, + "learning_rate": 1.8532338523778682e-05, + "loss": 0.024, + "step": 6375 + }, + { + "epoch": 2.592923952826352, + "grad_norm": 7.22237178611479, + "learning_rate": 1.8531810073446374e-05, + "loss": 0.2192, + "step": 6376 + }, + { + "epoch": 2.593330622204148, + "grad_norm": 13.922969350579457, + "learning_rate": 1.8531281535530744e-05, + "loss": 0.6399, + "step": 6377 + }, + { + "epoch": 2.593737291581944, + "grad_norm": 8.78224294449624, + "learning_rate": 1.8530752910037214e-05, + "loss": 0.569, + "step": 6378 + }, + { + "epoch": 2.5941439609597396, + "grad_norm": 7.661845519564963, + "learning_rate": 1.853022419697121e-05, + "loss": 0.2656, + "step": 6379 + }, + { + "epoch": 2.594550630337536, + "grad_norm": 11.901922472408176, + "learning_rate": 1.8529695396338164e-05, + "loss": 0.6305, + "step": 6380 + }, + { + "epoch": 2.5949572997153316, + "grad_norm": 11.192212189245133, + "learning_rate": 1.85291665081435e-05, + "loss": 0.3812, + "step": 6381 + }, + { + "epoch": 2.5953639690931274, + "grad_norm": 12.931875000384487, + "learning_rate": 1.852863753239265e-05, + "loss": 0.6078, + "step": 6382 + }, + { + "epoch": 2.595770638470923, + "grad_norm": 18.022893702422625, + "learning_rate": 1.8528108469091046e-05, + "loss": 0.9862, + "step": 6383 + }, + { + "epoch": 2.596177307848719, + "grad_norm": 10.772446773055547, + "learning_rate": 1.852757931824411e-05, + "loss": 0.2756, + "step": 6384 + }, + { + "epoch": 2.5965839772265147, + "grad_norm": 3.129736305754636, + "learning_rate": 1.8527050079857288e-05, + "loss": 0.0639, + "step": 6385 + }, + { + "epoch": 2.5969906466043104, + "grad_norm": 11.06167461527365, + "learning_rate": 1.8526520753936e-05, + "loss": 0.2646, + "step": 6386 + }, + { + "epoch": 2.5973973159821067, + "grad_norm": 8.683667786857878, + "learning_rate": 1.8525991340485687e-05, + "loss": 0.3218, + "step": 6387 + }, + { + "epoch": 2.5978039853599024, + "grad_norm": 4.348643559526759, + "learning_rate": 1.8525461839511783e-05, + "loss": 0.1014, + "step": 6388 + }, + { + "epoch": 2.598210654737698, + "grad_norm": 2.3280049396986477, + "learning_rate": 1.8524932251019722e-05, + "loss": 0.0405, + "step": 6389 + }, + { + "epoch": 2.598617324115494, + "grad_norm": 10.390747441565521, + "learning_rate": 1.852440257501494e-05, + "loss": 0.3177, + "step": 6390 + }, + { + 
"epoch": 2.59902399349329, + "grad_norm": 8.56483265804225, + "learning_rate": 1.8523872811502874e-05, + "loss": 0.3418, + "step": 6391 + }, + { + "epoch": 2.599430662871086, + "grad_norm": 10.09779146618577, + "learning_rate": 1.8523342960488966e-05, + "loss": 0.4309, + "step": 6392 + }, + { + "epoch": 2.5998373322488817, + "grad_norm": 10.109006556858843, + "learning_rate": 1.8522813021978652e-05, + "loss": 0.4121, + "step": 6393 + }, + { + "epoch": 2.6002440016266775, + "grad_norm": 9.595600902793029, + "learning_rate": 1.8522282995977373e-05, + "loss": 0.6499, + "step": 6394 + }, + { + "epoch": 2.6006506710044732, + "grad_norm": 10.82122202953507, + "learning_rate": 1.8521752882490568e-05, + "loss": 0.6295, + "step": 6395 + }, + { + "epoch": 2.601057340382269, + "grad_norm": 14.382715335245912, + "learning_rate": 1.8521222681523683e-05, + "loss": 0.5932, + "step": 6396 + }, + { + "epoch": 2.601464009760065, + "grad_norm": 9.104699850782893, + "learning_rate": 1.8520692393082158e-05, + "loss": 0.2427, + "step": 6397 + }, + { + "epoch": 2.601870679137861, + "grad_norm": 2.3193524820841924, + "learning_rate": 1.8520162017171437e-05, + "loss": 0.0527, + "step": 6398 + }, + { + "epoch": 2.6022773485156567, + "grad_norm": 5.027937345380868, + "learning_rate": 1.8519631553796964e-05, + "loss": 0.0718, + "step": 6399 + }, + { + "epoch": 2.6026840178934525, + "grad_norm": 4.773858984453752, + "learning_rate": 1.8519101002964185e-05, + "loss": 0.1317, + "step": 6400 + }, + { + "epoch": 2.6030906872712487, + "grad_norm": 9.203532189787033, + "learning_rate": 1.851857036467855e-05, + "loss": 0.2399, + "step": 6401 + }, + { + "epoch": 2.6034973566490445, + "grad_norm": 10.443255875140405, + "learning_rate": 1.85180396389455e-05, + "loss": 0.3113, + "step": 6402 + }, + { + "epoch": 2.6039040260268402, + "grad_norm": 0.26101118653817007, + "learning_rate": 1.8517508825770488e-05, + "loss": 0.0039, + "step": 6403 + }, + { + "epoch": 2.604310695404636, + "grad_norm": 7.922308522462792, + "learning_rate": 1.851697792515896e-05, + "loss": 0.2343, + "step": 6404 + }, + { + "epoch": 2.6047173647824318, + "grad_norm": 3.2648671932491578, + "learning_rate": 1.8516446937116365e-05, + "loss": 0.0504, + "step": 6405 + }, + { + "epoch": 2.6051240341602275, + "grad_norm": 11.052349132060066, + "learning_rate": 1.851591586164816e-05, + "loss": 0.4852, + "step": 6406 + }, + { + "epoch": 2.6055307035380237, + "grad_norm": 20.63297504436313, + "learning_rate": 1.851538469875979e-05, + "loss": 1.1584, + "step": 6407 + }, + { + "epoch": 2.6059373729158195, + "grad_norm": 8.425546635383908, + "learning_rate": 1.851485344845671e-05, + "loss": 0.3268, + "step": 6408 + }, + { + "epoch": 2.6063440422936153, + "grad_norm": 6.3457546556697215, + "learning_rate": 1.8514322110744378e-05, + "loss": 0.1317, + "step": 6409 + }, + { + "epoch": 2.606750711671411, + "grad_norm": 5.993008778068524, + "learning_rate": 1.8513790685628242e-05, + "loss": 0.1348, + "step": 6410 + }, + { + "epoch": 2.6071573810492072, + "grad_norm": 4.095294320346566, + "learning_rate": 1.8513259173113757e-05, + "loss": 0.1159, + "step": 6411 + }, + { + "epoch": 2.607564050427003, + "grad_norm": 7.588691383090242, + "learning_rate": 1.8512727573206383e-05, + "loss": 0.2203, + "step": 6412 + }, + { + "epoch": 2.6079707198047988, + "grad_norm": 9.407379950204755, + "learning_rate": 1.851219588591158e-05, + "loss": 0.4453, + "step": 6413 + }, + { + "epoch": 2.6083773891825945, + "grad_norm": 8.200609763282415, + "learning_rate": 1.85116641112348e-05, + "loss": 
0.2222, + "step": 6414 + }, + { + "epoch": 2.6087840585603903, + "grad_norm": 4.154207562522179, + "learning_rate": 1.85111322491815e-05, + "loss": 0.1167, + "step": 6415 + }, + { + "epoch": 2.609190727938186, + "grad_norm": 16.191933644647445, + "learning_rate": 1.851060029975715e-05, + "loss": 1.2098, + "step": 6416 + }, + { + "epoch": 2.609597397315982, + "grad_norm": 5.555172119950842, + "learning_rate": 1.85100682629672e-05, + "loss": 0.1019, + "step": 6417 + }, + { + "epoch": 2.610004066693778, + "grad_norm": 8.445985271768642, + "learning_rate": 1.8509536138817117e-05, + "loss": 0.3932, + "step": 6418 + }, + { + "epoch": 2.610410736071574, + "grad_norm": 8.861219737107032, + "learning_rate": 1.8509003927312362e-05, + "loss": 0.3684, + "step": 6419 + }, + { + "epoch": 2.6108174054493696, + "grad_norm": 1.5157835778161795, + "learning_rate": 1.8508471628458404e-05, + "loss": 0.0322, + "step": 6420 + }, + { + "epoch": 2.611224074827166, + "grad_norm": 8.327562921896595, + "learning_rate": 1.85079392422607e-05, + "loss": 0.1876, + "step": 6421 + }, + { + "epoch": 2.6116307442049616, + "grad_norm": 5.90506702025744, + "learning_rate": 1.8507406768724715e-05, + "loss": 0.1911, + "step": 6422 + }, + { + "epoch": 2.6120374135827573, + "grad_norm": 11.158829744108527, + "learning_rate": 1.850687420785592e-05, + "loss": 0.7535, + "step": 6423 + }, + { + "epoch": 2.612444082960553, + "grad_norm": 7.883359508507568, + "learning_rate": 1.8506341559659783e-05, + "loss": 0.44, + "step": 6424 + }, + { + "epoch": 2.612850752338349, + "grad_norm": 6.794139642495854, + "learning_rate": 1.8505808824141763e-05, + "loss": 0.1424, + "step": 6425 + }, + { + "epoch": 2.6132574217161446, + "grad_norm": 4.070764420227437, + "learning_rate": 1.8505276001307336e-05, + "loss": 0.133, + "step": 6426 + }, + { + "epoch": 2.6136640910939404, + "grad_norm": 13.681246202557329, + "learning_rate": 1.8504743091161975e-05, + "loss": 0.8174, + "step": 6427 + }, + { + "epoch": 2.6140707604717366, + "grad_norm": 9.87565765853315, + "learning_rate": 1.850421009371114e-05, + "loss": 0.183, + "step": 6428 + }, + { + "epoch": 2.6144774298495324, + "grad_norm": 8.026574229261342, + "learning_rate": 1.8503677008960312e-05, + "loss": 0.2221, + "step": 6429 + }, + { + "epoch": 2.614884099227328, + "grad_norm": 12.445817407401394, + "learning_rate": 1.8503143836914956e-05, + "loss": 0.6669, + "step": 6430 + }, + { + "epoch": 2.615290768605124, + "grad_norm": 6.231455378269459, + "learning_rate": 1.850261057758055e-05, + "loss": 0.3095, + "step": 6431 + }, + { + "epoch": 2.61569743798292, + "grad_norm": 5.831305510429465, + "learning_rate": 1.8502077230962568e-05, + "loss": 0.2918, + "step": 6432 + }, + { + "epoch": 2.616104107360716, + "grad_norm": 5.54300170797321, + "learning_rate": 1.8501543797066487e-05, + "loss": 0.155, + "step": 6433 + }, + { + "epoch": 2.6165107767385116, + "grad_norm": 9.695452570623848, + "learning_rate": 1.8501010275897776e-05, + "loss": 0.4303, + "step": 6434 + }, + { + "epoch": 2.6169174461163074, + "grad_norm": 1.5469288916059785, + "learning_rate": 1.8500476667461923e-05, + "loss": 0.0323, + "step": 6435 + }, + { + "epoch": 2.617324115494103, + "grad_norm": 1.1130518565641516, + "learning_rate": 1.849994297176439e-05, + "loss": 0.0165, + "step": 6436 + }, + { + "epoch": 2.617730784871899, + "grad_norm": 12.712702275124991, + "learning_rate": 1.849940918881067e-05, + "loss": 0.5242, + "step": 6437 + }, + { + "epoch": 2.618137454249695, + "grad_norm": 2.7076056707918488, + "learning_rate": 
1.8498875318606234e-05, + "loss": 0.0374, + "step": 6438 + }, + { + "epoch": 2.618544123627491, + "grad_norm": 6.4576133616648725, + "learning_rate": 1.8498341361156567e-05, + "loss": 0.1943, + "step": 6439 + }, + { + "epoch": 2.6189507930052867, + "grad_norm": 15.341069202054168, + "learning_rate": 1.849780731646715e-05, + "loss": 0.6147, + "step": 6440 + }, + { + "epoch": 2.6193574623830824, + "grad_norm": 4.775263950614629, + "learning_rate": 1.849727318454346e-05, + "loss": 0.1314, + "step": 6441 + }, + { + "epoch": 2.6197641317608786, + "grad_norm": 4.937882044449621, + "learning_rate": 1.8496738965390988e-05, + "loss": 0.0906, + "step": 6442 + }, + { + "epoch": 2.6201708011386744, + "grad_norm": 6.5261217503116296, + "learning_rate": 1.8496204659015213e-05, + "loss": 0.1641, + "step": 6443 + }, + { + "epoch": 2.62057747051647, + "grad_norm": 12.154148195605725, + "learning_rate": 1.849567026542162e-05, + "loss": 0.6513, + "step": 6444 + }, + { + "epoch": 2.620984139894266, + "grad_norm": 4.002038039301157, + "learning_rate": 1.8495135784615697e-05, + "loss": 0.1193, + "step": 6445 + }, + { + "epoch": 2.6213908092720617, + "grad_norm": 0.7788520066667658, + "learning_rate": 1.8494601216602927e-05, + "loss": 0.013, + "step": 6446 + }, + { + "epoch": 2.6217974786498575, + "grad_norm": 3.9728103216197814, + "learning_rate": 1.8494066561388802e-05, + "loss": 0.0514, + "step": 6447 + }, + { + "epoch": 2.6222041480276537, + "grad_norm": 5.560618092777942, + "learning_rate": 1.8493531818978813e-05, + "loss": 0.244, + "step": 6448 + }, + { + "epoch": 2.6226108174054494, + "grad_norm": 6.014785254551848, + "learning_rate": 1.849299698937844e-05, + "loss": 0.107, + "step": 6449 + }, + { + "epoch": 2.623017486783245, + "grad_norm": 3.8348645323474804, + "learning_rate": 1.8492462072593182e-05, + "loss": 0.1013, + "step": 6450 + }, + { + "epoch": 2.623424156161041, + "grad_norm": 11.86016832024028, + "learning_rate": 1.8491927068628522e-05, + "loss": 0.6598, + "step": 6451 + }, + { + "epoch": 2.623830825538837, + "grad_norm": 4.036815213450676, + "learning_rate": 1.8491391977489963e-05, + "loss": 0.06, + "step": 6452 + }, + { + "epoch": 2.624237494916633, + "grad_norm": 12.619786684692341, + "learning_rate": 1.8490856799182988e-05, + "loss": 0.8524, + "step": 6453 + }, + { + "epoch": 2.6246441642944287, + "grad_norm": 4.39909217447579, + "learning_rate": 1.8490321533713094e-05, + "loss": 0.0709, + "step": 6454 + }, + { + "epoch": 2.6250508336722245, + "grad_norm": 18.019648480545683, + "learning_rate": 1.8489786181085777e-05, + "loss": 1.0884, + "step": 6455 + }, + { + "epoch": 2.6254575030500202, + "grad_norm": 3.4410183311891953, + "learning_rate": 1.848925074130653e-05, + "loss": 0.2246, + "step": 6456 + }, + { + "epoch": 2.625864172427816, + "grad_norm": 8.819402907760036, + "learning_rate": 1.8488715214380857e-05, + "loss": 0.2547, + "step": 6457 + }, + { + "epoch": 2.6262708418056118, + "grad_norm": 0.4090556543516786, + "learning_rate": 1.8488179600314245e-05, + "loss": 0.0047, + "step": 6458 + }, + { + "epoch": 2.626677511183408, + "grad_norm": 4.3232120888201155, + "learning_rate": 1.84876438991122e-05, + "loss": 0.0882, + "step": 6459 + }, + { + "epoch": 2.6270841805612037, + "grad_norm": 11.164306977723808, + "learning_rate": 1.848710811078022e-05, + "loss": 0.5219, + "step": 6460 + }, + { + "epoch": 2.6274908499389995, + "grad_norm": 1.5387161216059198, + "learning_rate": 1.84865722353238e-05, + "loss": 0.0335, + "step": 6461 + }, + { + "epoch": 2.6278975193167957, + "grad_norm": 
2.5239535055744104, + "learning_rate": 1.848603627274845e-05, + "loss": 0.0412, + "step": 6462 + }, + { + "epoch": 2.6283041886945915, + "grad_norm": 8.74755456559964, + "learning_rate": 1.848550022305966e-05, + "loss": 0.6403, + "step": 6463 + }, + { + "epoch": 2.6287108580723872, + "grad_norm": 15.444153524229707, + "learning_rate": 1.848496408626294e-05, + "loss": 0.725, + "step": 6464 + }, + { + "epoch": 2.629117527450183, + "grad_norm": 16.629148929779525, + "learning_rate": 1.84844278623638e-05, + "loss": 0.7497, + "step": 6465 + }, + { + "epoch": 2.6295241968279788, + "grad_norm": 9.877448723171932, + "learning_rate": 1.8483891551367735e-05, + "loss": 0.2091, + "step": 6466 + }, + { + "epoch": 2.6299308662057745, + "grad_norm": 0.6842030687284746, + "learning_rate": 1.848335515328025e-05, + "loss": 0.0129, + "step": 6467 + }, + { + "epoch": 2.6303375355835703, + "grad_norm": 1.6990843180871291, + "learning_rate": 1.8482818668106857e-05, + "loss": 0.0351, + "step": 6468 + }, + { + "epoch": 2.6307442049613665, + "grad_norm": 4.7776413950413685, + "learning_rate": 1.8482282095853062e-05, + "loss": 0.0682, + "step": 6469 + }, + { + "epoch": 2.6311508743391623, + "grad_norm": 24.144396998944543, + "learning_rate": 1.848174543652437e-05, + "loss": 0.3758, + "step": 6470 + }, + { + "epoch": 2.631557543716958, + "grad_norm": 4.461207498416919, + "learning_rate": 1.8481208690126294e-05, + "loss": 0.0919, + "step": 6471 + }, + { + "epoch": 2.631964213094754, + "grad_norm": 5.078735773879406, + "learning_rate": 1.8480671856664344e-05, + "loss": 0.3164, + "step": 6472 + }, + { + "epoch": 2.63237088247255, + "grad_norm": 3.541143546651406, + "learning_rate": 1.8480134936144028e-05, + "loss": 0.0608, + "step": 6473 + }, + { + "epoch": 2.632777551850346, + "grad_norm": 2.419415082719963, + "learning_rate": 1.8479597928570856e-05, + "loss": 0.0582, + "step": 6474 + }, + { + "epoch": 2.6331842212281416, + "grad_norm": 2.084664277813359, + "learning_rate": 1.8479060833950345e-05, + "loss": 0.0551, + "step": 6475 + }, + { + "epoch": 2.6335908906059373, + "grad_norm": 3.7394493260366053, + "learning_rate": 1.8478523652288006e-05, + "loss": 0.0877, + "step": 6476 + }, + { + "epoch": 2.633997559983733, + "grad_norm": 17.89189042150652, + "learning_rate": 1.847798638358936e-05, + "loss": 1.3003, + "step": 6477 + }, + { + "epoch": 2.634404229361529, + "grad_norm": 8.739662414289032, + "learning_rate": 1.847744902785991e-05, + "loss": 0.211, + "step": 6478 + }, + { + "epoch": 2.634810898739325, + "grad_norm": 6.051063975194219, + "learning_rate": 1.8476911585105185e-05, + "loss": 0.0923, + "step": 6479 + }, + { + "epoch": 2.635217568117121, + "grad_norm": 4.752406918202532, + "learning_rate": 1.8476374055330693e-05, + "loss": 0.1482, + "step": 6480 + }, + { + "epoch": 2.6356242374949166, + "grad_norm": 1.5479023257895221, + "learning_rate": 1.8475836438541956e-05, + "loss": 0.0139, + "step": 6481 + }, + { + "epoch": 2.6360309068727124, + "grad_norm": 3.952817772941528, + "learning_rate": 1.847529873474449e-05, + "loss": 0.0552, + "step": 6482 + }, + { + "epoch": 2.6364375762505086, + "grad_norm": 2.677348274239605, + "learning_rate": 1.8474760943943822e-05, + "loss": 0.0752, + "step": 6483 + }, + { + "epoch": 2.6368442456283043, + "grad_norm": 0.6910610246458938, + "learning_rate": 1.8474223066145463e-05, + "loss": 0.01, + "step": 6484 + }, + { + "epoch": 2.6372509150061, + "grad_norm": 2.575518706725314, + "learning_rate": 1.847368510135494e-05, + "loss": 0.0389, + "step": 6485 + }, + { + "epoch": 
2.637657584383896, + "grad_norm": 15.51713121993795, + "learning_rate": 1.8473147049577777e-05, + "loss": 0.8827, + "step": 6486 + }, + { + "epoch": 2.6380642537616916, + "grad_norm": 9.526451137550188, + "learning_rate": 1.847260891081949e-05, + "loss": 0.3097, + "step": 6487 + }, + { + "epoch": 2.6384709231394874, + "grad_norm": 11.563011939404447, + "learning_rate": 1.8472070685085618e-05, + "loss": 0.3579, + "step": 6488 + }, + { + "epoch": 2.6388775925172836, + "grad_norm": 6.669014605956632, + "learning_rate": 1.8471532372381666e-05, + "loss": 0.345, + "step": 6489 + }, + { + "epoch": 2.6392842618950794, + "grad_norm": 8.478824273048332, + "learning_rate": 1.847099397271318e-05, + "loss": 0.3309, + "step": 6490 + }, + { + "epoch": 2.639690931272875, + "grad_norm": 7.316873001153509, + "learning_rate": 1.847045548608567e-05, + "loss": 0.2295, + "step": 6491 + }, + { + "epoch": 2.640097600650671, + "grad_norm": 7.473720659245381, + "learning_rate": 1.8469916912504672e-05, + "loss": 0.2334, + "step": 6492 + }, + { + "epoch": 2.640504270028467, + "grad_norm": 1.2577477978468112, + "learning_rate": 1.8469378251975716e-05, + "loss": 0.0199, + "step": 6493 + }, + { + "epoch": 2.640910939406263, + "grad_norm": 6.658370525608987, + "learning_rate": 1.846883950450433e-05, + "loss": 0.1423, + "step": 6494 + }, + { + "epoch": 2.6413176087840586, + "grad_norm": 25.08716748458685, + "learning_rate": 1.8468300670096046e-05, + "loss": 1.4473, + "step": 6495 + }, + { + "epoch": 2.6417242781618544, + "grad_norm": 9.913496645357265, + "learning_rate": 1.8467761748756393e-05, + "loss": 0.3438, + "step": 6496 + }, + { + "epoch": 2.64213094753965, + "grad_norm": 1.0764661074520814, + "learning_rate": 1.84672227404909e-05, + "loss": 0.0156, + "step": 6497 + }, + { + "epoch": 2.642537616917446, + "grad_norm": 9.198468593921824, + "learning_rate": 1.8466683645305106e-05, + "loss": 0.3686, + "step": 6498 + }, + { + "epoch": 2.6429442862952417, + "grad_norm": 10.22759209878371, + "learning_rate": 1.8466144463204544e-05, + "loss": 0.3743, + "step": 6499 + }, + { + "epoch": 2.643350955673038, + "grad_norm": 4.5237947315408835, + "learning_rate": 1.846560519419475e-05, + "loss": 0.1179, + "step": 6500 + }, + { + "epoch": 2.6437576250508337, + "grad_norm": 13.984555754079455, + "learning_rate": 1.8465065838281252e-05, + "loss": 0.4922, + "step": 6501 + }, + { + "epoch": 2.6441642944286294, + "grad_norm": 2.67276984482902, + "learning_rate": 1.8464526395469597e-05, + "loss": 0.0446, + "step": 6502 + }, + { + "epoch": 2.6445709638064256, + "grad_norm": 9.37137298338281, + "learning_rate": 1.846398686576532e-05, + "loss": 0.3502, + "step": 6503 + }, + { + "epoch": 2.6449776331842214, + "grad_norm": 10.223347324526856, + "learning_rate": 1.8463447249173954e-05, + "loss": 0.4834, + "step": 6504 + }, + { + "epoch": 2.645384302562017, + "grad_norm": 11.607523280722496, + "learning_rate": 1.8462907545701046e-05, + "loss": 0.4397, + "step": 6505 + }, + { + "epoch": 2.645790971939813, + "grad_norm": 3.6641916307108784, + "learning_rate": 1.846236775535213e-05, + "loss": 0.062, + "step": 6506 + }, + { + "epoch": 2.6461976413176087, + "grad_norm": 5.112736559322515, + "learning_rate": 1.8461827878132755e-05, + "loss": 0.1033, + "step": 6507 + }, + { + "epoch": 2.6466043106954045, + "grad_norm": 5.934543679649028, + "learning_rate": 1.8461287914048453e-05, + "loss": 0.1396, + "step": 6508 + }, + { + "epoch": 2.6470109800732002, + "grad_norm": 3.255781697234169, + "learning_rate": 1.8460747863104774e-05, + "loss": 0.0303, + 
"step": 6509 + }, + { + "epoch": 2.6474176494509964, + "grad_norm": 3.906514860673338, + "learning_rate": 1.846020772530726e-05, + "loss": 0.1853, + "step": 6510 + }, + { + "epoch": 2.647824318828792, + "grad_norm": 14.923464386861562, + "learning_rate": 1.8459667500661455e-05, + "loss": 0.5197, + "step": 6511 + }, + { + "epoch": 2.648230988206588, + "grad_norm": 7.414589833498077, + "learning_rate": 1.8459127189172904e-05, + "loss": 0.1691, + "step": 6512 + }, + { + "epoch": 2.6486376575843837, + "grad_norm": 7.376948965771882, + "learning_rate": 1.8458586790847156e-05, + "loss": 0.2352, + "step": 6513 + }, + { + "epoch": 2.64904432696218, + "grad_norm": 7.241184769836039, + "learning_rate": 1.845804630568976e-05, + "loss": 0.2925, + "step": 6514 + }, + { + "epoch": 2.6494509963399757, + "grad_norm": 6.433336895683338, + "learning_rate": 1.845750573370626e-05, + "loss": 0.1682, + "step": 6515 + }, + { + "epoch": 2.6498576657177715, + "grad_norm": 7.369224528044006, + "learning_rate": 1.8456965074902206e-05, + "loss": 0.4101, + "step": 6516 + }, + { + "epoch": 2.6502643350955672, + "grad_norm": 7.093105640077427, + "learning_rate": 1.8456424329283148e-05, + "loss": 0.1749, + "step": 6517 + }, + { + "epoch": 2.650671004473363, + "grad_norm": 7.662193976846102, + "learning_rate": 1.8455883496854644e-05, + "loss": 0.2358, + "step": 6518 + }, + { + "epoch": 2.651077673851159, + "grad_norm": 7.373149697237328, + "learning_rate": 1.8455342577622234e-05, + "loss": 0.5701, + "step": 6519 + }, + { + "epoch": 2.651484343228955, + "grad_norm": 4.924940562893699, + "learning_rate": 1.845480157159148e-05, + "loss": 0.2989, + "step": 6520 + }, + { + "epoch": 2.6518910126067508, + "grad_norm": 17.190483035534818, + "learning_rate": 1.8454260478767932e-05, + "loss": 0.5163, + "step": 6521 + }, + { + "epoch": 2.6522976819845465, + "grad_norm": 3.5649041382694224, + "learning_rate": 1.8453719299157147e-05, + "loss": 0.1007, + "step": 6522 + }, + { + "epoch": 2.6527043513623423, + "grad_norm": 9.808496761630588, + "learning_rate": 1.845317803276468e-05, + "loss": 0.1968, + "step": 6523 + }, + { + "epoch": 2.6531110207401385, + "grad_norm": 15.30851865386454, + "learning_rate": 1.8452636679596077e-05, + "loss": 0.9295, + "step": 6524 + }, + { + "epoch": 2.6535176901179343, + "grad_norm": 7.736685083158915, + "learning_rate": 1.8452095239656912e-05, + "loss": 0.4604, + "step": 6525 + }, + { + "epoch": 2.65392435949573, + "grad_norm": 12.982927960525604, + "learning_rate": 1.8451553712952734e-05, + "loss": 0.914, + "step": 6526 + }, + { + "epoch": 2.654331028873526, + "grad_norm": 3.845040166550634, + "learning_rate": 1.8451012099489105e-05, + "loss": 0.0781, + "step": 6527 + }, + { + "epoch": 2.6547376982513216, + "grad_norm": 11.283960869160122, + "learning_rate": 1.8450470399271582e-05, + "loss": 0.452, + "step": 6528 + }, + { + "epoch": 2.6551443676291173, + "grad_norm": 5.473196613284494, + "learning_rate": 1.844992861230573e-05, + "loss": 0.1097, + "step": 6529 + }, + { + "epoch": 2.6555510370069135, + "grad_norm": 0.7543193275245175, + "learning_rate": 1.8449386738597102e-05, + "loss": 0.0135, + "step": 6530 + }, + { + "epoch": 2.6559577063847093, + "grad_norm": 2.5577143450035176, + "learning_rate": 1.844884477815127e-05, + "loss": 0.0462, + "step": 6531 + }, + { + "epoch": 2.656364375762505, + "grad_norm": 13.329299814205447, + "learning_rate": 1.8448302730973794e-05, + "loss": 0.7028, + "step": 6532 + }, + { + "epoch": 2.656771045140301, + "grad_norm": 13.666320243132057, + "learning_rate": 
1.8447760597070242e-05, + "loss": 0.5802, + "step": 6533 + }, + { + "epoch": 2.657177714518097, + "grad_norm": 9.65621003278467, + "learning_rate": 1.8447218376446173e-05, + "loss": 0.4324, + "step": 6534 + }, + { + "epoch": 2.657584383895893, + "grad_norm": 6.469939747410957, + "learning_rate": 1.8446676069107153e-05, + "loss": 0.1389, + "step": 6535 + }, + { + "epoch": 2.6579910532736886, + "grad_norm": 5.421419860957053, + "learning_rate": 1.8446133675058753e-05, + "loss": 0.1738, + "step": 6536 + }, + { + "epoch": 2.6583977226514843, + "grad_norm": 5.356199055367684, + "learning_rate": 1.8445591194306543e-05, + "loss": 0.1759, + "step": 6537 + }, + { + "epoch": 2.65880439202928, + "grad_norm": 3.424424260146365, + "learning_rate": 1.844504862685609e-05, + "loss": 0.0637, + "step": 6538 + }, + { + "epoch": 2.659211061407076, + "grad_norm": 0.9023848206589721, + "learning_rate": 1.8444505972712958e-05, + "loss": 0.0143, + "step": 6539 + }, + { + "epoch": 2.6596177307848716, + "grad_norm": 12.36376031113939, + "learning_rate": 1.8443963231882724e-05, + "loss": 0.5064, + "step": 6540 + }, + { + "epoch": 2.660024400162668, + "grad_norm": 6.570966088199277, + "learning_rate": 1.844342040437096e-05, + "loss": 0.2009, + "step": 6541 + }, + { + "epoch": 2.6604310695404636, + "grad_norm": 5.449899347762352, + "learning_rate": 1.844287749018323e-05, + "loss": 0.1096, + "step": 6542 + }, + { + "epoch": 2.6608377389182594, + "grad_norm": 8.018828047061671, + "learning_rate": 1.844233448932512e-05, + "loss": 0.2026, + "step": 6543 + }, + { + "epoch": 2.6612444082960556, + "grad_norm": 1.1509205264447433, + "learning_rate": 1.8441791401802197e-05, + "loss": 0.0128, + "step": 6544 + }, + { + "epoch": 2.6616510776738513, + "grad_norm": 7.700840989039619, + "learning_rate": 1.8441248227620033e-05, + "loss": 0.1811, + "step": 6545 + }, + { + "epoch": 2.662057747051647, + "grad_norm": 4.600937810584357, + "learning_rate": 1.8440704966784208e-05, + "loss": 0.1819, + "step": 6546 + }, + { + "epoch": 2.662464416429443, + "grad_norm": 1.6143953652892422, + "learning_rate": 1.84401616193003e-05, + "loss": 0.0871, + "step": 6547 + }, + { + "epoch": 2.6628710858072386, + "grad_norm": 8.143856131864974, + "learning_rate": 1.8439618185173886e-05, + "loss": 0.1868, + "step": 6548 + }, + { + "epoch": 2.6632777551850344, + "grad_norm": 9.149128267423693, + "learning_rate": 1.843907466441054e-05, + "loss": 0.1613, + "step": 6549 + }, + { + "epoch": 2.66368442456283, + "grad_norm": 4.8423617358747775, + "learning_rate": 1.8438531057015845e-05, + "loss": 0.1064, + "step": 6550 + }, + { + "epoch": 2.6640910939406264, + "grad_norm": 11.04001130417342, + "learning_rate": 1.8437987362995387e-05, + "loss": 0.3848, + "step": 6551 + }, + { + "epoch": 2.664497763318422, + "grad_norm": 13.532601149740549, + "learning_rate": 1.843744358235474e-05, + "loss": 0.8253, + "step": 6552 + }, + { + "epoch": 2.664904432696218, + "grad_norm": 10.42186587110231, + "learning_rate": 1.8436899715099485e-05, + "loss": 0.5478, + "step": 6553 + }, + { + "epoch": 2.6653111020740137, + "grad_norm": 6.08033754460851, + "learning_rate": 1.843635576123521e-05, + "loss": 0.4807, + "step": 6554 + }, + { + "epoch": 2.66571777145181, + "grad_norm": 6.199406033377899, + "learning_rate": 1.8435811720767498e-05, + "loss": 0.2662, + "step": 6555 + }, + { + "epoch": 2.6661244408296056, + "grad_norm": 1.3073156267984876, + "learning_rate": 1.8435267593701933e-05, + "loss": 0.0191, + "step": 6556 + }, + { + "epoch": 2.6665311102074014, + "grad_norm": 
1.4612373103724487, + "learning_rate": 1.84347233800441e-05, + "loss": 0.0204, + "step": 6557 + }, + { + "epoch": 2.666937779585197, + "grad_norm": 0.29850328813886406, + "learning_rate": 1.8434179079799587e-05, + "loss": 0.0043, + "step": 6558 + }, + { + "epoch": 2.667344448962993, + "grad_norm": 3.3628071569190103, + "learning_rate": 1.843363469297398e-05, + "loss": 0.1247, + "step": 6559 + }, + { + "epoch": 2.6677511183407887, + "grad_norm": 5.939195124935014, + "learning_rate": 1.8433090219572872e-05, + "loss": 0.1384, + "step": 6560 + }, + { + "epoch": 2.668157787718585, + "grad_norm": 16.08982096859374, + "learning_rate": 1.8432545659601846e-05, + "loss": 0.9754, + "step": 6561 + }, + { + "epoch": 2.6685644570963807, + "grad_norm": 4.199146247625298, + "learning_rate": 1.8432001013066495e-05, + "loss": 0.0669, + "step": 6562 + }, + { + "epoch": 2.6689711264741764, + "grad_norm": 5.07730164235292, + "learning_rate": 1.843145627997241e-05, + "loss": 0.1135, + "step": 6563 + }, + { + "epoch": 2.669377795851972, + "grad_norm": 9.137735470335693, + "learning_rate": 1.843091146032518e-05, + "loss": 0.3752, + "step": 6564 + }, + { + "epoch": 2.6697844652297684, + "grad_norm": 10.357975678993517, + "learning_rate": 1.8430366554130403e-05, + "loss": 0.4225, + "step": 6565 + }, + { + "epoch": 2.670191134607564, + "grad_norm": 21.115700671520514, + "learning_rate": 1.842982156139367e-05, + "loss": 1.1326, + "step": 6566 + }, + { + "epoch": 2.67059780398536, + "grad_norm": 5.30944486974069, + "learning_rate": 1.8429276482120573e-05, + "loss": 0.4423, + "step": 6567 + }, + { + "epoch": 2.6710044733631557, + "grad_norm": 15.305754010037148, + "learning_rate": 1.8428731316316716e-05, + "loss": 0.3889, + "step": 6568 + }, + { + "epoch": 2.6714111427409515, + "grad_norm": 10.79581105561027, + "learning_rate": 1.842818606398769e-05, + "loss": 0.4147, + "step": 6569 + }, + { + "epoch": 2.6718178121187472, + "grad_norm": 3.1223491204275504, + "learning_rate": 1.8427640725139088e-05, + "loss": 0.1076, + "step": 6570 + }, + { + "epoch": 2.6722244814965435, + "grad_norm": 1.0827212379929, + "learning_rate": 1.8427095299776515e-05, + "loss": 0.0158, + "step": 6571 + }, + { + "epoch": 2.6726311508743392, + "grad_norm": 6.674630916568734, + "learning_rate": 1.8426549787905566e-05, + "loss": 0.3526, + "step": 6572 + }, + { + "epoch": 2.673037820252135, + "grad_norm": 12.575299032615556, + "learning_rate": 1.8426004189531842e-05, + "loss": 0.6902, + "step": 6573 + }, + { + "epoch": 2.6734444896299308, + "grad_norm": 13.640054519781856, + "learning_rate": 1.842545850466095e-05, + "loss": 0.3825, + "step": 6574 + }, + { + "epoch": 2.673851159007727, + "grad_norm": 5.555637678733132, + "learning_rate": 1.8424912733298482e-05, + "loss": 0.2309, + "step": 6575 + }, + { + "epoch": 2.6742578283855227, + "grad_norm": 9.964731911618951, + "learning_rate": 1.842436687545004e-05, + "loss": 0.4515, + "step": 6576 + }, + { + "epoch": 2.6746644977633185, + "grad_norm": 8.591998198297414, + "learning_rate": 1.8423820931121238e-05, + "loss": 0.1393, + "step": 6577 + }, + { + "epoch": 2.6750711671411143, + "grad_norm": 6.983317377646626, + "learning_rate": 1.8423274900317675e-05, + "loss": 0.2188, + "step": 6578 + }, + { + "epoch": 2.67547783651891, + "grad_norm": 10.001790290908211, + "learning_rate": 1.8422728783044953e-05, + "loss": 0.3649, + "step": 6579 + }, + { + "epoch": 2.675884505896706, + "grad_norm": 5.056974197918845, + "learning_rate": 1.842218257930868e-05, + "loss": 0.1087, + "step": 6580 + }, + { + "epoch": 
2.6762911752745016, + "grad_norm": 4.182185963226429, + "learning_rate": 1.8421636289114468e-05, + "loss": 0.0649, + "step": 6581 + }, + { + "epoch": 2.6766978446522978, + "grad_norm": 10.267358499002675, + "learning_rate": 1.8421089912467915e-05, + "loss": 0.5882, + "step": 6582 + }, + { + "epoch": 2.6771045140300935, + "grad_norm": 9.740604288480672, + "learning_rate": 1.8420543449374643e-05, + "loss": 0.3389, + "step": 6583 + }, + { + "epoch": 2.6775111834078893, + "grad_norm": 4.7505175664061605, + "learning_rate": 1.841999689984025e-05, + "loss": 0.0786, + "step": 6584 + }, + { + "epoch": 2.6779178527856855, + "grad_norm": 6.299413313823259, + "learning_rate": 1.8419450263870352e-05, + "loss": 0.3754, + "step": 6585 + }, + { + "epoch": 2.6783245221634813, + "grad_norm": 53.06200358794108, + "learning_rate": 1.841890354147056e-05, + "loss": 1.3475, + "step": 6586 + }, + { + "epoch": 2.678731191541277, + "grad_norm": 3.2293641862506584, + "learning_rate": 1.8418356732646487e-05, + "loss": 0.0672, + "step": 6587 + }, + { + "epoch": 2.679137860919073, + "grad_norm": 2.3725068956918194, + "learning_rate": 1.841780983740374e-05, + "loss": 0.0373, + "step": 6588 + }, + { + "epoch": 2.6795445302968686, + "grad_norm": 4.553094467322203, + "learning_rate": 1.8417262855747947e-05, + "loss": 0.1032, + "step": 6589 + }, + { + "epoch": 2.6799511996746643, + "grad_norm": 6.574781172440938, + "learning_rate": 1.8416715787684707e-05, + "loss": 0.2457, + "step": 6590 + }, + { + "epoch": 2.68035786905246, + "grad_norm": 9.4197369903788, + "learning_rate": 1.8416168633219647e-05, + "loss": 0.3109, + "step": 6591 + }, + { + "epoch": 2.6807645384302563, + "grad_norm": 3.3090573097684355, + "learning_rate": 1.8415621392358378e-05, + "loss": 0.1157, + "step": 6592 + }, + { + "epoch": 2.681171207808052, + "grad_norm": 8.446752025757986, + "learning_rate": 1.841507406510652e-05, + "loss": 0.691, + "step": 6593 + }, + { + "epoch": 2.681577877185848, + "grad_norm": 5.418815902987198, + "learning_rate": 1.8414526651469695e-05, + "loss": 0.1803, + "step": 6594 + }, + { + "epoch": 2.6819845465636436, + "grad_norm": 12.729907490196746, + "learning_rate": 1.841397915145352e-05, + "loss": 0.3072, + "step": 6595 + }, + { + "epoch": 2.68239121594144, + "grad_norm": 8.731678319540224, + "learning_rate": 1.8413431565063608e-05, + "loss": 0.6253, + "step": 6596 + }, + { + "epoch": 2.6827978853192356, + "grad_norm": 19.425141769203137, + "learning_rate": 1.8412883892305592e-05, + "loss": 0.717, + "step": 6597 + }, + { + "epoch": 2.6832045546970313, + "grad_norm": 8.104643923029323, + "learning_rate": 1.8412336133185084e-05, + "loss": 0.2433, + "step": 6598 + }, + { + "epoch": 2.683611224074827, + "grad_norm": 3.7389319933078538, + "learning_rate": 1.8411788287707716e-05, + "loss": 0.0595, + "step": 6599 + }, + { + "epoch": 2.684017893452623, + "grad_norm": 9.985420918771748, + "learning_rate": 1.8411240355879104e-05, + "loss": 0.3906, + "step": 6600 + }, + { + "epoch": 2.6844245628304186, + "grad_norm": 2.3197461387135947, + "learning_rate": 1.841069233770488e-05, + "loss": 0.0444, + "step": 6601 + }, + { + "epoch": 2.684831232208215, + "grad_norm": 7.69612102012131, + "learning_rate": 1.8410144233190666e-05, + "loss": 0.2532, + "step": 6602 + }, + { + "epoch": 2.6852379015860106, + "grad_norm": 9.588411115784679, + "learning_rate": 1.8409596042342082e-05, + "loss": 0.2866, + "step": 6603 + }, + { + "epoch": 2.6856445709638064, + "grad_norm": 15.635797637002238, + "learning_rate": 1.840904776516477e-05, + "loss": 0.7949, + 
"step": 6604 + }, + { + "epoch": 2.686051240341602, + "grad_norm": 17.026708928434523, + "learning_rate": 1.840849940166435e-05, + "loss": 0.7096, + "step": 6605 + }, + { + "epoch": 2.6864579097193984, + "grad_norm": 2.7951767813515795, + "learning_rate": 1.8407950951846448e-05, + "loss": 0.0595, + "step": 6606 + }, + { + "epoch": 2.686864579097194, + "grad_norm": 1.749129176754107, + "learning_rate": 1.84074024157167e-05, + "loss": 0.0274, + "step": 6607 + }, + { + "epoch": 2.68727124847499, + "grad_norm": 4.669402330959297, + "learning_rate": 1.8406853793280734e-05, + "loss": 0.1849, + "step": 6608 + }, + { + "epoch": 2.6876779178527856, + "grad_norm": 12.317236536530984, + "learning_rate": 1.8406305084544182e-05, + "loss": 0.8762, + "step": 6609 + }, + { + "epoch": 2.6880845872305814, + "grad_norm": 10.078689834761306, + "learning_rate": 1.840575628951268e-05, + "loss": 0.3822, + "step": 6610 + }, + { + "epoch": 2.688491256608377, + "grad_norm": 9.916407149651137, + "learning_rate": 1.840520740819186e-05, + "loss": 0.3087, + "step": 6611 + }, + { + "epoch": 2.6888979259861734, + "grad_norm": 1.9600066345892173, + "learning_rate": 1.840465844058735e-05, + "loss": 0.0768, + "step": 6612 + }, + { + "epoch": 2.689304595363969, + "grad_norm": 3.3894117651059377, + "learning_rate": 1.8404109386704795e-05, + "loss": 0.041, + "step": 6613 + }, + { + "epoch": 2.689711264741765, + "grad_norm": 0.2696726757354885, + "learning_rate": 1.840356024654983e-05, + "loss": 0.0047, + "step": 6614 + }, + { + "epoch": 2.6901179341195607, + "grad_norm": 4.659245919220792, + "learning_rate": 1.8403011020128084e-05, + "loss": 0.1408, + "step": 6615 + }, + { + "epoch": 2.690524603497357, + "grad_norm": 0.417719488829225, + "learning_rate": 1.8402461707445206e-05, + "loss": 0.0054, + "step": 6616 + }, + { + "epoch": 2.6909312728751527, + "grad_norm": 10.156507257816337, + "learning_rate": 1.8401912308506825e-05, + "loss": 0.3125, + "step": 6617 + }, + { + "epoch": 2.6913379422529484, + "grad_norm": 14.819250896813168, + "learning_rate": 1.840136282331859e-05, + "loss": 0.795, + "step": 6618 + }, + { + "epoch": 2.691744611630744, + "grad_norm": 14.640905013582191, + "learning_rate": 1.8400813251886132e-05, + "loss": 0.7076, + "step": 6619 + }, + { + "epoch": 2.69215128100854, + "grad_norm": 9.497303693791794, + "learning_rate": 1.8400263594215103e-05, + "loss": 0.3453, + "step": 6620 + }, + { + "epoch": 2.6925579503863357, + "grad_norm": 0.505007696138134, + "learning_rate": 1.8399713850311136e-05, + "loss": 0.0083, + "step": 6621 + }, + { + "epoch": 2.6929646197641315, + "grad_norm": 11.747047504090855, + "learning_rate": 1.8399164020179882e-05, + "loss": 0.5526, + "step": 6622 + }, + { + "epoch": 2.6933712891419277, + "grad_norm": 7.457127968056393, + "learning_rate": 1.8398614103826983e-05, + "loss": 0.3856, + "step": 6623 + }, + { + "epoch": 2.6937779585197235, + "grad_norm": 8.5475034119321, + "learning_rate": 1.839806410125808e-05, + "loss": 0.528, + "step": 6624 + }, + { + "epoch": 2.6941846278975192, + "grad_norm": 0.08560036011278913, + "learning_rate": 1.8397514012478824e-05, + "loss": 0.0009, + "step": 6625 + }, + { + "epoch": 2.6945912972753154, + "grad_norm": 4.746742411254449, + "learning_rate": 1.839696383749486e-05, + "loss": 0.0861, + "step": 6626 + }, + { + "epoch": 2.694997966653111, + "grad_norm": 15.03207559670229, + "learning_rate": 1.8396413576311837e-05, + "loss": 0.9067, + "step": 6627 + }, + { + "epoch": 2.695404636030907, + "grad_norm": 18.29307562963627, + "learning_rate": 
1.83958632289354e-05, + "loss": 0.8809, + "step": 6628 + }, + { + "epoch": 2.6958113054087027, + "grad_norm": 6.558975589243208, + "learning_rate": 1.8395312795371204e-05, + "loss": 0.3098, + "step": 6629 + }, + { + "epoch": 2.6962179747864985, + "grad_norm": 21.260051078089077, + "learning_rate": 1.83947622756249e-05, + "loss": 1.6742, + "step": 6630 + }, + { + "epoch": 2.6966246441642943, + "grad_norm": 10.712446504117866, + "learning_rate": 1.8394211669702132e-05, + "loss": 0.3391, + "step": 6631 + }, + { + "epoch": 2.69703131354209, + "grad_norm": 8.117267390024153, + "learning_rate": 1.8393660977608555e-05, + "loss": 0.4434, + "step": 6632 + }, + { + "epoch": 2.6974379829198862, + "grad_norm": 4.649549017876342, + "learning_rate": 1.8393110199349827e-05, + "loss": 0.0969, + "step": 6633 + }, + { + "epoch": 2.697844652297682, + "grad_norm": 8.223872133564736, + "learning_rate": 1.8392559334931598e-05, + "loss": 0.197, + "step": 6634 + }, + { + "epoch": 2.6982513216754778, + "grad_norm": 12.401311842724834, + "learning_rate": 1.8392008384359524e-05, + "loss": 0.4593, + "step": 6635 + }, + { + "epoch": 2.6986579910532735, + "grad_norm": 4.0010423333644525, + "learning_rate": 1.8391457347639256e-05, + "loss": 0.0856, + "step": 6636 + }, + { + "epoch": 2.6990646604310697, + "grad_norm": 7.267678642666175, + "learning_rate": 1.8390906224776462e-05, + "loss": 0.3259, + "step": 6637 + }, + { + "epoch": 2.6994713298088655, + "grad_norm": 6.474924410877091, + "learning_rate": 1.8390355015776788e-05, + "loss": 0.1571, + "step": 6638 + }, + { + "epoch": 2.6998779991866613, + "grad_norm": 3.434586819074415, + "learning_rate": 1.8389803720645897e-05, + "loss": 0.0916, + "step": 6639 + }, + { + "epoch": 2.700284668564457, + "grad_norm": 4.390041121696929, + "learning_rate": 1.8389252339389448e-05, + "loss": 0.0464, + "step": 6640 + }, + { + "epoch": 2.700691337942253, + "grad_norm": 4.799478370967771, + "learning_rate": 1.8388700872013103e-05, + "loss": 0.1068, + "step": 6641 + }, + { + "epoch": 2.7010980073200486, + "grad_norm": 0.31517725857222595, + "learning_rate": 1.838814931852252e-05, + "loss": 0.0055, + "step": 6642 + }, + { + "epoch": 2.7015046766978448, + "grad_norm": 1.5385644378066718, + "learning_rate": 1.8387597678923367e-05, + "loss": 0.013, + "step": 6643 + }, + { + "epoch": 2.7019113460756405, + "grad_norm": 5.13976506224688, + "learning_rate": 1.83870459532213e-05, + "loss": 0.1349, + "step": 6644 + }, + { + "epoch": 2.7023180154534363, + "grad_norm": 7.169398474638136, + "learning_rate": 1.838649414142198e-05, + "loss": 0.2015, + "step": 6645 + }, + { + "epoch": 2.702724684831232, + "grad_norm": 5.897646518897319, + "learning_rate": 1.8385942243531084e-05, + "loss": 0.1103, + "step": 6646 + }, + { + "epoch": 2.7031313542090283, + "grad_norm": 5.077362738910202, + "learning_rate": 1.8385390259554268e-05, + "loss": 0.1081, + "step": 6647 + }, + { + "epoch": 2.703538023586824, + "grad_norm": 11.45691963891565, + "learning_rate": 1.83848381894972e-05, + "loss": 0.8218, + "step": 6648 + }, + { + "epoch": 2.70394469296462, + "grad_norm": 8.972242696437911, + "learning_rate": 1.8384286033365547e-05, + "loss": 0.1328, + "step": 6649 + }, + { + "epoch": 2.7043513623424156, + "grad_norm": 6.526761014712514, + "learning_rate": 1.838373379116498e-05, + "loss": 0.104, + "step": 6650 + }, + { + "epoch": 2.7047580317202113, + "grad_norm": 5.866131597436124, + "learning_rate": 1.8383181462901166e-05, + "loss": 0.3875, + "step": 6651 + }, + { + "epoch": 2.705164701098007, + "grad_norm": 
7.67685981263453, + "learning_rate": 1.8382629048579774e-05, + "loss": 0.3217, + "step": 6652 + }, + { + "epoch": 2.7055713704758033, + "grad_norm": 11.136381279235355, + "learning_rate": 1.838207654820648e-05, + "loss": 0.8425, + "step": 6653 + }, + { + "epoch": 2.705978039853599, + "grad_norm": 0.5353888963451802, + "learning_rate": 1.8381523961786945e-05, + "loss": 0.0053, + "step": 6654 + }, + { + "epoch": 2.706384709231395, + "grad_norm": 11.704341100637457, + "learning_rate": 1.838097128932685e-05, + "loss": 0.5938, + "step": 6655 + }, + { + "epoch": 2.7067913786091906, + "grad_norm": 2.891693884147086, + "learning_rate": 1.838041853083187e-05, + "loss": 0.0543, + "step": 6656 + }, + { + "epoch": 2.707198047986987, + "grad_norm": 9.333265236202957, + "learning_rate": 1.8379865686307674e-05, + "loss": 0.4479, + "step": 6657 + }, + { + "epoch": 2.7076047173647826, + "grad_norm": 11.195466880248015, + "learning_rate": 1.8379312755759938e-05, + "loss": 0.3938, + "step": 6658 + }, + { + "epoch": 2.7080113867425784, + "grad_norm": 1.5335908536563623, + "learning_rate": 1.8378759739194342e-05, + "loss": 0.0197, + "step": 6659 + }, + { + "epoch": 2.708418056120374, + "grad_norm": 6.342903370587247, + "learning_rate": 1.837820663661656e-05, + "loss": 0.4203, + "step": 6660 + }, + { + "epoch": 2.70882472549817, + "grad_norm": 5.146119662948655, + "learning_rate": 1.8377653448032267e-05, + "loss": 0.2613, + "step": 6661 + }, + { + "epoch": 2.7092313948759656, + "grad_norm": 8.583047178086796, + "learning_rate": 1.8377100173447147e-05, + "loss": 0.3351, + "step": 6662 + }, + { + "epoch": 2.709638064253762, + "grad_norm": 14.697428770775673, + "learning_rate": 1.837654681286688e-05, + "loss": 0.6238, + "step": 6663 + }, + { + "epoch": 2.7100447336315576, + "grad_norm": 1.1462112530750552, + "learning_rate": 1.837599336629714e-05, + "loss": 0.0144, + "step": 6664 + }, + { + "epoch": 2.7104514030093534, + "grad_norm": 7.100653372732134, + "learning_rate": 1.8375439833743617e-05, + "loss": 0.2575, + "step": 6665 + }, + { + "epoch": 2.710858072387149, + "grad_norm": 15.386398790392805, + "learning_rate": 1.8374886215211984e-05, + "loss": 1.1027, + "step": 6666 + }, + { + "epoch": 2.7112647417649454, + "grad_norm": 10.076266152308156, + "learning_rate": 1.837433251070793e-05, + "loss": 0.3773, + "step": 6667 + }, + { + "epoch": 2.711671411142741, + "grad_norm": 11.383294552564195, + "learning_rate": 1.8373778720237143e-05, + "loss": 0.3364, + "step": 6668 + }, + { + "epoch": 2.712078080520537, + "grad_norm": 2.658934911117586, + "learning_rate": 1.83732248438053e-05, + "loss": 0.1501, + "step": 6669 + }, + { + "epoch": 2.7124847498983327, + "grad_norm": 4.903029571946568, + "learning_rate": 1.837267088141809e-05, + "loss": 0.0895, + "step": 6670 + }, + { + "epoch": 2.7128914192761284, + "grad_norm": 5.366704773763358, + "learning_rate": 1.8372116833081193e-05, + "loss": 0.2228, + "step": 6671 + }, + { + "epoch": 2.713298088653924, + "grad_norm": 3.564611915658751, + "learning_rate": 1.8371562698800313e-05, + "loss": 0.0805, + "step": 6672 + }, + { + "epoch": 2.71370475803172, + "grad_norm": 7.031451016903749, + "learning_rate": 1.8371008478581123e-05, + "loss": 0.2123, + "step": 6673 + }, + { + "epoch": 2.714111427409516, + "grad_norm": 5.344370505926532, + "learning_rate": 1.837045417242932e-05, + "loss": 0.3334, + "step": 6674 + }, + { + "epoch": 2.714518096787312, + "grad_norm": 4.687776079081969, + "learning_rate": 1.836989978035059e-05, + "loss": 0.1456, + "step": 6675 + }, + { + "epoch": 
2.7149247661651077, + "grad_norm": 3.2626844186313386, + "learning_rate": 1.836934530235063e-05, + "loss": 0.0889, + "step": 6676 + }, + { + "epoch": 2.7153314355429035, + "grad_norm": 0.8947715332610343, + "learning_rate": 1.8368790738435126e-05, + "loss": 0.0171, + "step": 6677 + }, + { + "epoch": 2.7157381049206997, + "grad_norm": 7.316030659447687, + "learning_rate": 1.8368236088609773e-05, + "loss": 0.372, + "step": 6678 + }, + { + "epoch": 2.7161447742984954, + "grad_norm": 18.027918302032493, + "learning_rate": 1.8367681352880263e-05, + "loss": 0.7901, + "step": 6679 + }, + { + "epoch": 2.716551443676291, + "grad_norm": 15.797082904820984, + "learning_rate": 1.8367126531252295e-05, + "loss": 0.7462, + "step": 6680 + }, + { + "epoch": 2.716958113054087, + "grad_norm": 13.50826307298729, + "learning_rate": 1.836657162373156e-05, + "loss": 0.9383, + "step": 6681 + }, + { + "epoch": 2.7173647824318827, + "grad_norm": 11.666953832918203, + "learning_rate": 1.8366016630323763e-05, + "loss": 0.2146, + "step": 6682 + }, + { + "epoch": 2.7177714518096785, + "grad_norm": 12.800719712394379, + "learning_rate": 1.836546155103459e-05, + "loss": 0.8274, + "step": 6683 + }, + { + "epoch": 2.7181781211874747, + "grad_norm": 17.543645703427345, + "learning_rate": 1.836490638586974e-05, + "loss": 1.2205, + "step": 6684 + }, + { + "epoch": 2.7185847905652705, + "grad_norm": 6.9205916070132485, + "learning_rate": 1.8364351134834922e-05, + "loss": 0.1022, + "step": 6685 + }, + { + "epoch": 2.7189914599430662, + "grad_norm": 15.193143867840476, + "learning_rate": 1.836379579793583e-05, + "loss": 0.8044, + "step": 6686 + }, + { + "epoch": 2.719398129320862, + "grad_norm": 11.095814723103866, + "learning_rate": 1.836324037517816e-05, + "loss": 0.3871, + "step": 6687 + }, + { + "epoch": 2.719804798698658, + "grad_norm": 14.937070297762814, + "learning_rate": 1.836268486656762e-05, + "loss": 0.8058, + "step": 6688 + }, + { + "epoch": 2.720211468076454, + "grad_norm": 6.296347074062145, + "learning_rate": 1.8362129272109915e-05, + "loss": 0.1294, + "step": 6689 + }, + { + "epoch": 2.7206181374542497, + "grad_norm": 6.89873662810695, + "learning_rate": 1.836157359181074e-05, + "loss": 0.3519, + "step": 6690 + }, + { + "epoch": 2.7210248068320455, + "grad_norm": 1.8238314503157973, + "learning_rate": 1.8361017825675803e-05, + "loss": 0.026, + "step": 6691 + }, + { + "epoch": 2.7214314762098413, + "grad_norm": 13.431640589498963, + "learning_rate": 1.8360461973710815e-05, + "loss": 0.6424, + "step": 6692 + }, + { + "epoch": 2.721838145587637, + "grad_norm": 8.703119584375935, + "learning_rate": 1.8359906035921475e-05, + "loss": 0.2635, + "step": 6693 + }, + { + "epoch": 2.7222448149654332, + "grad_norm": 1.508956251913736, + "learning_rate": 1.835935001231349e-05, + "loss": 0.0351, + "step": 6694 + }, + { + "epoch": 2.722651484343229, + "grad_norm": 8.5012262229368, + "learning_rate": 1.835879390289257e-05, + "loss": 0.6542, + "step": 6695 + }, + { + "epoch": 2.7230581537210248, + "grad_norm": 8.619315444614251, + "learning_rate": 1.8358237707664424e-05, + "loss": 0.5911, + "step": 6696 + }, + { + "epoch": 2.7234648230988205, + "grad_norm": 1.1252251560815827, + "learning_rate": 1.8357681426634765e-05, + "loss": 0.0397, + "step": 6697 + }, + { + "epoch": 2.7238714924766168, + "grad_norm": 7.80459183037892, + "learning_rate": 1.8357125059809294e-05, + "loss": 0.3463, + "step": 6698 + }, + { + "epoch": 2.7242781618544125, + "grad_norm": 11.977730972736923, + "learning_rate": 1.8356568607193734e-05, + "loss": 
0.4314, + "step": 6699 + }, + { + "epoch": 2.7246848312322083, + "grad_norm": 14.212359150596038, + "learning_rate": 1.8356012068793788e-05, + "loss": 0.7668, + "step": 6700 + }, + { + "epoch": 2.725091500610004, + "grad_norm": 7.43146913188963, + "learning_rate": 1.8355455444615174e-05, + "loss": 0.2258, + "step": 6701 + }, + { + "epoch": 2.7254981699878, + "grad_norm": 18.127220843787526, + "learning_rate": 1.8354898734663602e-05, + "loss": 1.1925, + "step": 6702 + }, + { + "epoch": 2.7259048393655956, + "grad_norm": 5.714041550725084, + "learning_rate": 1.8354341938944794e-05, + "loss": 0.175, + "step": 6703 + }, + { + "epoch": 2.726311508743392, + "grad_norm": 11.17236142066589, + "learning_rate": 1.8353785057464456e-05, + "loss": 0.3546, + "step": 6704 + }, + { + "epoch": 2.7267181781211876, + "grad_norm": 9.78387139540289, + "learning_rate": 1.8353228090228317e-05, + "loss": 0.0726, + "step": 6705 + }, + { + "epoch": 2.7271248474989833, + "grad_norm": 4.5335955384930955, + "learning_rate": 1.8352671037242082e-05, + "loss": 0.0882, + "step": 6706 + }, + { + "epoch": 2.727531516876779, + "grad_norm": 5.706669003411959, + "learning_rate": 1.8352113898511475e-05, + "loss": 0.2105, + "step": 6707 + }, + { + "epoch": 2.7279381862545753, + "grad_norm": 3.102853637540906, + "learning_rate": 1.8351556674042218e-05, + "loss": 0.0694, + "step": 6708 + }, + { + "epoch": 2.728344855632371, + "grad_norm": 5.146243201401123, + "learning_rate": 1.8350999363840026e-05, + "loss": 0.0694, + "step": 6709 + }, + { + "epoch": 2.728751525010167, + "grad_norm": 9.799593168474418, + "learning_rate": 1.8350441967910626e-05, + "loss": 0.4236, + "step": 6710 + }, + { + "epoch": 2.7291581943879626, + "grad_norm": 1.819477412368037, + "learning_rate": 1.8349884486259736e-05, + "loss": 0.0264, + "step": 6711 + }, + { + "epoch": 2.7295648637657584, + "grad_norm": 3.115891474939162, + "learning_rate": 1.8349326918893076e-05, + "loss": 0.0658, + "step": 6712 + }, + { + "epoch": 2.729971533143554, + "grad_norm": 8.598311503709146, + "learning_rate": 1.8348769265816376e-05, + "loss": 0.2194, + "step": 6713 + }, + { + "epoch": 2.73037820252135, + "grad_norm": 9.27524242013627, + "learning_rate": 1.8348211527035357e-05, + "loss": 0.4498, + "step": 6714 + }, + { + "epoch": 2.730784871899146, + "grad_norm": 7.502951939464303, + "learning_rate": 1.8347653702555742e-05, + "loss": 0.2485, + "step": 6715 + }, + { + "epoch": 2.731191541276942, + "grad_norm": 2.3809213256555473, + "learning_rate": 1.8347095792383265e-05, + "loss": 0.0354, + "step": 6716 + }, + { + "epoch": 2.7315982106547376, + "grad_norm": 13.10732264501427, + "learning_rate": 1.8346537796523643e-05, + "loss": 0.7853, + "step": 6717 + }, + { + "epoch": 2.7320048800325334, + "grad_norm": 8.070078493128367, + "learning_rate": 1.8345979714982614e-05, + "loss": 0.435, + "step": 6718 + }, + { + "epoch": 2.7324115494103296, + "grad_norm": 0.04434041498697248, + "learning_rate": 1.8345421547765903e-05, + "loss": 0.0009, + "step": 6719 + }, + { + "epoch": 2.7328182187881254, + "grad_norm": 14.383460104422806, + "learning_rate": 1.8344863294879237e-05, + "loss": 0.6172, + "step": 6720 + }, + { + "epoch": 2.733224888165921, + "grad_norm": 8.410670809434828, + "learning_rate": 1.8344304956328357e-05, + "loss": 0.1977, + "step": 6721 + }, + { + "epoch": 2.733631557543717, + "grad_norm": 6.290602229713852, + "learning_rate": 1.834374653211898e-05, + "loss": 0.1299, + "step": 6722 + }, + { + "epoch": 2.7340382269215127, + "grad_norm": 4.057648297845761, + "learning_rate": 
1.8343188022256846e-05, + "loss": 0.0661, + "step": 6723 + }, + { + "epoch": 2.7344448962993084, + "grad_norm": 6.492812326798085, + "learning_rate": 1.834262942674769e-05, + "loss": 0.1902, + "step": 6724 + }, + { + "epoch": 2.7348515656771046, + "grad_norm": 0.6859377839169164, + "learning_rate": 1.834207074559724e-05, + "loss": 0.0098, + "step": 6725 + }, + { + "epoch": 2.7352582350549004, + "grad_norm": 8.84676174679921, + "learning_rate": 1.834151197881124e-05, + "loss": 0.2756, + "step": 6726 + }, + { + "epoch": 2.735664904432696, + "grad_norm": 11.142911253253045, + "learning_rate": 1.834095312639542e-05, + "loss": 0.7808, + "step": 6727 + }, + { + "epoch": 2.736071573810492, + "grad_norm": 24.635104659595093, + "learning_rate": 1.8340394188355518e-05, + "loss": 0.8601, + "step": 6728 + }, + { + "epoch": 2.736478243188288, + "grad_norm": 8.492427717605217, + "learning_rate": 1.8339835164697272e-05, + "loss": 0.447, + "step": 6729 + }, + { + "epoch": 2.736884912566084, + "grad_norm": 2.567809260128552, + "learning_rate": 1.833927605542642e-05, + "loss": 0.0415, + "step": 6730 + }, + { + "epoch": 2.7372915819438797, + "grad_norm": 9.082268506727333, + "learning_rate": 1.83387168605487e-05, + "loss": 0.6475, + "step": 6731 + }, + { + "epoch": 2.7376982513216754, + "grad_norm": 9.760043638717217, + "learning_rate": 1.833815758006986e-05, + "loss": 0.5545, + "step": 6732 + }, + { + "epoch": 2.738104920699471, + "grad_norm": 6.520522501854892, + "learning_rate": 1.8337598213995634e-05, + "loss": 0.2657, + "step": 6733 + }, + { + "epoch": 2.738511590077267, + "grad_norm": 8.229021560749826, + "learning_rate": 1.8337038762331765e-05, + "loss": 0.2003, + "step": 6734 + }, + { + "epoch": 2.738918259455063, + "grad_norm": 0.876794722833442, + "learning_rate": 1.8336479225083995e-05, + "loss": 0.0085, + "step": 6735 + }, + { + "epoch": 2.739324928832859, + "grad_norm": 10.612803580395308, + "learning_rate": 1.8335919602258078e-05, + "loss": 0.5757, + "step": 6736 + }, + { + "epoch": 2.7397315982106547, + "grad_norm": 5.163196141095662, + "learning_rate": 1.8335359893859743e-05, + "loss": 0.2384, + "step": 6737 + }, + { + "epoch": 2.7401382675884505, + "grad_norm": 4.307929254792448, + "learning_rate": 1.8334800099894745e-05, + "loss": 0.0601, + "step": 6738 + }, + { + "epoch": 2.7405449369662467, + "grad_norm": 11.95965688528543, + "learning_rate": 1.8334240220368832e-05, + "loss": 0.509, + "step": 6739 + }, + { + "epoch": 2.7409516063440424, + "grad_norm": 0.33332939857576294, + "learning_rate": 1.8333680255287747e-05, + "loss": 0.005, + "step": 6740 + }, + { + "epoch": 2.741358275721838, + "grad_norm": 9.220387986448918, + "learning_rate": 1.8333120204657238e-05, + "loss": 0.5952, + "step": 6741 + }, + { + "epoch": 2.741764945099634, + "grad_norm": 5.891056060266104, + "learning_rate": 1.833256006848306e-05, + "loss": 0.2765, + "step": 6742 + }, + { + "epoch": 2.7421716144774297, + "grad_norm": 18.54985268414165, + "learning_rate": 1.8331999846770957e-05, + "loss": 0.8634, + "step": 6743 + }, + { + "epoch": 2.7425782838552255, + "grad_norm": 8.122106156448396, + "learning_rate": 1.8331439539526684e-05, + "loss": 0.4142, + "step": 6744 + }, + { + "epoch": 2.7429849532330217, + "grad_norm": 9.777595953978622, + "learning_rate": 1.833087914675599e-05, + "loss": 0.4944, + "step": 6745 + }, + { + "epoch": 2.7433916226108175, + "grad_norm": 17.555992930376444, + "learning_rate": 1.8330318668464625e-05, + "loss": 1.1314, + "step": 6746 + }, + { + "epoch": 2.7437982919886132, + "grad_norm": 
0.6526592458776757, + "learning_rate": 1.8329758104658348e-05, + "loss": 0.0117, + "step": 6747 + }, + { + "epoch": 2.744204961366409, + "grad_norm": 1.980303502101511, + "learning_rate": 1.8329197455342916e-05, + "loss": 0.0335, + "step": 6748 + }, + { + "epoch": 2.744611630744205, + "grad_norm": 1.985806157929343, + "learning_rate": 1.8328636720524075e-05, + "loss": 0.0301, + "step": 6749 + }, + { + "epoch": 2.745018300122001, + "grad_norm": 3.699464587278121, + "learning_rate": 1.832807590020759e-05, + "loss": 0.0924, + "step": 6750 + }, + { + "epoch": 2.7454249694997968, + "grad_norm": 12.074611806750479, + "learning_rate": 1.8327514994399213e-05, + "loss": 0.4674, + "step": 6751 + }, + { + "epoch": 2.7458316388775925, + "grad_norm": 9.175815996524051, + "learning_rate": 1.8326954003104704e-05, + "loss": 0.2705, + "step": 6752 + }, + { + "epoch": 2.7462383082553883, + "grad_norm": 9.403725525423614, + "learning_rate": 1.832639292632982e-05, + "loss": 0.4684, + "step": 6753 + }, + { + "epoch": 2.746644977633184, + "grad_norm": 11.068996044984889, + "learning_rate": 1.8325831764080325e-05, + "loss": 0.2681, + "step": 6754 + }, + { + "epoch": 2.74705164701098, + "grad_norm": 6.16083105138564, + "learning_rate": 1.832527051636197e-05, + "loss": 0.1442, + "step": 6755 + }, + { + "epoch": 2.747458316388776, + "grad_norm": 11.900304860542898, + "learning_rate": 1.832470918318053e-05, + "loss": 0.495, + "step": 6756 + }, + { + "epoch": 2.747864985766572, + "grad_norm": 3.939084407972601, + "learning_rate": 1.832414776454176e-05, + "loss": 0.1222, + "step": 6757 + }, + { + "epoch": 2.7482716551443676, + "grad_norm": 9.280059783781399, + "learning_rate": 1.8323586260451424e-05, + "loss": 0.1824, + "step": 6758 + }, + { + "epoch": 2.7486783245221633, + "grad_norm": 4.442760424043157, + "learning_rate": 1.832302467091528e-05, + "loss": 0.2149, + "step": 6759 + }, + { + "epoch": 2.7490849938999595, + "grad_norm": 7.552536533810942, + "learning_rate": 1.8322462995939102e-05, + "loss": 0.2448, + "step": 6760 + }, + { + "epoch": 2.7494916632777553, + "grad_norm": 6.75682497099455, + "learning_rate": 1.8321901235528656e-05, + "loss": 0.203, + "step": 6761 + }, + { + "epoch": 2.749898332655551, + "grad_norm": 10.727343698898707, + "learning_rate": 1.83213393896897e-05, + "loss": 0.4865, + "step": 6762 + }, + { + "epoch": 2.750305002033347, + "grad_norm": 6.107753491372794, + "learning_rate": 1.832077745842801e-05, + "loss": 0.1049, + "step": 6763 + }, + { + "epoch": 2.7507116714111426, + "grad_norm": 2.6397985096422008, + "learning_rate": 1.832021544174935e-05, + "loss": 0.0423, + "step": 6764 + }, + { + "epoch": 2.7511183407889384, + "grad_norm": 13.262648381472566, + "learning_rate": 1.8319653339659495e-05, + "loss": 0.5934, + "step": 6765 + }, + { + "epoch": 2.7515250101667346, + "grad_norm": 5.608801146001685, + "learning_rate": 1.831909115216421e-05, + "loss": 0.1246, + "step": 6766 + }, + { + "epoch": 2.7519316795445303, + "grad_norm": 0.24317358250432528, + "learning_rate": 1.8318528879269264e-05, + "loss": 0.0035, + "step": 6767 + }, + { + "epoch": 2.752338348922326, + "grad_norm": 3.487012484134183, + "learning_rate": 1.831796652098044e-05, + "loss": 0.0728, + "step": 6768 + }, + { + "epoch": 2.752745018300122, + "grad_norm": 9.362711690577278, + "learning_rate": 1.8317404077303496e-05, + "loss": 0.3151, + "step": 6769 + }, + { + "epoch": 2.753151687677918, + "grad_norm": 4.461111731164923, + "learning_rate": 1.8316841548244212e-05, + "loss": 0.1662, + "step": 6770 + }, + { + "epoch": 
2.753558357055714, + "grad_norm": 13.63829397764834, + "learning_rate": 1.831627893380837e-05, + "loss": 0.652, + "step": 6771 + }, + { + "epoch": 2.7539650264335096, + "grad_norm": 14.287299993603183, + "learning_rate": 1.8315716234001734e-05, + "loss": 0.5204, + "step": 6772 + }, + { + "epoch": 2.7543716958113054, + "grad_norm": 4.836427549374376, + "learning_rate": 1.831515344883009e-05, + "loss": 0.0633, + "step": 6773 + }, + { + "epoch": 2.754778365189101, + "grad_norm": 20.226519965951844, + "learning_rate": 1.8314590578299207e-05, + "loss": 1.2763, + "step": 6774 + }, + { + "epoch": 2.755185034566897, + "grad_norm": 9.520664140730805, + "learning_rate": 1.8314027622414868e-05, + "loss": 0.5065, + "step": 6775 + }, + { + "epoch": 2.755591703944693, + "grad_norm": 4.9966785276577355, + "learning_rate": 1.8313464581182853e-05, + "loss": 0.109, + "step": 6776 + }, + { + "epoch": 2.755998373322489, + "grad_norm": 7.39779870437714, + "learning_rate": 1.8312901454608937e-05, + "loss": 0.1878, + "step": 6777 + }, + { + "epoch": 2.7564050427002846, + "grad_norm": 12.683426280365518, + "learning_rate": 1.8312338242698905e-05, + "loss": 0.6112, + "step": 6778 + }, + { + "epoch": 2.7568117120780804, + "grad_norm": 1.109730834658562, + "learning_rate": 1.831177494545854e-05, + "loss": 0.0194, + "step": 6779 + }, + { + "epoch": 2.7572183814558766, + "grad_norm": 10.95773232744958, + "learning_rate": 1.831121156289362e-05, + "loss": 0.2313, + "step": 6780 + }, + { + "epoch": 2.7576250508336724, + "grad_norm": 2.4347754181766845, + "learning_rate": 1.8310648095009932e-05, + "loss": 0.0431, + "step": 6781 + }, + { + "epoch": 2.758031720211468, + "grad_norm": 3.4176487212433573, + "learning_rate": 1.8310084541813257e-05, + "loss": 0.1061, + "step": 6782 + }, + { + "epoch": 2.758438389589264, + "grad_norm": 3.5911318157589673, + "learning_rate": 1.830952090330938e-05, + "loss": 0.0814, + "step": 6783 + }, + { + "epoch": 2.7588450589670597, + "grad_norm": 7.392462360881622, + "learning_rate": 1.830895717950409e-05, + "loss": 0.1725, + "step": 6784 + }, + { + "epoch": 2.7592517283448554, + "grad_norm": 4.851698222622828, + "learning_rate": 1.830839337040317e-05, + "loss": 0.2051, + "step": 6785 + }, + { + "epoch": 2.7596583977226516, + "grad_norm": 2.689717780916894, + "learning_rate": 1.830782947601242e-05, + "loss": 0.0375, + "step": 6786 + }, + { + "epoch": 2.7600650671004474, + "grad_norm": 7.683379074865459, + "learning_rate": 1.830726549633761e-05, + "loss": 0.3494, + "step": 6787 + }, + { + "epoch": 2.760471736478243, + "grad_norm": 0.3152417571772049, + "learning_rate": 1.830670143138454e-05, + "loss": 0.0043, + "step": 6788 + }, + { + "epoch": 2.760878405856039, + "grad_norm": 12.915130360736738, + "learning_rate": 1.8306137281159002e-05, + "loss": 0.5929, + "step": 6789 + }, + { + "epoch": 2.761285075233835, + "grad_norm": 34.27431226502686, + "learning_rate": 1.8305573045666784e-05, + "loss": 0.5364, + "step": 6790 + }, + { + "epoch": 2.761691744611631, + "grad_norm": 9.588581470639756, + "learning_rate": 1.8305008724913677e-05, + "loss": 0.595, + "step": 6791 + }, + { + "epoch": 2.7620984139894267, + "grad_norm": 5.261164968994514, + "learning_rate": 1.830444431890548e-05, + "loss": 0.3629, + "step": 6792 + }, + { + "epoch": 2.7625050833672224, + "grad_norm": 6.018048207060124, + "learning_rate": 1.8303879827647977e-05, + "loss": 0.2102, + "step": 6793 + }, + { + "epoch": 2.762911752745018, + "grad_norm": 6.171677193044641, + "learning_rate": 1.830331525114697e-05, + "loss": 0.1108, + 
"step": 6794 + }, + { + "epoch": 2.763318422122814, + "grad_norm": 12.09539036097732, + "learning_rate": 1.8302750589408253e-05, + "loss": 0.2995, + "step": 6795 + }, + { + "epoch": 2.7637250915006097, + "grad_norm": 8.254162554617842, + "learning_rate": 1.8302185842437622e-05, + "loss": 0.4034, + "step": 6796 + }, + { + "epoch": 2.764131760878406, + "grad_norm": 2.173575799851196, + "learning_rate": 1.8301621010240878e-05, + "loss": 0.0285, + "step": 6797 + }, + { + "epoch": 2.7645384302562017, + "grad_norm": 8.386654437195375, + "learning_rate": 1.8301056092823813e-05, + "loss": 0.3196, + "step": 6798 + }, + { + "epoch": 2.7649450996339975, + "grad_norm": 3.7721965169525045, + "learning_rate": 1.8300491090192228e-05, + "loss": 0.0655, + "step": 6799 + }, + { + "epoch": 2.7653517690117932, + "grad_norm": 6.779857943669217, + "learning_rate": 1.8299926002351928e-05, + "loss": 0.1205, + "step": 6800 + }, + { + "epoch": 2.7657584383895895, + "grad_norm": 2.9215248399990985, + "learning_rate": 1.829936082930871e-05, + "loss": 0.0415, + "step": 6801 + }, + { + "epoch": 2.7661651077673852, + "grad_norm": 3.8259998187530635, + "learning_rate": 1.8298795571068374e-05, + "loss": 0.0551, + "step": 6802 + }, + { + "epoch": 2.766571777145181, + "grad_norm": 0.3676471328234135, + "learning_rate": 1.8298230227636726e-05, + "loss": 0.0049, + "step": 6803 + }, + { + "epoch": 2.7669784465229768, + "grad_norm": 25.38937970672918, + "learning_rate": 1.8297664799019567e-05, + "loss": 0.8004, + "step": 6804 + }, + { + "epoch": 2.7673851159007725, + "grad_norm": 7.196923520305196, + "learning_rate": 1.8297099285222707e-05, + "loss": 0.1338, + "step": 6805 + }, + { + "epoch": 2.7677917852785683, + "grad_norm": 17.52444547988911, + "learning_rate": 1.8296533686251945e-05, + "loss": 0.6175, + "step": 6806 + }, + { + "epoch": 2.7681984546563645, + "grad_norm": 2.1190102931840102, + "learning_rate": 1.8295968002113088e-05, + "loss": 0.0312, + "step": 6807 + }, + { + "epoch": 2.7686051240341603, + "grad_norm": 5.5714061529382555, + "learning_rate": 1.8295402232811942e-05, + "loss": 0.0884, + "step": 6808 + }, + { + "epoch": 2.769011793411956, + "grad_norm": 10.347379902829884, + "learning_rate": 1.829483637835432e-05, + "loss": 0.2952, + "step": 6809 + }, + { + "epoch": 2.769418462789752, + "grad_norm": 5.529274812049858, + "learning_rate": 1.8294270438746028e-05, + "loss": 0.0397, + "step": 6810 + }, + { + "epoch": 2.769825132167548, + "grad_norm": 11.723589264416633, + "learning_rate": 1.8293704413992874e-05, + "loss": 0.2412, + "step": 6811 + }, + { + "epoch": 2.7702318015453438, + "grad_norm": 12.34510588547873, + "learning_rate": 1.829313830410067e-05, + "loss": 0.7674, + "step": 6812 + }, + { + "epoch": 2.7706384709231395, + "grad_norm": 0.6342554676077706, + "learning_rate": 1.829257210907523e-05, + "loss": 0.0101, + "step": 6813 + }, + { + "epoch": 2.7710451403009353, + "grad_norm": 14.427761176876976, + "learning_rate": 1.829200582892236e-05, + "loss": 0.308, + "step": 6814 + }, + { + "epoch": 2.771451809678731, + "grad_norm": 0.21146162592883866, + "learning_rate": 1.829143946364788e-05, + "loss": 0.0026, + "step": 6815 + }, + { + "epoch": 2.771858479056527, + "grad_norm": 5.4561283399514044, + "learning_rate": 1.8290873013257598e-05, + "loss": 0.1259, + "step": 6816 + }, + { + "epoch": 2.772265148434323, + "grad_norm": 2.823718279521012, + "learning_rate": 1.8290306477757336e-05, + "loss": 0.0441, + "step": 6817 + }, + { + "epoch": 2.772671817812119, + "grad_norm": 12.145578337611305, + "learning_rate": 
1.8289739857152903e-05, + "loss": 0.6932, + "step": 6818 + }, + { + "epoch": 2.7730784871899146, + "grad_norm": 18.583530951905534, + "learning_rate": 1.8289173151450118e-05, + "loss": 0.4169, + "step": 6819 + }, + { + "epoch": 2.7734851565677103, + "grad_norm": 11.918339044247205, + "learning_rate": 1.82886063606548e-05, + "loss": 0.9229, + "step": 6820 + }, + { + "epoch": 2.7738918259455065, + "grad_norm": 4.831695011198518, + "learning_rate": 1.8288039484772767e-05, + "loss": 0.086, + "step": 6821 + }, + { + "epoch": 2.7742984953233023, + "grad_norm": 8.478234920235572, + "learning_rate": 1.8287472523809834e-05, + "loss": 0.1731, + "step": 6822 + }, + { + "epoch": 2.774705164701098, + "grad_norm": 9.360684979833048, + "learning_rate": 1.8286905477771827e-05, + "loss": 0.9263, + "step": 6823 + }, + { + "epoch": 2.775111834078894, + "grad_norm": 10.44095316123758, + "learning_rate": 1.8286338346664563e-05, + "loss": 0.6375, + "step": 6824 + }, + { + "epoch": 2.7755185034566896, + "grad_norm": 6.318917508233332, + "learning_rate": 1.8285771130493866e-05, + "loss": 0.1362, + "step": 6825 + }, + { + "epoch": 2.7759251728344854, + "grad_norm": 2.683407411570116, + "learning_rate": 1.8285203829265558e-05, + "loss": 0.0655, + "step": 6826 + }, + { + "epoch": 2.7763318422122816, + "grad_norm": 6.295672701242414, + "learning_rate": 1.8284636442985464e-05, + "loss": 0.3361, + "step": 6827 + }, + { + "epoch": 2.7767385115900773, + "grad_norm": 6.045474715520141, + "learning_rate": 1.8284068971659404e-05, + "loss": 0.0999, + "step": 6828 + }, + { + "epoch": 2.777145180967873, + "grad_norm": 11.46799295819404, + "learning_rate": 1.828350141529321e-05, + "loss": 1.1637, + "step": 6829 + }, + { + "epoch": 2.777551850345669, + "grad_norm": 7.174936360456397, + "learning_rate": 1.8282933773892707e-05, + "loss": 0.238, + "step": 6830 + }, + { + "epoch": 2.777958519723465, + "grad_norm": 67.47629465484151, + "learning_rate": 1.8282366047463718e-05, + "loss": 1.2531, + "step": 6831 + }, + { + "epoch": 2.778365189101261, + "grad_norm": 5.07855133380671, + "learning_rate": 1.8281798236012073e-05, + "loss": 0.09, + "step": 6832 + }, + { + "epoch": 2.7787718584790566, + "grad_norm": 2.719655483289113, + "learning_rate": 1.82812303395436e-05, + "loss": 0.0569, + "step": 6833 + }, + { + "epoch": 2.7791785278568524, + "grad_norm": 15.906461025976014, + "learning_rate": 1.828066235806413e-05, + "loss": 0.565, + "step": 6834 + }, + { + "epoch": 2.779585197234648, + "grad_norm": 7.436047193901008, + "learning_rate": 1.8280094291579493e-05, + "loss": 0.2855, + "step": 6835 + }, + { + "epoch": 2.779991866612444, + "grad_norm": 22.251007503800253, + "learning_rate": 1.827952614009552e-05, + "loss": 1.2279, + "step": 6836 + }, + { + "epoch": 2.7803985359902397, + "grad_norm": 8.411252057677292, + "learning_rate": 1.827895790361805e-05, + "loss": 0.3404, + "step": 6837 + }, + { + "epoch": 2.780805205368036, + "grad_norm": 16.665261777635273, + "learning_rate": 1.82783895821529e-05, + "loss": 0.1118, + "step": 6838 + }, + { + "epoch": 2.7812118747458316, + "grad_norm": 10.603347351178028, + "learning_rate": 1.8277821175705923e-05, + "loss": 0.2283, + "step": 6839 + }, + { + "epoch": 2.7816185441236274, + "grad_norm": 13.182853547085532, + "learning_rate": 1.8277252684282943e-05, + "loss": 0.8399, + "step": 6840 + }, + { + "epoch": 2.782025213501423, + "grad_norm": 9.708663619278227, + "learning_rate": 1.82766841078898e-05, + "loss": 0.4716, + "step": 6841 + }, + { + "epoch": 2.7824318828792194, + "grad_norm": 
25.61964272142809, + "learning_rate": 1.8276115446532324e-05, + "loss": 0.9543, + "step": 6842 + }, + { + "epoch": 2.782838552257015, + "grad_norm": 51.464804488281096, + "learning_rate": 1.827554670021636e-05, + "loss": 2.3398, + "step": 6843 + }, + { + "epoch": 2.783245221634811, + "grad_norm": 3.755784205560984, + "learning_rate": 1.8274977868947747e-05, + "loss": 0.0688, + "step": 6844 + }, + { + "epoch": 2.7836518910126067, + "grad_norm": 12.104516105505121, + "learning_rate": 1.827440895273232e-05, + "loss": 0.4361, + "step": 6845 + }, + { + "epoch": 2.7840585603904024, + "grad_norm": 4.584563579545444, + "learning_rate": 1.8273839951575923e-05, + "loss": 0.2576, + "step": 6846 + }, + { + "epoch": 2.784465229768198, + "grad_norm": 9.68850328815009, + "learning_rate": 1.8273270865484392e-05, + "loss": 0.3784, + "step": 6847 + }, + { + "epoch": 2.7848718991459944, + "grad_norm": 10.551199337735822, + "learning_rate": 1.8272701694463572e-05, + "loss": 0.277, + "step": 6848 + }, + { + "epoch": 2.78527856852379, + "grad_norm": 10.818683045740393, + "learning_rate": 1.8272132438519302e-05, + "loss": 0.5643, + "step": 6849 + }, + { + "epoch": 2.785685237901586, + "grad_norm": 15.259158810705712, + "learning_rate": 1.8271563097657436e-05, + "loss": 0.6567, + "step": 6850 + }, + { + "epoch": 2.7860919072793817, + "grad_norm": 11.495331897031553, + "learning_rate": 1.8270993671883806e-05, + "loss": 0.8476, + "step": 6851 + }, + { + "epoch": 2.786498576657178, + "grad_norm": 3.6115242822818727, + "learning_rate": 1.8270424161204264e-05, + "loss": 0.0724, + "step": 6852 + }, + { + "epoch": 2.7869052460349737, + "grad_norm": 12.589604841738648, + "learning_rate": 1.826985456562466e-05, + "loss": 0.3671, + "step": 6853 + }, + { + "epoch": 2.7873119154127695, + "grad_norm": 8.860332519055023, + "learning_rate": 1.8269284885150834e-05, + "loss": 0.2482, + "step": 6854 + }, + { + "epoch": 2.7877185847905652, + "grad_norm": 10.31041613714681, + "learning_rate": 1.8268715119788637e-05, + "loss": 0.1561, + "step": 6855 + }, + { + "epoch": 2.788125254168361, + "grad_norm": 11.076922169630777, + "learning_rate": 1.8268145269543914e-05, + "loss": 0.8996, + "step": 6856 + }, + { + "epoch": 2.7885319235461568, + "grad_norm": 8.554114927398295, + "learning_rate": 1.8267575334422525e-05, + "loss": 0.272, + "step": 6857 + }, + { + "epoch": 2.788938592923953, + "grad_norm": 11.485884310649613, + "learning_rate": 1.826700531443031e-05, + "loss": 0.7594, + "step": 6858 + }, + { + "epoch": 2.7893452623017487, + "grad_norm": 12.238139917420234, + "learning_rate": 1.8266435209573126e-05, + "loss": 0.3446, + "step": 6859 + }, + { + "epoch": 2.7897519316795445, + "grad_norm": 9.189734795537493, + "learning_rate": 1.8265865019856822e-05, + "loss": 0.4925, + "step": 6860 + }, + { + "epoch": 2.7901586010573403, + "grad_norm": 24.976679199394766, + "learning_rate": 1.8265294745287258e-05, + "loss": 1.6601, + "step": 6861 + }, + { + "epoch": 2.7905652704351365, + "grad_norm": 12.369061425655875, + "learning_rate": 1.826472438587028e-05, + "loss": 0.5269, + "step": 6862 + }, + { + "epoch": 2.7909719398129322, + "grad_norm": 4.818549543550937, + "learning_rate": 1.826415394161175e-05, + "loss": 0.1167, + "step": 6863 + }, + { + "epoch": 2.791378609190728, + "grad_norm": 13.621365593628095, + "learning_rate": 1.826358341251752e-05, + "loss": 0.8368, + "step": 6864 + }, + { + "epoch": 2.7917852785685238, + "grad_norm": 5.369494769316124, + "learning_rate": 1.826301279859344e-05, + "loss": 0.069, + "step": 6865 + }, + { + 
"epoch": 2.7921919479463195, + "grad_norm": 8.73591818168577, + "learning_rate": 1.8262442099845382e-05, + "loss": 0.1536, + "step": 6866 + }, + { + "epoch": 2.7925986173241153, + "grad_norm": 10.333900437834718, + "learning_rate": 1.8261871316279198e-05, + "loss": 0.4695, + "step": 6867 + }, + { + "epoch": 2.7930052867019115, + "grad_norm": 8.428033715840835, + "learning_rate": 1.8261300447900744e-05, + "loss": 0.664, + "step": 6868 + }, + { + "epoch": 2.7934119560797073, + "grad_norm": 7.148892492916774, + "learning_rate": 1.8260729494715887e-05, + "loss": 0.259, + "step": 6869 + }, + { + "epoch": 2.793818625457503, + "grad_norm": 8.735888524021393, + "learning_rate": 1.8260158456730475e-05, + "loss": 0.4913, + "step": 6870 + }, + { + "epoch": 2.794225294835299, + "grad_norm": 9.721432153701238, + "learning_rate": 1.825958733395039e-05, + "loss": 0.5012, + "step": 6871 + }, + { + "epoch": 2.794631964213095, + "grad_norm": 9.255868697154202, + "learning_rate": 1.825901612638148e-05, + "loss": 0.2676, + "step": 6872 + }, + { + "epoch": 2.7950386335908908, + "grad_norm": 5.538470901971823, + "learning_rate": 1.825844483402961e-05, + "loss": 0.119, + "step": 6873 + }, + { + "epoch": 2.7954453029686865, + "grad_norm": 9.570743778690153, + "learning_rate": 1.825787345690065e-05, + "loss": 0.6066, + "step": 6874 + }, + { + "epoch": 2.7958519723464823, + "grad_norm": 14.263990762863221, + "learning_rate": 1.825730199500046e-05, + "loss": 0.7298, + "step": 6875 + }, + { + "epoch": 2.796258641724278, + "grad_norm": 7.574517374173188, + "learning_rate": 1.8256730448334915e-05, + "loss": 0.2449, + "step": 6876 + }, + { + "epoch": 2.796665311102074, + "grad_norm": 9.108924312700568, + "learning_rate": 1.825615881690987e-05, + "loss": 0.3205, + "step": 6877 + }, + { + "epoch": 2.7970719804798696, + "grad_norm": 5.85495746148092, + "learning_rate": 1.8255587100731204e-05, + "loss": 0.1418, + "step": 6878 + }, + { + "epoch": 2.797478649857666, + "grad_norm": 12.146537193590486, + "learning_rate": 1.825501529980478e-05, + "loss": 0.5245, + "step": 6879 + }, + { + "epoch": 2.7978853192354616, + "grad_norm": 6.78318186409458, + "learning_rate": 1.8254443414136468e-05, + "loss": 0.1507, + "step": 6880 + }, + { + "epoch": 2.7982919886132573, + "grad_norm": 8.638164714538483, + "learning_rate": 1.8253871443732144e-05, + "loss": 0.4682, + "step": 6881 + }, + { + "epoch": 2.7986986579910536, + "grad_norm": 10.106327282288882, + "learning_rate": 1.8253299388597673e-05, + "loss": 0.5129, + "step": 6882 + }, + { + "epoch": 2.7991053273688493, + "grad_norm": 9.832817617189374, + "learning_rate": 1.825272724873893e-05, + "loss": 0.3338, + "step": 6883 + }, + { + "epoch": 2.799511996746645, + "grad_norm": 10.70588305073557, + "learning_rate": 1.8252155024161786e-05, + "loss": 0.4029, + "step": 6884 + }, + { + "epoch": 2.799918666124441, + "grad_norm": 17.59457798473913, + "learning_rate": 1.8251582714872122e-05, + "loss": 1.1299, + "step": 6885 + }, + { + "epoch": 2.8003253355022366, + "grad_norm": 9.458341740688917, + "learning_rate": 1.8251010320875805e-05, + "loss": 0.234, + "step": 6886 + }, + { + "epoch": 2.8007320048800324, + "grad_norm": 11.233650301945412, + "learning_rate": 1.8250437842178715e-05, + "loss": 0.4788, + "step": 6887 + }, + { + "epoch": 2.801138674257828, + "grad_norm": 27.749909747278732, + "learning_rate": 1.824986527878673e-05, + "loss": 0.4172, + "step": 6888 + }, + { + "epoch": 2.8015453436356244, + "grad_norm": 3.7398490695470277, + "learning_rate": 1.8249292630705723e-05, + "loss": 
0.0708, + "step": 6889 + }, + { + "epoch": 2.80195201301342, + "grad_norm": 4.076922075953393, + "learning_rate": 1.8248719897941578e-05, + "loss": 0.1759, + "step": 6890 + }, + { + "epoch": 2.802358682391216, + "grad_norm": 5.594072629722326, + "learning_rate": 1.824814708050017e-05, + "loss": 0.2786, + "step": 6891 + }, + { + "epoch": 2.8027653517690116, + "grad_norm": 13.538986811451078, + "learning_rate": 1.8247574178387382e-05, + "loss": 0.8015, + "step": 6892 + }, + { + "epoch": 2.803172021146808, + "grad_norm": 4.848889585920012, + "learning_rate": 1.8247001191609094e-05, + "loss": 0.0786, + "step": 6893 + }, + { + "epoch": 2.8035786905246036, + "grad_norm": 7.965314420029231, + "learning_rate": 1.8246428120171185e-05, + "loss": 0.2214, + "step": 6894 + }, + { + "epoch": 2.8039853599023994, + "grad_norm": 7.410569353423573, + "learning_rate": 1.8245854964079546e-05, + "loss": 0.4729, + "step": 6895 + }, + { + "epoch": 2.804392029280195, + "grad_norm": 9.834407492676915, + "learning_rate": 1.8245281723340053e-05, + "loss": 0.5347, + "step": 6896 + }, + { + "epoch": 2.804798698657991, + "grad_norm": 9.02942137242485, + "learning_rate": 1.8244708397958596e-05, + "loss": 0.2939, + "step": 6897 + }, + { + "epoch": 2.8052053680357867, + "grad_norm": 8.066454279452957, + "learning_rate": 1.8244134987941052e-05, + "loss": 0.3532, + "step": 6898 + }, + { + "epoch": 2.805612037413583, + "grad_norm": 3.3487845530141294, + "learning_rate": 1.824356149329332e-05, + "loss": 0.0392, + "step": 6899 + }, + { + "epoch": 2.8060187067913787, + "grad_norm": 6.796887942798016, + "learning_rate": 1.824298791402128e-05, + "loss": 0.3508, + "step": 6900 + }, + { + "epoch": 2.8064253761691744, + "grad_norm": 0.5670416278103849, + "learning_rate": 1.8242414250130813e-05, + "loss": 0.0115, + "step": 6901 + }, + { + "epoch": 2.80683204554697, + "grad_norm": 10.24520317945498, + "learning_rate": 1.824184050162782e-05, + "loss": 0.531, + "step": 6902 + }, + { + "epoch": 2.8072387149247664, + "grad_norm": 5.86184652432764, + "learning_rate": 1.8241266668518187e-05, + "loss": 0.1142, + "step": 6903 + }, + { + "epoch": 2.807645384302562, + "grad_norm": 9.97658337579319, + "learning_rate": 1.8240692750807803e-05, + "loss": 0.3299, + "step": 6904 + }, + { + "epoch": 2.808052053680358, + "grad_norm": 4.6302934581467605, + "learning_rate": 1.824011874850256e-05, + "loss": 0.172, + "step": 6905 + }, + { + "epoch": 2.8084587230581537, + "grad_norm": 5.154750867029752, + "learning_rate": 1.8239544661608356e-05, + "loss": 0.1166, + "step": 6906 + }, + { + "epoch": 2.8088653924359495, + "grad_norm": 12.020735564178958, + "learning_rate": 1.8238970490131072e-05, + "loss": 0.3746, + "step": 6907 + }, + { + "epoch": 2.8092720618137452, + "grad_norm": 8.893990464622297, + "learning_rate": 1.8238396234076614e-05, + "loss": 0.2015, + "step": 6908 + }, + { + "epoch": 2.8096787311915414, + "grad_norm": 11.551989284488844, + "learning_rate": 1.823782189345087e-05, + "loss": 0.4577, + "step": 6909 + }, + { + "epoch": 2.810085400569337, + "grad_norm": 7.66833419142754, + "learning_rate": 1.8237247468259736e-05, + "loss": 0.4241, + "step": 6910 + }, + { + "epoch": 2.810492069947133, + "grad_norm": 77.79649092132586, + "learning_rate": 1.8236672958509115e-05, + "loss": 0.3323, + "step": 6911 + }, + { + "epoch": 2.8108987393249287, + "grad_norm": 13.124634441325796, + "learning_rate": 1.82360983642049e-05, + "loss": 0.2918, + "step": 6912 + }, + { + "epoch": 2.811305408702725, + "grad_norm": 20.92724001038439, + "learning_rate": 
1.823552368535299e-05, + "loss": 0.998, + "step": 6913 + }, + { + "epoch": 2.8117120780805207, + "grad_norm": 8.668422062064723, + "learning_rate": 1.8234948921959285e-05, + "loss": 0.206, + "step": 6914 + }, + { + "epoch": 2.8121187474583165, + "grad_norm": 1.8203344461124475, + "learning_rate": 1.8234374074029682e-05, + "loss": 0.0264, + "step": 6915 + }, + { + "epoch": 2.8125254168361122, + "grad_norm": 2.0686033983203407, + "learning_rate": 1.8233799141570087e-05, + "loss": 0.1147, + "step": 6916 + }, + { + "epoch": 2.812932086213908, + "grad_norm": 0.5635481315377332, + "learning_rate": 1.82332241245864e-05, + "loss": 0.0063, + "step": 6917 + }, + { + "epoch": 2.8133387555917038, + "grad_norm": 5.116387845242363, + "learning_rate": 1.823264902308452e-05, + "loss": 0.0797, + "step": 6918 + }, + { + "epoch": 2.8137454249694995, + "grad_norm": 5.974090805069978, + "learning_rate": 1.823207383707036e-05, + "loss": 0.1307, + "step": 6919 + }, + { + "epoch": 2.8141520943472957, + "grad_norm": 20.730224734448512, + "learning_rate": 1.8231498566549816e-05, + "loss": 1.7863, + "step": 6920 + }, + { + "epoch": 2.8145587637250915, + "grad_norm": 2.603791785686001, + "learning_rate": 1.82309232115288e-05, + "loss": 0.058, + "step": 6921 + }, + { + "epoch": 2.8149654331028873, + "grad_norm": 14.344509054027275, + "learning_rate": 1.823034777201321e-05, + "loss": 0.6735, + "step": 6922 + }, + { + "epoch": 2.8153721024806835, + "grad_norm": 3.591512907141688, + "learning_rate": 1.8229772248008956e-05, + "loss": 0.0621, + "step": 6923 + }, + { + "epoch": 2.8157787718584792, + "grad_norm": 6.534189783437571, + "learning_rate": 1.8229196639521954e-05, + "loss": 0.1397, + "step": 6924 + }, + { + "epoch": 2.816185441236275, + "grad_norm": 5.563016349039377, + "learning_rate": 1.8228620946558104e-05, + "loss": 0.1272, + "step": 6925 + }, + { + "epoch": 2.8165921106140708, + "grad_norm": 7.824926083122868, + "learning_rate": 1.822804516912332e-05, + "loss": 0.2364, + "step": 6926 + }, + { + "epoch": 2.8169987799918665, + "grad_norm": 8.804408841390327, + "learning_rate": 1.822746930722351e-05, + "loss": 0.2081, + "step": 6927 + }, + { + "epoch": 2.8174054493696623, + "grad_norm": 14.252702009807438, + "learning_rate": 1.8226893360864585e-05, + "loss": 0.649, + "step": 6928 + }, + { + "epoch": 2.817812118747458, + "grad_norm": 2.7290350129785668, + "learning_rate": 1.8226317330052465e-05, + "loss": 0.0435, + "step": 6929 + }, + { + "epoch": 2.8182187881252543, + "grad_norm": 3.7624005567097476, + "learning_rate": 1.822574121479305e-05, + "loss": 0.0793, + "step": 6930 + }, + { + "epoch": 2.81862545750305, + "grad_norm": 34.73138462910311, + "learning_rate": 1.8225165015092264e-05, + "loss": 0.4586, + "step": 6931 + }, + { + "epoch": 2.819032126880846, + "grad_norm": 7.975766325928812, + "learning_rate": 1.8224588730956022e-05, + "loss": 0.2259, + "step": 6932 + }, + { + "epoch": 2.8194387962586416, + "grad_norm": 18.487565154820587, + "learning_rate": 1.8224012362390236e-05, + "loss": 0.9358, + "step": 6933 + }, + { + "epoch": 2.819845465636438, + "grad_norm": 8.369459004616905, + "learning_rate": 1.8223435909400825e-05, + "loss": 0.1607, + "step": 6934 + }, + { + "epoch": 2.8202521350142336, + "grad_norm": 14.991295702139663, + "learning_rate": 1.82228593719937e-05, + "loss": 0.9282, + "step": 6935 + }, + { + "epoch": 2.8206588043920293, + "grad_norm": 11.069968697972753, + "learning_rate": 1.822228275017479e-05, + "loss": 0.7709, + "step": 6936 + }, + { + "epoch": 2.821065473769825, + "grad_norm": 
11.402229965860219, + "learning_rate": 1.822170604395001e-05, + "loss": 0.5345, + "step": 6937 + }, + { + "epoch": 2.821472143147621, + "grad_norm": 12.271502637174121, + "learning_rate": 1.8221129253325278e-05, + "loss": 0.5281, + "step": 6938 + }, + { + "epoch": 2.8218788125254166, + "grad_norm": 3.679031011847212, + "learning_rate": 1.822055237830652e-05, + "loss": 0.1601, + "step": 6939 + }, + { + "epoch": 2.822285481903213, + "grad_norm": 20.54460779185343, + "learning_rate": 1.821997541889965e-05, + "loss": 1.4528, + "step": 6940 + }, + { + "epoch": 2.8226921512810086, + "grad_norm": 8.21680905614301, + "learning_rate": 1.82193983751106e-05, + "loss": 0.2248, + "step": 6941 + }, + { + "epoch": 2.8230988206588044, + "grad_norm": 9.18484204025711, + "learning_rate": 1.8218821246945283e-05, + "loss": 0.5718, + "step": 6942 + }, + { + "epoch": 2.8235054900366, + "grad_norm": 12.240550723415364, + "learning_rate": 1.821824403440963e-05, + "loss": 0.5067, + "step": 6943 + }, + { + "epoch": 2.8239121594143963, + "grad_norm": 23.900881651952798, + "learning_rate": 1.8217666737509568e-05, + "loss": 2.3142, + "step": 6944 + }, + { + "epoch": 2.824318828792192, + "grad_norm": 11.929281297774287, + "learning_rate": 1.821708935625102e-05, + "loss": 0.6224, + "step": 6945 + }, + { + "epoch": 2.824725498169988, + "grad_norm": 15.606157532102163, + "learning_rate": 1.821651189063992e-05, + "loss": 0.5394, + "step": 6946 + }, + { + "epoch": 2.8251321675477836, + "grad_norm": 15.908209491784666, + "learning_rate": 1.8215934340682183e-05, + "loss": 0.558, + "step": 6947 + }, + { + "epoch": 2.8255388369255794, + "grad_norm": 12.460061850469144, + "learning_rate": 1.8215356706383748e-05, + "loss": 0.3392, + "step": 6948 + }, + { + "epoch": 2.825945506303375, + "grad_norm": 3.039834940948464, + "learning_rate": 1.8214778987750538e-05, + "loss": 0.0644, + "step": 6949 + }, + { + "epoch": 2.8263521756811714, + "grad_norm": 13.660623247630639, + "learning_rate": 1.8214201184788493e-05, + "loss": 0.8802, + "step": 6950 + }, + { + "epoch": 2.826758845058967, + "grad_norm": 9.181134679582861, + "learning_rate": 1.8213623297503535e-05, + "loss": 0.4236, + "step": 6951 + }, + { + "epoch": 2.827165514436763, + "grad_norm": 9.334842673353785, + "learning_rate": 1.82130453259016e-05, + "loss": 0.661, + "step": 6952 + }, + { + "epoch": 2.8275721838145587, + "grad_norm": 5.734242924554125, + "learning_rate": 1.8212467269988627e-05, + "loss": 0.1893, + "step": 6953 + }, + { + "epoch": 2.827978853192355, + "grad_norm": 13.393479711244161, + "learning_rate": 1.8211889129770534e-05, + "loss": 0.8018, + "step": 6954 + }, + { + "epoch": 2.8283855225701506, + "grad_norm": 10.79369884721406, + "learning_rate": 1.8211310905253276e-05, + "loss": 0.5076, + "step": 6955 + }, + { + "epoch": 2.8287921919479464, + "grad_norm": 13.930062026817943, + "learning_rate": 1.8210732596442772e-05, + "loss": 0.5944, + "step": 6956 + }, + { + "epoch": 2.829198861325742, + "grad_norm": 10.383191837211946, + "learning_rate": 1.8210154203344968e-05, + "loss": 0.5159, + "step": 6957 + }, + { + "epoch": 2.829605530703538, + "grad_norm": 8.529235268661449, + "learning_rate": 1.82095757259658e-05, + "loss": 0.2853, + "step": 6958 + }, + { + "epoch": 2.8300122000813337, + "grad_norm": 12.1429286994843, + "learning_rate": 1.8208997164311205e-05, + "loss": 0.7991, + "step": 6959 + }, + { + "epoch": 2.8304188694591295, + "grad_norm": 2.899133209999944, + "learning_rate": 1.8208418518387124e-05, + "loss": 0.0476, + "step": 6960 + }, + { + "epoch": 
2.8308255388369257, + "grad_norm": 7.3129045107460655, + "learning_rate": 1.8207839788199494e-05, + "loss": 0.2941, + "step": 6961 + }, + { + "epoch": 2.8312322082147214, + "grad_norm": 11.01588130842547, + "learning_rate": 1.8207260973754255e-05, + "loss": 0.5708, + "step": 6962 + }, + { + "epoch": 2.831638877592517, + "grad_norm": 4.260498098845905, + "learning_rate": 1.8206682075057355e-05, + "loss": 0.0915, + "step": 6963 + }, + { + "epoch": 2.8320455469703134, + "grad_norm": 9.023097798736245, + "learning_rate": 1.820610309211473e-05, + "loss": 0.5135, + "step": 6964 + }, + { + "epoch": 2.832452216348109, + "grad_norm": 13.79210638444526, + "learning_rate": 1.820552402493233e-05, + "loss": 0.4941, + "step": 6965 + }, + { + "epoch": 2.832858885725905, + "grad_norm": 3.194295610029268, + "learning_rate": 1.8204944873516096e-05, + "loss": 0.0536, + "step": 6966 + }, + { + "epoch": 2.8332655551037007, + "grad_norm": 8.311571997992678, + "learning_rate": 1.820436563787197e-05, + "loss": 0.2796, + "step": 6967 + }, + { + "epoch": 2.8336722244814965, + "grad_norm": 3.052709495079013, + "learning_rate": 1.8203786318005904e-05, + "loss": 0.0589, + "step": 6968 + }, + { + "epoch": 2.8340788938592922, + "grad_norm": 4.620635902127593, + "learning_rate": 1.8203206913923842e-05, + "loss": 0.0813, + "step": 6969 + }, + { + "epoch": 2.834485563237088, + "grad_norm": 6.418859454976095, + "learning_rate": 1.8202627425631733e-05, + "loss": 0.2938, + "step": 6970 + }, + { + "epoch": 2.834892232614884, + "grad_norm": 1.5887703801920718, + "learning_rate": 1.8202047853135526e-05, + "loss": 0.0298, + "step": 6971 + }, + { + "epoch": 2.83529890199268, + "grad_norm": 8.076598684064178, + "learning_rate": 1.820146819644117e-05, + "loss": 0.2004, + "step": 6972 + }, + { + "epoch": 2.8357055713704757, + "grad_norm": 11.668037063068525, + "learning_rate": 1.820088845555461e-05, + "loss": 0.6928, + "step": 6973 + }, + { + "epoch": 2.8361122407482715, + "grad_norm": 10.359289012750933, + "learning_rate": 1.820030863048181e-05, + "loss": 0.799, + "step": 6974 + }, + { + "epoch": 2.8365189101260677, + "grad_norm": 5.666475515549229, + "learning_rate": 1.819972872122871e-05, + "loss": 0.3035, + "step": 6975 + }, + { + "epoch": 2.8369255795038635, + "grad_norm": 2.7670233415217464, + "learning_rate": 1.819914872780127e-05, + "loss": 0.0471, + "step": 6976 + }, + { + "epoch": 2.8373322488816592, + "grad_norm": 6.96408313880083, + "learning_rate": 1.8198568650205443e-05, + "loss": 0.1746, + "step": 6977 + }, + { + "epoch": 2.837738918259455, + "grad_norm": 5.766878523371252, + "learning_rate": 1.8197988488447178e-05, + "loss": 0.2661, + "step": 6978 + }, + { + "epoch": 2.8381455876372508, + "grad_norm": 2.6111163174235497, + "learning_rate": 1.819740824253244e-05, + "loss": 0.0538, + "step": 6979 + }, + { + "epoch": 2.8385522570150465, + "grad_norm": 6.546772649293999, + "learning_rate": 1.819682791246718e-05, + "loss": 0.284, + "step": 6980 + }, + { + "epoch": 2.8389589263928428, + "grad_norm": 5.884667565897406, + "learning_rate": 1.8196247498257352e-05, + "loss": 0.0901, + "step": 6981 + }, + { + "epoch": 2.8393655957706385, + "grad_norm": 12.494726683778069, + "learning_rate": 1.819566699990892e-05, + "loss": 1.1729, + "step": 6982 + }, + { + "epoch": 2.8397722651484343, + "grad_norm": 6.086824276950609, + "learning_rate": 1.8195086417427845e-05, + "loss": 0.3056, + "step": 6983 + }, + { + "epoch": 2.84017893452623, + "grad_norm": 1.6845605392751322, + "learning_rate": 1.819450575082008e-05, + "loss": 0.0292, + 
"step": 6984 + }, + { + "epoch": 2.8405856039040263, + "grad_norm": 2.9275273994817534, + "learning_rate": 1.819392500009159e-05, + "loss": 0.0493, + "step": 6985 + }, + { + "epoch": 2.840992273281822, + "grad_norm": 1.6765611140706158, + "learning_rate": 1.8193344165248333e-05, + "loss": 0.0247, + "step": 6986 + }, + { + "epoch": 2.841398942659618, + "grad_norm": 8.205006704448657, + "learning_rate": 1.819276324629628e-05, + "loss": 0.1582, + "step": 6987 + }, + { + "epoch": 2.8418056120374136, + "grad_norm": 7.347431629900932, + "learning_rate": 1.8192182243241387e-05, + "loss": 0.1057, + "step": 6988 + }, + { + "epoch": 2.8422122814152093, + "grad_norm": 15.663073387452538, + "learning_rate": 1.819160115608962e-05, + "loss": 0.3551, + "step": 6989 + }, + { + "epoch": 2.842618950793005, + "grad_norm": 9.294117767773797, + "learning_rate": 1.8191019984846944e-05, + "loss": 0.4459, + "step": 6990 + }, + { + "epoch": 2.8430256201708013, + "grad_norm": 5.772444883549947, + "learning_rate": 1.8190438729519325e-05, + "loss": 0.2832, + "step": 6991 + }, + { + "epoch": 2.843432289548597, + "grad_norm": 2.929386231687769, + "learning_rate": 1.8189857390112732e-05, + "loss": 0.0608, + "step": 6992 + }, + { + "epoch": 2.843838958926393, + "grad_norm": 12.29575497142164, + "learning_rate": 1.8189275966633132e-05, + "loss": 0.4479, + "step": 6993 + }, + { + "epoch": 2.8442456283041886, + "grad_norm": 7.08718581481562, + "learning_rate": 1.818869445908649e-05, + "loss": 0.216, + "step": 6994 + }, + { + "epoch": 2.844652297681985, + "grad_norm": 8.386959321223348, + "learning_rate": 1.8188112867478777e-05, + "loss": 0.3085, + "step": 6995 + }, + { + "epoch": 2.8450589670597806, + "grad_norm": 9.76894239537005, + "learning_rate": 1.8187531191815967e-05, + "loss": 0.2837, + "step": 6996 + }, + { + "epoch": 2.8454656364375763, + "grad_norm": 9.70529298991684, + "learning_rate": 1.8186949432104027e-05, + "loss": 0.5836, + "step": 6997 + }, + { + "epoch": 2.845872305815372, + "grad_norm": 11.24237158944708, + "learning_rate": 1.8186367588348928e-05, + "loss": 0.3439, + "step": 6998 + }, + { + "epoch": 2.846278975193168, + "grad_norm": 27.464806375870104, + "learning_rate": 1.8185785660556652e-05, + "loss": 1.5837, + "step": 6999 + }, + { + "epoch": 2.8466856445709636, + "grad_norm": 14.88453574967283, + "learning_rate": 1.818520364873316e-05, + "loss": 0.5174, + "step": 7000 + }, + { + "epoch": 2.8470923139487594, + "grad_norm": 1.3217919486871288, + "learning_rate": 1.818462155288444e-05, + "loss": 0.0314, + "step": 7001 + }, + { + "epoch": 2.8474989833265556, + "grad_norm": 5.6353546389449845, + "learning_rate": 1.8184039373016455e-05, + "loss": 0.1669, + "step": 7002 + }, + { + "epoch": 2.8479056527043514, + "grad_norm": 2.982676047493815, + "learning_rate": 1.818345710913519e-05, + "loss": 0.0767, + "step": 7003 + }, + { + "epoch": 2.848312322082147, + "grad_norm": 6.452966909145852, + "learning_rate": 1.818287476124662e-05, + "loss": 0.2805, + "step": 7004 + }, + { + "epoch": 2.8487189914599433, + "grad_norm": 7.52725791011491, + "learning_rate": 1.818229232935672e-05, + "loss": 0.3142, + "step": 7005 + }, + { + "epoch": 2.849125660837739, + "grad_norm": 8.44618137280823, + "learning_rate": 1.818170981347147e-05, + "loss": 0.2162, + "step": 7006 + }, + { + "epoch": 2.849532330215535, + "grad_norm": 6.290217183364609, + "learning_rate": 1.8181127213596852e-05, + "loss": 0.3082, + "step": 7007 + }, + { + "epoch": 2.8499389995933306, + "grad_norm": 10.405554465549391, + "learning_rate": 
1.8180544529738844e-05, + "loss": 0.2746, + "step": 7008 + }, + { + "epoch": 2.8503456689711264, + "grad_norm": 1.9585995487209367, + "learning_rate": 1.8179961761903434e-05, + "loss": 0.0278, + "step": 7009 + }, + { + "epoch": 2.850752338348922, + "grad_norm": 13.052089715555798, + "learning_rate": 1.8179378910096594e-05, + "loss": 0.9455, + "step": 7010 + }, + { + "epoch": 2.851159007726718, + "grad_norm": 9.800640907685356, + "learning_rate": 1.8178795974324316e-05, + "loss": 0.8643, + "step": 7011 + }, + { + "epoch": 2.851565677104514, + "grad_norm": 4.753690055266219, + "learning_rate": 1.817821295459258e-05, + "loss": 0.2157, + "step": 7012 + }, + { + "epoch": 2.85197234648231, + "grad_norm": 2.426033860072143, + "learning_rate": 1.8177629850907374e-05, + "loss": 0.1432, + "step": 7013 + }, + { + "epoch": 2.8523790158601057, + "grad_norm": 7.045258504472825, + "learning_rate": 1.817704666327468e-05, + "loss": 0.1701, + "step": 7014 + }, + { + "epoch": 2.8527856852379014, + "grad_norm": 12.800146925327903, + "learning_rate": 1.8176463391700486e-05, + "loss": 0.9158, + "step": 7015 + }, + { + "epoch": 2.8531923546156976, + "grad_norm": 17.882170553796843, + "learning_rate": 1.8175880036190782e-05, + "loss": 0.955, + "step": 7016 + }, + { + "epoch": 2.8535990239934934, + "grad_norm": 14.875101927600383, + "learning_rate": 1.8175296596751555e-05, + "loss": 0.1495, + "step": 7017 + }, + { + "epoch": 2.854005693371289, + "grad_norm": 1.9376902130659996, + "learning_rate": 1.817471307338879e-05, + "loss": 0.042, + "step": 7018 + }, + { + "epoch": 2.854412362749085, + "grad_norm": 8.334329481681328, + "learning_rate": 1.8174129466108484e-05, + "loss": 0.3381, + "step": 7019 + }, + { + "epoch": 2.8548190321268807, + "grad_norm": 2.85471013034377, + "learning_rate": 1.8173545774916628e-05, + "loss": 0.0444, + "step": 7020 + }, + { + "epoch": 2.8552257015046765, + "grad_norm": 7.511058187520727, + "learning_rate": 1.817296199981921e-05, + "loss": 0.2503, + "step": 7021 + }, + { + "epoch": 2.8556323708824727, + "grad_norm": 4.155289058776409, + "learning_rate": 1.8172378140822218e-05, + "loss": 0.1127, + "step": 7022 + }, + { + "epoch": 2.8560390402602684, + "grad_norm": 0.5447545730745381, + "learning_rate": 1.8171794197931655e-05, + "loss": 0.0075, + "step": 7023 + }, + { + "epoch": 2.856445709638064, + "grad_norm": 0.9442566815701182, + "learning_rate": 1.8171210171153514e-05, + "loss": 0.007, + "step": 7024 + }, + { + "epoch": 2.85685237901586, + "grad_norm": 5.870827131061366, + "learning_rate": 1.8170626060493785e-05, + "loss": 0.3246, + "step": 7025 + }, + { + "epoch": 2.857259048393656, + "grad_norm": 9.47178435983115, + "learning_rate": 1.8170041865958468e-05, + "loss": 0.6245, + "step": 7026 + }, + { + "epoch": 2.857665717771452, + "grad_norm": 2.70798504649836, + "learning_rate": 1.8169457587553557e-05, + "loss": 0.0494, + "step": 7027 + }, + { + "epoch": 2.8580723871492477, + "grad_norm": 0.3441743142586396, + "learning_rate": 1.8168873225285056e-05, + "loss": 0.0089, + "step": 7028 + }, + { + "epoch": 2.8584790565270435, + "grad_norm": 0.9526724311421989, + "learning_rate": 1.8168288779158958e-05, + "loss": 0.0148, + "step": 7029 + }, + { + "epoch": 2.8588857259048392, + "grad_norm": 7.631196838754344, + "learning_rate": 1.8167704249181267e-05, + "loss": 0.2758, + "step": 7030 + }, + { + "epoch": 2.859292395282635, + "grad_norm": 4.187213498882356, + "learning_rate": 1.816711963535798e-05, + "loss": 0.3299, + "step": 7031 + }, + { + "epoch": 2.8596990646604312, + "grad_norm": 
8.637462910862736, + "learning_rate": 1.81665349376951e-05, + "loss": 0.2736, + "step": 7032 + }, + { + "epoch": 2.860105734038227, + "grad_norm": 10.26637311096328, + "learning_rate": 1.8165950156198627e-05, + "loss": 0.3206, + "step": 7033 + }, + { + "epoch": 2.8605124034160228, + "grad_norm": 1.4084881318115332, + "learning_rate": 1.8165365290874565e-05, + "loss": 0.175, + "step": 7034 + }, + { + "epoch": 2.8609190727938185, + "grad_norm": 7.844986346040784, + "learning_rate": 1.816478034172892e-05, + "loss": 0.3295, + "step": 7035 + }, + { + "epoch": 2.8613257421716147, + "grad_norm": 22.84110294900949, + "learning_rate": 1.8164195308767695e-05, + "loss": 0.3351, + "step": 7036 + }, + { + "epoch": 2.8617324115494105, + "grad_norm": 10.56075275693853, + "learning_rate": 1.8163610191996898e-05, + "loss": 0.1957, + "step": 7037 + }, + { + "epoch": 2.8621390809272063, + "grad_norm": 12.956631689826333, + "learning_rate": 1.816302499142253e-05, + "loss": 0.8066, + "step": 7038 + }, + { + "epoch": 2.862545750305002, + "grad_norm": 3.7176533207260385, + "learning_rate": 1.816243970705061e-05, + "loss": 0.0711, + "step": 7039 + }, + { + "epoch": 2.862952419682798, + "grad_norm": 1.820515589027728, + "learning_rate": 1.816185433888713e-05, + "loss": 0.0243, + "step": 7040 + }, + { + "epoch": 2.8633590890605936, + "grad_norm": 11.352961843350132, + "learning_rate": 1.8161268886938114e-05, + "loss": 0.7098, + "step": 7041 + }, + { + "epoch": 2.8637657584383893, + "grad_norm": 4.323229037385327, + "learning_rate": 1.816068335120956e-05, + "loss": 0.1093, + "step": 7042 + }, + { + "epoch": 2.8641724278161855, + "grad_norm": 12.6816202999559, + "learning_rate": 1.8160097731707488e-05, + "loss": 0.6642, + "step": 7043 + }, + { + "epoch": 2.8645790971939813, + "grad_norm": 5.757596728436934, + "learning_rate": 1.8159512028437905e-05, + "loss": 0.1769, + "step": 7044 + }, + { + "epoch": 2.864985766571777, + "grad_norm": 3.6003356276697467, + "learning_rate": 1.8158926241406824e-05, + "loss": 0.0684, + "step": 7045 + }, + { + "epoch": 2.8653924359495733, + "grad_norm": 8.359428010309005, + "learning_rate": 1.8158340370620257e-05, + "loss": 0.1857, + "step": 7046 + }, + { + "epoch": 2.865799105327369, + "grad_norm": 6.226941144418232, + "learning_rate": 1.815775441608422e-05, + "loss": 0.1559, + "step": 7047 + }, + { + "epoch": 2.866205774705165, + "grad_norm": 7.597637767695719, + "learning_rate": 1.815716837780473e-05, + "loss": 0.1952, + "step": 7048 + }, + { + "epoch": 2.8666124440829606, + "grad_norm": 3.366715176703923, + "learning_rate": 1.81565822557878e-05, + "loss": 0.0784, + "step": 7049 + }, + { + "epoch": 2.8670191134607563, + "grad_norm": 9.861048820833165, + "learning_rate": 1.815599605003945e-05, + "loss": 0.4896, + "step": 7050 + }, + { + "epoch": 2.867425782838552, + "grad_norm": 4.009731263660564, + "learning_rate": 1.8155409760565693e-05, + "loss": 0.1078, + "step": 7051 + }, + { + "epoch": 2.867832452216348, + "grad_norm": 4.8006970299139144, + "learning_rate": 1.8154823387372552e-05, + "loss": 0.1532, + "step": 7052 + }, + { + "epoch": 2.868239121594144, + "grad_norm": 9.600073594969793, + "learning_rate": 1.8154236930466047e-05, + "loss": 0.3161, + "step": 7053 + }, + { + "epoch": 2.86864579097194, + "grad_norm": 9.224038076913159, + "learning_rate": 1.8153650389852195e-05, + "loss": 0.1544, + "step": 7054 + }, + { + "epoch": 2.8690524603497356, + "grad_norm": 5.709266024815616, + "learning_rate": 1.8153063765537017e-05, + "loss": 0.2491, + "step": 7055 + }, + { + "epoch": 
2.8694591297275314, + "grad_norm": 0.04112605027723804, + "learning_rate": 1.8152477057526534e-05, + "loss": 0.0008, + "step": 7056 + }, + { + "epoch": 2.8698657991053276, + "grad_norm": 4.901865125369795, + "learning_rate": 1.8151890265826773e-05, + "loss": 0.1201, + "step": 7057 + }, + { + "epoch": 2.8702724684831233, + "grad_norm": 2.184723282938029, + "learning_rate": 1.8151303390443755e-05, + "loss": 0.0403, + "step": 7058 + }, + { + "epoch": 2.870679137860919, + "grad_norm": 11.261697132307013, + "learning_rate": 1.8150716431383505e-05, + "loss": 0.6553, + "step": 7059 + }, + { + "epoch": 2.871085807238715, + "grad_norm": 3.2036294318241056, + "learning_rate": 1.8150129388652052e-05, + "loss": 0.0439, + "step": 7060 + }, + { + "epoch": 2.8714924766165106, + "grad_norm": 12.708621755356775, + "learning_rate": 1.814954226225541e-05, + "loss": 0.3006, + "step": 7061 + }, + { + "epoch": 2.8718991459943064, + "grad_norm": 2.084447799918636, + "learning_rate": 1.8148955052199627e-05, + "loss": 0.0431, + "step": 7062 + }, + { + "epoch": 2.8723058153721026, + "grad_norm": 9.680906113136675, + "learning_rate": 1.8148367758490713e-05, + "loss": 0.2492, + "step": 7063 + }, + { + "epoch": 2.8727124847498984, + "grad_norm": 4.625075791970698, + "learning_rate": 1.8147780381134704e-05, + "loss": 0.1127, + "step": 7064 + }, + { + "epoch": 2.873119154127694, + "grad_norm": 7.991780062560415, + "learning_rate": 1.8147192920137626e-05, + "loss": 0.1273, + "step": 7065 + }, + { + "epoch": 2.87352582350549, + "grad_norm": 2.729451193676538, + "learning_rate": 1.8146605375505515e-05, + "loss": 0.0579, + "step": 7066 + }, + { + "epoch": 2.873932492883286, + "grad_norm": 15.76158650843842, + "learning_rate": 1.81460177472444e-05, + "loss": 0.6652, + "step": 7067 + }, + { + "epoch": 2.874339162261082, + "grad_norm": 13.826954284507561, + "learning_rate": 1.814543003536031e-05, + "loss": 0.7146, + "step": 7068 + }, + { + "epoch": 2.8747458316388776, + "grad_norm": 11.867702616361276, + "learning_rate": 1.8144842239859284e-05, + "loss": 0.7104, + "step": 7069 + }, + { + "epoch": 2.8751525010166734, + "grad_norm": 8.069044940977838, + "learning_rate": 1.8144254360747354e-05, + "loss": 0.2543, + "step": 7070 + }, + { + "epoch": 2.875559170394469, + "grad_norm": 11.573560914892761, + "learning_rate": 1.8143666398030554e-05, + "loss": 0.444, + "step": 7071 + }, + { + "epoch": 2.875965839772265, + "grad_norm": 4.712987688447739, + "learning_rate": 1.8143078351714916e-05, + "loss": 0.1708, + "step": 7072 + }, + { + "epoch": 2.876372509150061, + "grad_norm": 1.8306672812713563, + "learning_rate": 1.8142490221806484e-05, + "loss": 0.0411, + "step": 7073 + }, + { + "epoch": 2.876779178527857, + "grad_norm": 5.93780675028229, + "learning_rate": 1.814190200831129e-05, + "loss": 0.126, + "step": 7074 + }, + { + "epoch": 2.8771858479056527, + "grad_norm": 15.706686933984692, + "learning_rate": 1.8141313711235376e-05, + "loss": 0.8545, + "step": 7075 + }, + { + "epoch": 2.8775925172834484, + "grad_norm": 3.0176595101002404, + "learning_rate": 1.8140725330584778e-05, + "loss": 0.0526, + "step": 7076 + }, + { + "epoch": 2.8779991866612447, + "grad_norm": 2.3238647612105883, + "learning_rate": 1.8140136866365537e-05, + "loss": 0.0357, + "step": 7077 + }, + { + "epoch": 2.8784058560390404, + "grad_norm": 5.047261245646813, + "learning_rate": 1.8139548318583694e-05, + "loss": 0.143, + "step": 7078 + }, + { + "epoch": 2.878812525416836, + "grad_norm": 9.28129095271577, + "learning_rate": 1.8138959687245293e-05, + "loss": 0.3477, 
+ "step": 7079 + }, + { + "epoch": 2.879219194794632, + "grad_norm": 7.221643986075301, + "learning_rate": 1.813837097235637e-05, + "loss": 0.1174, + "step": 7080 + }, + { + "epoch": 2.8796258641724277, + "grad_norm": 7.12283404365464, + "learning_rate": 1.8137782173922978e-05, + "loss": 0.2828, + "step": 7081 + }, + { + "epoch": 2.8800325335502235, + "grad_norm": 10.185932915733416, + "learning_rate": 1.8137193291951153e-05, + "loss": 0.4623, + "step": 7082 + }, + { + "epoch": 2.8804392029280192, + "grad_norm": 8.580870100057227, + "learning_rate": 1.8136604326446948e-05, + "loss": 0.2443, + "step": 7083 + }, + { + "epoch": 2.8808458723058155, + "grad_norm": 6.285458773891728, + "learning_rate": 1.81360152774164e-05, + "loss": 0.3106, + "step": 7084 + }, + { + "epoch": 2.8812525416836112, + "grad_norm": 1.2720402902762415, + "learning_rate": 1.813542614486556e-05, + "loss": 0.0227, + "step": 7085 + }, + { + "epoch": 2.881659211061407, + "grad_norm": 9.494025565585936, + "learning_rate": 1.813483692880048e-05, + "loss": 0.4095, + "step": 7086 + }, + { + "epoch": 2.882065880439203, + "grad_norm": 11.026827660850588, + "learning_rate": 1.8134247629227202e-05, + "loss": 0.8185, + "step": 7087 + }, + { + "epoch": 2.882472549816999, + "grad_norm": 4.716290323922159, + "learning_rate": 1.813365824615178e-05, + "loss": 0.2075, + "step": 7088 + }, + { + "epoch": 2.8828792191947947, + "grad_norm": 8.954486250778686, + "learning_rate": 1.8133068779580265e-05, + "loss": 0.5129, + "step": 7089 + }, + { + "epoch": 2.8832858885725905, + "grad_norm": 10.24088204107534, + "learning_rate": 1.8132479229518697e-05, + "loss": 0.3275, + "step": 7090 + }, + { + "epoch": 2.8836925579503863, + "grad_norm": 9.515056942851839, + "learning_rate": 1.8131889595973146e-05, + "loss": 0.3067, + "step": 7091 + }, + { + "epoch": 2.884099227328182, + "grad_norm": 8.359413523340143, + "learning_rate": 1.8131299878949648e-05, + "loss": 0.2143, + "step": 7092 + }, + { + "epoch": 2.884505896705978, + "grad_norm": 10.322548823770951, + "learning_rate": 1.813071007845427e-05, + "loss": 0.5565, + "step": 7093 + }, + { + "epoch": 2.884912566083774, + "grad_norm": 4.534677738009429, + "learning_rate": 1.813012019449306e-05, + "loss": 0.2821, + "step": 7094 + }, + { + "epoch": 2.8853192354615698, + "grad_norm": 3.9845063912474976, + "learning_rate": 1.8129530227072073e-05, + "loss": 0.0456, + "step": 7095 + }, + { + "epoch": 2.8857259048393655, + "grad_norm": 6.099199113445303, + "learning_rate": 1.8128940176197366e-05, + "loss": 0.2563, + "step": 7096 + }, + { + "epoch": 2.8861325742171613, + "grad_norm": 0.6317062300788251, + "learning_rate": 1.8128350041874995e-05, + "loss": 0.0103, + "step": 7097 + }, + { + "epoch": 2.8865392435949575, + "grad_norm": 8.75427596187834, + "learning_rate": 1.8127759824111023e-05, + "loss": 0.3778, + "step": 7098 + }, + { + "epoch": 2.8869459129727533, + "grad_norm": 10.436570680814226, + "learning_rate": 1.8127169522911507e-05, + "loss": 0.4232, + "step": 7099 + }, + { + "epoch": 2.887352582350549, + "grad_norm": 15.58373463630531, + "learning_rate": 1.8126579138282502e-05, + "loss": 1.0273, + "step": 7100 + }, + { + "epoch": 2.887759251728345, + "grad_norm": 9.495223701426546, + "learning_rate": 1.8125988670230076e-05, + "loss": 0.2356, + "step": 7101 + }, + { + "epoch": 2.8881659211061406, + "grad_norm": 3.4106297648307504, + "learning_rate": 1.8125398118760286e-05, + "loss": 0.133, + "step": 7102 + }, + { + "epoch": 2.8885725904839363, + "grad_norm": 3.253968902742257, + "learning_rate": 
1.812480748387919e-05, + "loss": 0.0724, + "step": 7103 + }, + { + "epoch": 2.8889792598617325, + "grad_norm": 1.784972187954403, + "learning_rate": 1.8124216765592863e-05, + "loss": 0.0312, + "step": 7104 + }, + { + "epoch": 2.8893859292395283, + "grad_norm": 2.882330565027155, + "learning_rate": 1.812362596390736e-05, + "loss": 0.0566, + "step": 7105 + }, + { + "epoch": 2.889792598617324, + "grad_norm": 2.8546078515281677, + "learning_rate": 1.8123035078828747e-05, + "loss": 0.0345, + "step": 7106 + }, + { + "epoch": 2.89019926799512, + "grad_norm": 0.7991980592025638, + "learning_rate": 1.812244411036309e-05, + "loss": 0.0114, + "step": 7107 + }, + { + "epoch": 2.890605937372916, + "grad_norm": 13.822976908872013, + "learning_rate": 1.8121853058516453e-05, + "loss": 0.7048, + "step": 7108 + }, + { + "epoch": 2.891012606750712, + "grad_norm": 3.6020250612328075, + "learning_rate": 1.8121261923294914e-05, + "loss": 0.1372, + "step": 7109 + }, + { + "epoch": 2.8914192761285076, + "grad_norm": 5.650842507631702, + "learning_rate": 1.812067070470453e-05, + "loss": 0.1444, + "step": 7110 + }, + { + "epoch": 2.8918259455063033, + "grad_norm": 8.797052998683595, + "learning_rate": 1.8120079402751375e-05, + "loss": 0.2128, + "step": 7111 + }, + { + "epoch": 2.892232614884099, + "grad_norm": 6.942633151317794, + "learning_rate": 1.811948801744152e-05, + "loss": 0.3612, + "step": 7112 + }, + { + "epoch": 2.892639284261895, + "grad_norm": 11.580374024462307, + "learning_rate": 1.8118896548781028e-05, + "loss": 0.6842, + "step": 7113 + }, + { + "epoch": 2.893045953639691, + "grad_norm": 10.901101396090686, + "learning_rate": 1.8118304996775982e-05, + "loss": 0.334, + "step": 7114 + }, + { + "epoch": 2.893452623017487, + "grad_norm": 13.981071119715946, + "learning_rate": 1.8117713361432448e-05, + "loss": 0.9435, + "step": 7115 + }, + { + "epoch": 2.8938592923952826, + "grad_norm": 6.4244055542831395, + "learning_rate": 1.81171216427565e-05, + "loss": 0.127, + "step": 7116 + }, + { + "epoch": 2.8942659617730784, + "grad_norm": 8.338872338054498, + "learning_rate": 1.8116529840754215e-05, + "loss": 0.3793, + "step": 7117 + }, + { + "epoch": 2.8946726311508746, + "grad_norm": 11.645659499709982, + "learning_rate": 1.8115937955431665e-05, + "loss": 0.461, + "step": 7118 + }, + { + "epoch": 2.8950793005286704, + "grad_norm": 9.167519017734065, + "learning_rate": 1.8115345986794924e-05, + "loss": 0.3097, + "step": 7119 + }, + { + "epoch": 2.895485969906466, + "grad_norm": 6.576875266165023, + "learning_rate": 1.8114753934850077e-05, + "loss": 0.2052, + "step": 7120 + }, + { + "epoch": 2.895892639284262, + "grad_norm": 9.061485331661082, + "learning_rate": 1.8114161799603195e-05, + "loss": 0.3425, + "step": 7121 + }, + { + "epoch": 2.8962993086620576, + "grad_norm": 10.5794289072781, + "learning_rate": 1.811356958106036e-05, + "loss": 0.6587, + "step": 7122 + }, + { + "epoch": 2.8967059780398534, + "grad_norm": 8.300140702019966, + "learning_rate": 1.8112977279227645e-05, + "loss": 0.4367, + "step": 7123 + }, + { + "epoch": 2.897112647417649, + "grad_norm": 7.013911036891591, + "learning_rate": 1.811238489411114e-05, + "loss": 0.1838, + "step": 7124 + }, + { + "epoch": 2.8975193167954454, + "grad_norm": 9.575515585994225, + "learning_rate": 1.811179242571692e-05, + "loss": 0.3246, + "step": 7125 + }, + { + "epoch": 2.897925986173241, + "grad_norm": 8.27481727029642, + "learning_rate": 1.8111199874051066e-05, + "loss": 0.2327, + "step": 7126 + }, + { + "epoch": 2.898332655551037, + "grad_norm": 
10.553963889808156, + "learning_rate": 1.8110607239119662e-05, + "loss": 0.4205, + "step": 7127 + }, + { + "epoch": 2.898739324928833, + "grad_norm": 0.1975437110620114, + "learning_rate": 1.81100145209288e-05, + "loss": 0.0024, + "step": 7128 + }, + { + "epoch": 2.899145994306629, + "grad_norm": 4.936276116074863, + "learning_rate": 1.810942171948455e-05, + "loss": 0.0752, + "step": 7129 + }, + { + "epoch": 2.8995526636844247, + "grad_norm": 7.7212989631735836, + "learning_rate": 1.8108828834793008e-05, + "loss": 0.166, + "step": 7130 + }, + { + "epoch": 2.8999593330622204, + "grad_norm": 5.864285266821484, + "learning_rate": 1.8108235866860257e-05, + "loss": 0.1026, + "step": 7131 + }, + { + "epoch": 2.900366002440016, + "grad_norm": 5.763310164972057, + "learning_rate": 1.810764281569238e-05, + "loss": 0.2864, + "step": 7132 + }, + { + "epoch": 2.900772671817812, + "grad_norm": 13.320851897619862, + "learning_rate": 1.8107049681295476e-05, + "loss": 0.3866, + "step": 7133 + }, + { + "epoch": 2.9011793411956077, + "grad_norm": 0.4420217574905801, + "learning_rate": 1.810645646367562e-05, + "loss": 0.007, + "step": 7134 + }, + { + "epoch": 2.901586010573404, + "grad_norm": 15.071721308181333, + "learning_rate": 1.8105863162838914e-05, + "loss": 0.898, + "step": 7135 + }, + { + "epoch": 2.9019926799511997, + "grad_norm": 9.50483438222178, + "learning_rate": 1.810526977879144e-05, + "loss": 0.3882, + "step": 7136 + }, + { + "epoch": 2.9023993493289955, + "grad_norm": 8.170999589211645, + "learning_rate": 1.8104676311539295e-05, + "loss": 0.2133, + "step": 7137 + }, + { + "epoch": 2.9028060187067912, + "grad_norm": 6.209557728033436, + "learning_rate": 1.8104082761088565e-05, + "loss": 0.3077, + "step": 7138 + }, + { + "epoch": 2.9032126880845874, + "grad_norm": 2.4342363076212234, + "learning_rate": 1.810348912744535e-05, + "loss": 0.0426, + "step": 7139 + }, + { + "epoch": 2.903619357462383, + "grad_norm": 8.627536743711822, + "learning_rate": 1.8102895410615737e-05, + "loss": 0.5741, + "step": 7140 + }, + { + "epoch": 2.904026026840179, + "grad_norm": 2.1948418321899927, + "learning_rate": 1.8102301610605825e-05, + "loss": 0.035, + "step": 7141 + }, + { + "epoch": 2.9044326962179747, + "grad_norm": 12.65915194756081, + "learning_rate": 1.8101707727421713e-05, + "loss": 0.5594, + "step": 7142 + }, + { + "epoch": 2.9048393655957705, + "grad_norm": 9.622347160967378, + "learning_rate": 1.8101113761069492e-05, + "loss": 0.2844, + "step": 7143 + }, + { + "epoch": 2.9052460349735663, + "grad_norm": 4.236853537648078, + "learning_rate": 1.810051971155526e-05, + "loss": 0.1563, + "step": 7144 + }, + { + "epoch": 2.9056527043513625, + "grad_norm": 5.0551297936364605, + "learning_rate": 1.8099925578885116e-05, + "loss": 0.1594, + "step": 7145 + }, + { + "epoch": 2.9060593737291582, + "grad_norm": 3.44880077749254, + "learning_rate": 1.809933136306516e-05, + "loss": 0.0838, + "step": 7146 + }, + { + "epoch": 2.906466043106954, + "grad_norm": 2.672985194079714, + "learning_rate": 1.8098737064101493e-05, + "loss": 0.0433, + "step": 7147 + }, + { + "epoch": 2.9068727124847498, + "grad_norm": 0.5464796537511032, + "learning_rate": 1.809814268200021e-05, + "loss": 0.0036, + "step": 7148 + }, + { + "epoch": 2.907279381862546, + "grad_norm": 12.425015172425022, + "learning_rate": 1.809754821676742e-05, + "loss": 0.5852, + "step": 7149 + }, + { + "epoch": 2.9076860512403417, + "grad_norm": 4.942057146252584, + "learning_rate": 1.809695366840922e-05, + "loss": 0.1355, + "step": 7150 + }, + { + "epoch": 
2.9080927206181375, + "grad_norm": 15.069769086938699, + "learning_rate": 1.8096359036931717e-05, + "loss": 1.3991, + "step": 7151 + }, + { + "epoch": 2.9084993899959333, + "grad_norm": 18.10854227241587, + "learning_rate": 1.8095764322341012e-05, + "loss": 0.9517, + "step": 7152 + }, + { + "epoch": 2.908906059373729, + "grad_norm": 0.8356074976816968, + "learning_rate": 1.8095169524643213e-05, + "loss": 0.0142, + "step": 7153 + }, + { + "epoch": 2.909312728751525, + "grad_norm": 4.1267048420705015, + "learning_rate": 1.8094574643844427e-05, + "loss": 0.0724, + "step": 7154 + }, + { + "epoch": 2.909719398129321, + "grad_norm": 3.0171694327949528, + "learning_rate": 1.8093979679950754e-05, + "loss": 0.0459, + "step": 7155 + }, + { + "epoch": 2.9101260675071168, + "grad_norm": 9.134864888254446, + "learning_rate": 1.8093384632968312e-05, + "loss": 0.3271, + "step": 7156 + }, + { + "epoch": 2.9105327368849125, + "grad_norm": 7.398900513022735, + "learning_rate": 1.8092789502903197e-05, + "loss": 0.2982, + "step": 7157 + }, + { + "epoch": 2.9109394062627083, + "grad_norm": 10.153727613384369, + "learning_rate": 1.8092194289761528e-05, + "loss": 0.3122, + "step": 7158 + }, + { + "epoch": 2.9113460756405045, + "grad_norm": 6.201439815405887, + "learning_rate": 1.809159899354941e-05, + "loss": 0.1087, + "step": 7159 + }, + { + "epoch": 2.9117527450183003, + "grad_norm": 9.705139318109364, + "learning_rate": 1.8091003614272957e-05, + "loss": 0.3424, + "step": 7160 + }, + { + "epoch": 2.912159414396096, + "grad_norm": 0.4366307386706852, + "learning_rate": 1.809040815193828e-05, + "loss": 0.0085, + "step": 7161 + }, + { + "epoch": 2.912566083773892, + "grad_norm": 36.49472744832033, + "learning_rate": 1.8089812606551495e-05, + "loss": 1.7615, + "step": 7162 + }, + { + "epoch": 2.9129727531516876, + "grad_norm": 8.783550296731269, + "learning_rate": 1.8089216978118706e-05, + "loss": 0.6009, + "step": 7163 + }, + { + "epoch": 2.9133794225294833, + "grad_norm": 12.16533641600292, + "learning_rate": 1.8088621266646035e-05, + "loss": 0.8294, + "step": 7164 + }, + { + "epoch": 2.913786091907279, + "grad_norm": 8.19713598280732, + "learning_rate": 1.8088025472139596e-05, + "loss": 0.1843, + "step": 7165 + }, + { + "epoch": 2.9141927612850753, + "grad_norm": 8.792307307445396, + "learning_rate": 1.8087429594605506e-05, + "loss": 0.2185, + "step": 7166 + }, + { + "epoch": 2.914599430662871, + "grad_norm": 9.189054536320906, + "learning_rate": 1.8086833634049882e-05, + "loss": 0.4916, + "step": 7167 + }, + { + "epoch": 2.915006100040667, + "grad_norm": 4.492418240077644, + "learning_rate": 1.8086237590478835e-05, + "loss": 0.104, + "step": 7168 + }, + { + "epoch": 2.915412769418463, + "grad_norm": 18.23643500566493, + "learning_rate": 1.8085641463898496e-05, + "loss": 0.6032, + "step": 7169 + }, + { + "epoch": 2.915819438796259, + "grad_norm": 2.549364587719034, + "learning_rate": 1.8085045254314978e-05, + "loss": 0.0928, + "step": 7170 + }, + { + "epoch": 2.9162261081740546, + "grad_norm": 12.419105544349838, + "learning_rate": 1.80844489617344e-05, + "loss": 0.571, + "step": 7171 + }, + { + "epoch": 2.9166327775518504, + "grad_norm": 4.868077386889878, + "learning_rate": 1.808385258616288e-05, + "loss": 0.1632, + "step": 7172 + }, + { + "epoch": 2.917039446929646, + "grad_norm": 10.691663039949994, + "learning_rate": 1.8083256127606547e-05, + "loss": 0.8229, + "step": 7173 + }, + { + "epoch": 2.917446116307442, + "grad_norm": 5.904129239594515, + "learning_rate": 1.8082659586071525e-05, + "loss": 0.2395, + 
"step": 7174 + }, + { + "epoch": 2.9178527856852376, + "grad_norm": 6.753908790225205, + "learning_rate": 1.808206296156393e-05, + "loss": 0.2105, + "step": 7175 + }, + { + "epoch": 2.918259455063034, + "grad_norm": 11.058310094026488, + "learning_rate": 1.8081466254089894e-05, + "loss": 0.7333, + "step": 7176 + }, + { + "epoch": 2.9186661244408296, + "grad_norm": 5.888136067199823, + "learning_rate": 1.808086946365554e-05, + "loss": 0.1609, + "step": 7177 + }, + { + "epoch": 2.9190727938186254, + "grad_norm": 4.945002885468374, + "learning_rate": 1.8080272590266992e-05, + "loss": 0.1228, + "step": 7178 + }, + { + "epoch": 2.919479463196421, + "grad_norm": 13.495018972007292, + "learning_rate": 1.8079675633930382e-05, + "loss": 0.9931, + "step": 7179 + }, + { + "epoch": 2.9198861325742174, + "grad_norm": 7.156250829243066, + "learning_rate": 1.807907859465183e-05, + "loss": 0.3386, + "step": 7180 + }, + { + "epoch": 2.920292801952013, + "grad_norm": 11.415121419637098, + "learning_rate": 1.8078481472437476e-05, + "loss": 0.735, + "step": 7181 + }, + { + "epoch": 2.920699471329809, + "grad_norm": 1.0297157308941174, + "learning_rate": 1.807788426729344e-05, + "loss": 0.0154, + "step": 7182 + }, + { + "epoch": 2.9211061407076047, + "grad_norm": 14.502128178731246, + "learning_rate": 1.8077286979225858e-05, + "loss": 1.075, + "step": 7183 + }, + { + "epoch": 2.9215128100854004, + "grad_norm": 10.273165792373241, + "learning_rate": 1.8076689608240857e-05, + "loss": 0.5085, + "step": 7184 + }, + { + "epoch": 2.921919479463196, + "grad_norm": 12.402526707829754, + "learning_rate": 1.8076092154344578e-05, + "loss": 0.5536, + "step": 7185 + }, + { + "epoch": 2.9223261488409924, + "grad_norm": 5.929668002809604, + "learning_rate": 1.8075494617543143e-05, + "loss": 0.1373, + "step": 7186 + }, + { + "epoch": 2.922732818218788, + "grad_norm": 1.3279904483756872, + "learning_rate": 1.8074896997842696e-05, + "loss": 0.0254, + "step": 7187 + }, + { + "epoch": 2.923139487596584, + "grad_norm": 13.419809091954889, + "learning_rate": 1.807429929524936e-05, + "loss": 0.6157, + "step": 7188 + }, + { + "epoch": 2.9235461569743797, + "grad_norm": 9.164432319388885, + "learning_rate": 1.8073701509769285e-05, + "loss": 0.6416, + "step": 7189 + }, + { + "epoch": 2.923952826352176, + "grad_norm": 0.3209042939815448, + "learning_rate": 1.80731036414086e-05, + "loss": 0.0047, + "step": 7190 + }, + { + "epoch": 2.9243594957299717, + "grad_norm": 16.006959749443137, + "learning_rate": 1.8072505690173444e-05, + "loss": 0.9237, + "step": 7191 + }, + { + "epoch": 2.9247661651077674, + "grad_norm": 9.151605244896944, + "learning_rate": 1.807190765606995e-05, + "loss": 0.5454, + "step": 7192 + }, + { + "epoch": 2.925172834485563, + "grad_norm": 8.44523773614607, + "learning_rate": 1.8071309539104262e-05, + "loss": 0.4295, + "step": 7193 + }, + { + "epoch": 2.925579503863359, + "grad_norm": 8.670963952611771, + "learning_rate": 1.8070711339282523e-05, + "loss": 0.4557, + "step": 7194 + }, + { + "epoch": 2.9259861732411547, + "grad_norm": 2.6365921955511165, + "learning_rate": 1.8070113056610865e-05, + "loss": 0.0589, + "step": 7195 + }, + { + "epoch": 2.926392842618951, + "grad_norm": 4.916136849821289, + "learning_rate": 1.806951469109544e-05, + "loss": 0.0998, + "step": 7196 + }, + { + "epoch": 2.9267995119967467, + "grad_norm": 6.927515373184368, + "learning_rate": 1.8068916242742383e-05, + "loss": 0.1834, + "step": 7197 + }, + { + "epoch": 2.9272061813745425, + "grad_norm": 5.17505928733065, + "learning_rate": 
1.8068317711557838e-05, + "loss": 0.1053, + "step": 7198 + }, + { + "epoch": 2.9276128507523382, + "grad_norm": 11.646746922754213, + "learning_rate": 1.8067719097547955e-05, + "loss": 0.5595, + "step": 7199 + }, + { + "epoch": 2.9280195201301344, + "grad_norm": 10.04774098939559, + "learning_rate": 1.8067120400718873e-05, + "loss": 0.5709, + "step": 7200 + }, + { + "epoch": 2.92842618950793, + "grad_norm": 2.391109008675213, + "learning_rate": 1.806652162107674e-05, + "loss": 0.0359, + "step": 7201 + }, + { + "epoch": 2.928832858885726, + "grad_norm": 11.284719910641783, + "learning_rate": 1.8065922758627705e-05, + "loss": 0.5331, + "step": 7202 + }, + { + "epoch": 2.9292395282635217, + "grad_norm": 19.901803932693188, + "learning_rate": 1.8065323813377907e-05, + "loss": 0.6077, + "step": 7203 + }, + { + "epoch": 2.9296461976413175, + "grad_norm": 10.768567166814599, + "learning_rate": 1.8064724785333508e-05, + "loss": 0.4495, + "step": 7204 + }, + { + "epoch": 2.9300528670191133, + "grad_norm": 2.226614795298646, + "learning_rate": 1.8064125674500644e-05, + "loss": 0.0304, + "step": 7205 + }, + { + "epoch": 2.930459536396909, + "grad_norm": 2.571504181288228, + "learning_rate": 1.8063526480885475e-05, + "loss": 0.0407, + "step": 7206 + }, + { + "epoch": 2.9308662057747052, + "grad_norm": 15.858336138642553, + "learning_rate": 1.806292720449415e-05, + "loss": 0.6251, + "step": 7207 + }, + { + "epoch": 2.931272875152501, + "grad_norm": 8.719292837036782, + "learning_rate": 1.8062327845332815e-05, + "loss": 0.1819, + "step": 7208 + }, + { + "epoch": 2.9316795445302968, + "grad_norm": 13.3736966725848, + "learning_rate": 1.806172840340763e-05, + "loss": 0.6001, + "step": 7209 + }, + { + "epoch": 2.932086213908093, + "grad_norm": 0.6802013979596825, + "learning_rate": 1.8061128878724745e-05, + "loss": 0.0119, + "step": 7210 + }, + { + "epoch": 2.9324928832858888, + "grad_norm": 7.601733804207967, + "learning_rate": 1.8060529271290315e-05, + "loss": 0.2668, + "step": 7211 + }, + { + "epoch": 2.9328995526636845, + "grad_norm": 8.312957970540136, + "learning_rate": 1.8059929581110496e-05, + "loss": 0.2371, + "step": 7212 + }, + { + "epoch": 2.9333062220414803, + "grad_norm": 9.286448022813305, + "learning_rate": 1.8059329808191443e-05, + "loss": 0.3846, + "step": 7213 + }, + { + "epoch": 2.933712891419276, + "grad_norm": 8.320125743623636, + "learning_rate": 1.805872995253931e-05, + "loss": 0.2669, + "step": 7214 + }, + { + "epoch": 2.934119560797072, + "grad_norm": 4.137404323055359, + "learning_rate": 1.805813001416026e-05, + "loss": 0.1691, + "step": 7215 + }, + { + "epoch": 2.9345262301748676, + "grad_norm": 0.30311854572158514, + "learning_rate": 1.8057529993060452e-05, + "loss": 0.0044, + "step": 7216 + }, + { + "epoch": 2.934932899552664, + "grad_norm": 0.9722997849043923, + "learning_rate": 1.8056929889246043e-05, + "loss": 0.0215, + "step": 7217 + }, + { + "epoch": 2.9353395689304596, + "grad_norm": 12.674222311014576, + "learning_rate": 1.8056329702723193e-05, + "loss": 0.7481, + "step": 7218 + }, + { + "epoch": 2.9357462383082553, + "grad_norm": 7.285016410792033, + "learning_rate": 1.8055729433498062e-05, + "loss": 0.2346, + "step": 7219 + }, + { + "epoch": 2.936152907686051, + "grad_norm": 4.34840612221664, + "learning_rate": 1.8055129081576813e-05, + "loss": 0.1754, + "step": 7220 + }, + { + "epoch": 2.9365595770638473, + "grad_norm": 5.040100212683389, + "learning_rate": 1.8054528646965613e-05, + "loss": 0.0703, + "step": 7221 + }, + { + "epoch": 2.936966246441643, + "grad_norm": 
6.404354528132973, + "learning_rate": 1.8053928129670624e-05, + "loss": 0.1437, + "step": 7222 + }, + { + "epoch": 2.937372915819439, + "grad_norm": 2.8960776520760625, + "learning_rate": 1.805332752969801e-05, + "loss": 0.0835, + "step": 7223 + }, + { + "epoch": 2.9377795851972346, + "grad_norm": 5.6471977795263815, + "learning_rate": 1.805272684705393e-05, + "loss": 0.2476, + "step": 7224 + }, + { + "epoch": 2.9381862545750304, + "grad_norm": 12.98339001245358, + "learning_rate": 1.8052126081744562e-05, + "loss": 0.5083, + "step": 7225 + }, + { + "epoch": 2.938592923952826, + "grad_norm": 13.977932824101655, + "learning_rate": 1.8051525233776065e-05, + "loss": 0.9842, + "step": 7226 + }, + { + "epoch": 2.9389995933306223, + "grad_norm": 10.238702436273968, + "learning_rate": 1.8050924303154606e-05, + "loss": 0.3323, + "step": 7227 + }, + { + "epoch": 2.939406262708418, + "grad_norm": 13.983704524643363, + "learning_rate": 1.8050323289886362e-05, + "loss": 0.5017, + "step": 7228 + }, + { + "epoch": 2.939812932086214, + "grad_norm": 6.949662731385659, + "learning_rate": 1.8049722193977495e-05, + "loss": 0.4096, + "step": 7229 + }, + { + "epoch": 2.9402196014640096, + "grad_norm": 7.761793034524094, + "learning_rate": 1.8049121015434177e-05, + "loss": 0.226, + "step": 7230 + }, + { + "epoch": 2.940626270841806, + "grad_norm": 6.894957301440695, + "learning_rate": 1.8048519754262582e-05, + "loss": 0.2601, + "step": 7231 + }, + { + "epoch": 2.9410329402196016, + "grad_norm": 9.378263532219314, + "learning_rate": 1.8047918410468886e-05, + "loss": 0.2845, + "step": 7232 + }, + { + "epoch": 2.9414396095973974, + "grad_norm": 0.08635945371706107, + "learning_rate": 1.8047316984059252e-05, + "loss": 0.0013, + "step": 7233 + }, + { + "epoch": 2.941846278975193, + "grad_norm": 4.194956013920631, + "learning_rate": 1.8046715475039857e-05, + "loss": 0.1118, + "step": 7234 + }, + { + "epoch": 2.942252948352989, + "grad_norm": 5.100393213167046, + "learning_rate": 1.8046113883416884e-05, + "loss": 0.1023, + "step": 7235 + }, + { + "epoch": 2.9426596177307847, + "grad_norm": 6.347034541201617, + "learning_rate": 1.80455122091965e-05, + "loss": 0.1856, + "step": 7236 + }, + { + "epoch": 2.943066287108581, + "grad_norm": 10.639844671665719, + "learning_rate": 1.804491045238488e-05, + "loss": 0.2232, + "step": 7237 + }, + { + "epoch": 2.9434729564863766, + "grad_norm": 0.3377792436889046, + "learning_rate": 1.8044308612988208e-05, + "loss": 0.0038, + "step": 7238 + }, + { + "epoch": 2.9438796258641724, + "grad_norm": 9.694485882869207, + "learning_rate": 1.804370669101266e-05, + "loss": 0.4752, + "step": 7239 + }, + { + "epoch": 2.944286295241968, + "grad_norm": 5.413464244886046, + "learning_rate": 1.8043104686464416e-05, + "loss": 0.1305, + "step": 7240 + }, + { + "epoch": 2.9446929646197644, + "grad_norm": 6.436108366562378, + "learning_rate": 1.8042502599349652e-05, + "loss": 0.4212, + "step": 7241 + }, + { + "epoch": 2.94509963399756, + "grad_norm": 11.33273107625532, + "learning_rate": 1.8041900429674553e-05, + "loss": 0.3752, + "step": 7242 + }, + { + "epoch": 2.945506303375356, + "grad_norm": 0.23700277357315255, + "learning_rate": 1.8041298177445295e-05, + "loss": 0.004, + "step": 7243 + }, + { + "epoch": 2.9459129727531517, + "grad_norm": 12.99756581837103, + "learning_rate": 1.804069584266807e-05, + "loss": 0.5344, + "step": 7244 + }, + { + "epoch": 2.9463196421309474, + "grad_norm": 12.097969391814472, + "learning_rate": 1.804009342534905e-05, + "loss": 0.4467, + "step": 7245 + }, + { + "epoch": 
2.946726311508743, + "grad_norm": 8.047245767699232, + "learning_rate": 1.8039490925494427e-05, + "loss": 0.2542, + "step": 7246 + }, + { + "epoch": 2.947132980886539, + "grad_norm": 9.915586559577005, + "learning_rate": 1.8038888343110382e-05, + "loss": 0.3117, + "step": 7247 + }, + { + "epoch": 2.947539650264335, + "grad_norm": 6.952276429689937, + "learning_rate": 1.80382856782031e-05, + "loss": 0.2695, + "step": 7248 + }, + { + "epoch": 2.947946319642131, + "grad_norm": 14.805959334887865, + "learning_rate": 1.8037682930778775e-05, + "loss": 0.5651, + "step": 7249 + }, + { + "epoch": 2.9483529890199267, + "grad_norm": 2.2594507373263197, + "learning_rate": 1.8037080100843588e-05, + "loss": 0.03, + "step": 7250 + }, + { + "epoch": 2.948759658397723, + "grad_norm": 9.602523722344678, + "learning_rate": 1.803647718840373e-05, + "loss": 0.2147, + "step": 7251 + }, + { + "epoch": 2.9491663277755187, + "grad_norm": 1.399011892561067, + "learning_rate": 1.8035874193465388e-05, + "loss": 0.0228, + "step": 7252 + }, + { + "epoch": 2.9495729971533144, + "grad_norm": 0.08097694166189828, + "learning_rate": 1.8035271116034753e-05, + "loss": 0.0018, + "step": 7253 + }, + { + "epoch": 2.94997966653111, + "grad_norm": 10.68676743912511, + "learning_rate": 1.8034667956118013e-05, + "loss": 0.2961, + "step": 7254 + }, + { + "epoch": 2.950386335908906, + "grad_norm": 13.83399177172995, + "learning_rate": 1.8034064713721368e-05, + "loss": 0.3351, + "step": 7255 + }, + { + "epoch": 2.9507930052867017, + "grad_norm": 0.6066447178539799, + "learning_rate": 1.8033461388851e-05, + "loss": 0.0101, + "step": 7256 + }, + { + "epoch": 2.9511996746644975, + "grad_norm": 9.512249760974564, + "learning_rate": 1.803285798151311e-05, + "loss": 0.3, + "step": 7257 + }, + { + "epoch": 2.9516063440422937, + "grad_norm": 6.566372554638419, + "learning_rate": 1.803225449171389e-05, + "loss": 0.1677, + "step": 7258 + }, + { + "epoch": 2.9520130134200895, + "grad_norm": 18.435102695659374, + "learning_rate": 1.8031650919459535e-05, + "loss": 0.4395, + "step": 7259 + }, + { + "epoch": 2.9524196827978852, + "grad_norm": 8.191951761627331, + "learning_rate": 1.803104726475624e-05, + "loss": 0.2996, + "step": 7260 + }, + { + "epoch": 2.952826352175681, + "grad_norm": 2.6439748623618957, + "learning_rate": 1.8030443527610203e-05, + "loss": 0.0976, + "step": 7261 + }, + { + "epoch": 2.9532330215534772, + "grad_norm": 8.618394653866398, + "learning_rate": 1.8029839708027625e-05, + "loss": 0.2541, + "step": 7262 + }, + { + "epoch": 2.953639690931273, + "grad_norm": 12.382293962277478, + "learning_rate": 1.80292358060147e-05, + "loss": 0.0259, + "step": 7263 + }, + { + "epoch": 2.9540463603090688, + "grad_norm": 12.93517696776738, + "learning_rate": 1.8028631821577625e-05, + "loss": 0.5761, + "step": 7264 + }, + { + "epoch": 2.9544530296868645, + "grad_norm": 9.587774423468474, + "learning_rate": 1.8028027754722607e-05, + "loss": 0.2539, + "step": 7265 + }, + { + "epoch": 2.9548596990646603, + "grad_norm": 2.805841411078095, + "learning_rate": 1.802742360545584e-05, + "loss": 0.0643, + "step": 7266 + }, + { + "epoch": 2.955266368442456, + "grad_norm": 2.7869339726843845, + "learning_rate": 1.8026819373783534e-05, + "loss": 0.0519, + "step": 7267 + }, + { + "epoch": 2.9556730378202523, + "grad_norm": 3.174354106967893, + "learning_rate": 1.8026215059711884e-05, + "loss": 0.0652, + "step": 7268 + }, + { + "epoch": 2.956079707198048, + "grad_norm": 3.8606419005908017, + "learning_rate": 1.8025610663247096e-05, + "loss": 0.2079, + 
"step": 7269 + }, + { + "epoch": 2.956486376575844, + "grad_norm": 11.764419682633777, + "learning_rate": 1.802500618439538e-05, + "loss": 0.7176, + "step": 7270 + }, + { + "epoch": 2.9568930459536396, + "grad_norm": 9.421132026930792, + "learning_rate": 1.802440162316293e-05, + "loss": 0.2224, + "step": 7271 + }, + { + "epoch": 2.9572997153314358, + "grad_norm": 9.306827556122762, + "learning_rate": 1.8023796979555966e-05, + "loss": 0.5137, + "step": 7272 + }, + { + "epoch": 2.9577063847092315, + "grad_norm": 5.998282097216688, + "learning_rate": 1.8023192253580687e-05, + "loss": 0.1381, + "step": 7273 + }, + { + "epoch": 2.9581130540870273, + "grad_norm": 8.721055652289493, + "learning_rate": 1.8022587445243296e-05, + "loss": 0.8526, + "step": 7274 + }, + { + "epoch": 2.958519723464823, + "grad_norm": 7.968651872816975, + "learning_rate": 1.802198255455001e-05, + "loss": 0.3203, + "step": 7275 + }, + { + "epoch": 2.958926392842619, + "grad_norm": 14.654046719757371, + "learning_rate": 1.8021377581507038e-05, + "loss": 1.0536, + "step": 7276 + }, + { + "epoch": 2.9593330622204146, + "grad_norm": 7.199869606146152, + "learning_rate": 1.8020772526120585e-05, + "loss": 0.2796, + "step": 7277 + }, + { + "epoch": 2.959739731598211, + "grad_norm": 12.741422368400716, + "learning_rate": 1.8020167388396865e-05, + "loss": 0.5155, + "step": 7278 + }, + { + "epoch": 2.9601464009760066, + "grad_norm": 9.446717763835023, + "learning_rate": 1.801956216834209e-05, + "loss": 0.331, + "step": 7279 + }, + { + "epoch": 2.9605530703538023, + "grad_norm": 2.3751581693583046, + "learning_rate": 1.8018956865962475e-05, + "loss": 0.0599, + "step": 7280 + }, + { + "epoch": 2.960959739731598, + "grad_norm": 5.956314352577559, + "learning_rate": 1.8018351481264235e-05, + "loss": 0.1824, + "step": 7281 + }, + { + "epoch": 2.9613664091093943, + "grad_norm": 18.27507064074344, + "learning_rate": 1.8017746014253577e-05, + "loss": 0.8883, + "step": 7282 + }, + { + "epoch": 2.96177307848719, + "grad_norm": 5.4880149577740935, + "learning_rate": 1.8017140464936724e-05, + "loss": 0.1614, + "step": 7283 + }, + { + "epoch": 2.962179747864986, + "grad_norm": 5.076843566332377, + "learning_rate": 1.8016534833319886e-05, + "loss": 0.1033, + "step": 7284 + }, + { + "epoch": 2.9625864172427816, + "grad_norm": 6.31258640050921, + "learning_rate": 1.8015929119409285e-05, + "loss": 0.2568, + "step": 7285 + }, + { + "epoch": 2.9629930866205774, + "grad_norm": 8.117825139994912, + "learning_rate": 1.8015323323211133e-05, + "loss": 0.2041, + "step": 7286 + }, + { + "epoch": 2.963399755998373, + "grad_norm": 4.3664675520445995, + "learning_rate": 1.8014717444731656e-05, + "loss": 0.0688, + "step": 7287 + }, + { + "epoch": 2.9638064253761693, + "grad_norm": 9.353338446604825, + "learning_rate": 1.8014111483977073e-05, + "loss": 0.1244, + "step": 7288 + }, + { + "epoch": 2.964213094753965, + "grad_norm": 7.795290568859452, + "learning_rate": 1.8013505440953602e-05, + "loss": 0.2249, + "step": 7289 + }, + { + "epoch": 2.964619764131761, + "grad_norm": 11.036563122451978, + "learning_rate": 1.8012899315667464e-05, + "loss": 0.663, + "step": 7290 + }, + { + "epoch": 2.9650264335095566, + "grad_norm": 22.973241028480377, + "learning_rate": 1.8012293108124882e-05, + "loss": 0.4459, + "step": 7291 + }, + { + "epoch": 2.965433102887353, + "grad_norm": 11.020341194280912, + "learning_rate": 1.8011686818332073e-05, + "loss": 0.6012, + "step": 7292 + }, + { + "epoch": 2.9658397722651486, + "grad_norm": 11.286895072144056, + "learning_rate": 
1.8011080446295276e-05, + "loss": 0.4021, + "step": 7293 + }, + { + "epoch": 2.9662464416429444, + "grad_norm": 8.348054346205576, + "learning_rate": 1.8010473992020698e-05, + "loss": 0.3898, + "step": 7294 + }, + { + "epoch": 2.96665311102074, + "grad_norm": 9.110451516200014, + "learning_rate": 1.8009867455514578e-05, + "loss": 0.0769, + "step": 7295 + }, + { + "epoch": 2.967059780398536, + "grad_norm": 8.983086974978615, + "learning_rate": 1.8009260836783135e-05, + "loss": 0.2547, + "step": 7296 + }, + { + "epoch": 2.9674664497763317, + "grad_norm": 2.7521167405969567, + "learning_rate": 1.8008654135832602e-05, + "loss": 0.0567, + "step": 7297 + }, + { + "epoch": 2.9678731191541274, + "grad_norm": 7.028507812413822, + "learning_rate": 1.8008047352669198e-05, + "loss": 0.2202, + "step": 7298 + }, + { + "epoch": 2.9682797885319236, + "grad_norm": 15.550473788685206, + "learning_rate": 1.800744048729916e-05, + "loss": 0.7155, + "step": 7299 + }, + { + "epoch": 2.9686864579097194, + "grad_norm": 10.69830057876022, + "learning_rate": 1.800683353972872e-05, + "loss": 0.5898, + "step": 7300 + }, + { + "epoch": 2.969093127287515, + "grad_norm": 0.22307116461905518, + "learning_rate": 1.8006226509964095e-05, + "loss": 0.0031, + "step": 7301 + }, + { + "epoch": 2.969499796665311, + "grad_norm": 8.245529409724057, + "learning_rate": 1.800561939801153e-05, + "loss": 0.3738, + "step": 7302 + }, + { + "epoch": 2.969906466043107, + "grad_norm": 5.849180351741808, + "learning_rate": 1.8005012203877257e-05, + "loss": 0.2085, + "step": 7303 + }, + { + "epoch": 2.970313135420903, + "grad_norm": 0.7440809318295538, + "learning_rate": 1.80044049275675e-05, + "loss": 0.0146, + "step": 7304 + }, + { + "epoch": 2.9707198047986987, + "grad_norm": 1.6139885470098465, + "learning_rate": 1.8003797569088498e-05, + "loss": 0.0312, + "step": 7305 + }, + { + "epoch": 2.9711264741764944, + "grad_norm": 8.241140031243974, + "learning_rate": 1.8003190128446485e-05, + "loss": 0.8376, + "step": 7306 + }, + { + "epoch": 2.97153314355429, + "grad_norm": 13.316050824071237, + "learning_rate": 1.80025826056477e-05, + "loss": 0.7359, + "step": 7307 + }, + { + "epoch": 2.971939812932086, + "grad_norm": 7.6685745792099596, + "learning_rate": 1.8001975000698375e-05, + "loss": 0.2443, + "step": 7308 + }, + { + "epoch": 2.972346482309882, + "grad_norm": 10.206244588355808, + "learning_rate": 1.8001367313604753e-05, + "loss": 0.3571, + "step": 7309 + }, + { + "epoch": 2.972753151687678, + "grad_norm": 4.84886437075153, + "learning_rate": 1.8000759544373062e-05, + "loss": 0.163, + "step": 7310 + }, + { + "epoch": 2.9731598210654737, + "grad_norm": 7.9314751026787755, + "learning_rate": 1.8000151693009554e-05, + "loss": 0.2629, + "step": 7311 + }, + { + "epoch": 2.9735664904432695, + "grad_norm": 11.042367223192246, + "learning_rate": 1.7999543759520458e-05, + "loss": 0.8229, + "step": 7312 + }, + { + "epoch": 2.9739731598210657, + "grad_norm": 9.493722123288535, + "learning_rate": 1.7998935743912023e-05, + "loss": 0.3689, + "step": 7313 + }, + { + "epoch": 2.9743798291988615, + "grad_norm": 9.484884744421045, + "learning_rate": 1.7998327646190482e-05, + "loss": 0.3789, + "step": 7314 + }, + { + "epoch": 2.9747864985766572, + "grad_norm": 1.6578313708607681, + "learning_rate": 1.7997719466362084e-05, + "loss": 0.0243, + "step": 7315 + }, + { + "epoch": 2.975193167954453, + "grad_norm": 7.431501716259836, + "learning_rate": 1.7997111204433073e-05, + "loss": 0.2298, + "step": 7316 + }, + { + "epoch": 2.9755998373322488, + "grad_norm": 
9.727311757791716, + "learning_rate": 1.799650286040969e-05, + "loss": 0.3147, + "step": 7317 + }, + { + "epoch": 2.9760065067100445, + "grad_norm": 2.1583763076032945, + "learning_rate": 1.7995894434298182e-05, + "loss": 0.0371, + "step": 7318 + }, + { + "epoch": 2.9764131760878407, + "grad_norm": 7.5442099146903265, + "learning_rate": 1.799528592610479e-05, + "loss": 0.3875, + "step": 7319 + }, + { + "epoch": 2.9768198454656365, + "grad_norm": 12.207736440354738, + "learning_rate": 1.7994677335835763e-05, + "loss": 0.6274, + "step": 7320 + }, + { + "epoch": 2.9772265148434323, + "grad_norm": 13.886247382366138, + "learning_rate": 1.7994068663497354e-05, + "loss": 0.4382, + "step": 7321 + }, + { + "epoch": 2.977633184221228, + "grad_norm": 8.184911164033124, + "learning_rate": 1.7993459909095804e-05, + "loss": 0.2318, + "step": 7322 + }, + { + "epoch": 2.9780398535990242, + "grad_norm": 5.449645607561733, + "learning_rate": 1.7992851072637366e-05, + "loss": 0.4068, + "step": 7323 + }, + { + "epoch": 2.97844652297682, + "grad_norm": 0.5638636260180304, + "learning_rate": 1.799224215412829e-05, + "loss": 0.0104, + "step": 7324 + }, + { + "epoch": 2.9788531923546158, + "grad_norm": 5.589935894109937, + "learning_rate": 1.799163315357482e-05, + "loss": 0.2659, + "step": 7325 + }, + { + "epoch": 2.9792598617324115, + "grad_norm": 4.399606041028569, + "learning_rate": 1.7991024070983218e-05, + "loss": 0.1367, + "step": 7326 + }, + { + "epoch": 2.9796665311102073, + "grad_norm": 11.624529721857462, + "learning_rate": 1.7990414906359733e-05, + "loss": 0.6261, + "step": 7327 + }, + { + "epoch": 2.980073200488003, + "grad_norm": 8.062737426986036, + "learning_rate": 1.7989805659710614e-05, + "loss": 0.0569, + "step": 7328 + }, + { + "epoch": 2.9804798698657993, + "grad_norm": 5.39652450867727, + "learning_rate": 1.7989196331042125e-05, + "loss": 0.1859, + "step": 7329 + }, + { + "epoch": 2.980886539243595, + "grad_norm": 9.635151247602165, + "learning_rate": 1.798858692036051e-05, + "loss": 0.3241, + "step": 7330 + }, + { + "epoch": 2.981293208621391, + "grad_norm": 10.29894035901896, + "learning_rate": 1.798797742767203e-05, + "loss": 0.5298, + "step": 7331 + }, + { + "epoch": 2.9816998779991866, + "grad_norm": 2.5271464405576487, + "learning_rate": 1.7987367852982943e-05, + "loss": 0.0793, + "step": 7332 + }, + { + "epoch": 2.9821065473769828, + "grad_norm": 2.084297125030908, + "learning_rate": 1.7986758196299506e-05, + "loss": 0.0429, + "step": 7333 + }, + { + "epoch": 2.9825132167547785, + "grad_norm": 8.357085738773824, + "learning_rate": 1.7986148457627973e-05, + "loss": 0.2436, + "step": 7334 + }, + { + "epoch": 2.9829198861325743, + "grad_norm": 13.973360578052695, + "learning_rate": 1.7985538636974607e-05, + "loss": 0.8752, + "step": 7335 + }, + { + "epoch": 2.98332655551037, + "grad_norm": 1.6020849432670274, + "learning_rate": 1.798492873434567e-05, + "loss": 0.0105, + "step": 7336 + }, + { + "epoch": 2.983733224888166, + "grad_norm": 13.705224299978127, + "learning_rate": 1.798431874974742e-05, + "loss": 0.6802, + "step": 7337 + }, + { + "epoch": 2.9841398942659616, + "grad_norm": 4.425607807080641, + "learning_rate": 1.798370868318612e-05, + "loss": 0.1213, + "step": 7338 + }, + { + "epoch": 2.9845465636437574, + "grad_norm": 5.166970651106219, + "learning_rate": 1.7983098534668032e-05, + "loss": 0.1736, + "step": 7339 + }, + { + "epoch": 2.9849532330215536, + "grad_norm": 5.134364595116251, + "learning_rate": 1.7982488304199418e-05, + "loss": 0.1767, + "step": 7340 + }, + { + 
"epoch": 2.9853599023993493, + "grad_norm": 5.986554105959887, + "learning_rate": 1.7981877991786546e-05, + "loss": 0.167, + "step": 7341 + }, + { + "epoch": 2.985766571777145, + "grad_norm": 5.745481896978231, + "learning_rate": 1.7981267597435682e-05, + "loss": 0.1109, + "step": 7342 + }, + { + "epoch": 2.986173241154941, + "grad_norm": 8.69646288466723, + "learning_rate": 1.7980657121153085e-05, + "loss": 0.4572, + "step": 7343 + }, + { + "epoch": 2.986579910532737, + "grad_norm": 2.644737702581125, + "learning_rate": 1.7980046562945025e-05, + "loss": 0.0511, + "step": 7344 + }, + { + "epoch": 2.986986579910533, + "grad_norm": 0.18301288137342037, + "learning_rate": 1.7979435922817776e-05, + "loss": 0.0025, + "step": 7345 + }, + { + "epoch": 2.9873932492883286, + "grad_norm": 9.516439568357763, + "learning_rate": 1.7978825200777596e-05, + "loss": 0.5373, + "step": 7346 + }, + { + "epoch": 2.9877999186661244, + "grad_norm": 5.6965557299678915, + "learning_rate": 1.7978214396830763e-05, + "loss": 0.2207, + "step": 7347 + }, + { + "epoch": 2.98820658804392, + "grad_norm": 8.749857843124504, + "learning_rate": 1.7977603510983542e-05, + "loss": 0.3152, + "step": 7348 + }, + { + "epoch": 2.988613257421716, + "grad_norm": 2.8365359596350044, + "learning_rate": 1.7976992543242208e-05, + "loss": 0.053, + "step": 7349 + }, + { + "epoch": 2.989019926799512, + "grad_norm": 6.417105411342828, + "learning_rate": 1.7976381493613028e-05, + "loss": 0.1572, + "step": 7350 + }, + { + "epoch": 2.989426596177308, + "grad_norm": 9.503151794559951, + "learning_rate": 1.7975770362102278e-05, + "loss": 0.3257, + "step": 7351 + }, + { + "epoch": 2.9898332655551036, + "grad_norm": 7.8941658342211785, + "learning_rate": 1.7975159148716232e-05, + "loss": 0.3745, + "step": 7352 + }, + { + "epoch": 2.9902399349328994, + "grad_norm": 1.1124070754233453, + "learning_rate": 1.7974547853461162e-05, + "loss": 0.0189, + "step": 7353 + }, + { + "epoch": 2.9906466043106956, + "grad_norm": 17.525164860893437, + "learning_rate": 1.797393647634335e-05, + "loss": 0.7664, + "step": 7354 + }, + { + "epoch": 2.9910532736884914, + "grad_norm": 12.61725567811816, + "learning_rate": 1.7973325017369062e-05, + "loss": 0.4838, + "step": 7355 + }, + { + "epoch": 2.991459943066287, + "grad_norm": 8.030260851842181, + "learning_rate": 1.7972713476544582e-05, + "loss": 0.3259, + "step": 7356 + }, + { + "epoch": 2.991866612444083, + "grad_norm": 7.8891951205220225, + "learning_rate": 1.7972101853876184e-05, + "loss": 0.3524, + "step": 7357 + }, + { + "epoch": 2.9922732818218787, + "grad_norm": 3.744330484108949, + "learning_rate": 1.797149014937015e-05, + "loss": 0.0674, + "step": 7358 + }, + { + "epoch": 2.9926799511996744, + "grad_norm": 11.019410274514984, + "learning_rate": 1.7970878363032755e-05, + "loss": 0.3496, + "step": 7359 + }, + { + "epoch": 2.9930866205774707, + "grad_norm": 4.060861444605024, + "learning_rate": 1.7970266494870285e-05, + "loss": 0.0541, + "step": 7360 + }, + { + "epoch": 2.9934932899552664, + "grad_norm": 0.3836710520159523, + "learning_rate": 1.7969654544889017e-05, + "loss": 0.0083, + "step": 7361 + }, + { + "epoch": 2.993899959333062, + "grad_norm": 9.494174374240044, + "learning_rate": 1.7969042513095234e-05, + "loss": 0.2609, + "step": 7362 + }, + { + "epoch": 2.994306628710858, + "grad_norm": 7.484961480928138, + "learning_rate": 1.7968430399495217e-05, + "loss": 0.3765, + "step": 7363 + }, + { + "epoch": 2.994713298088654, + "grad_norm": 1.965815733266233, + "learning_rate": 1.7967818204095255e-05, + 
"loss": 0.0225, + "step": 7364 + }, + { + "epoch": 2.99511996746645, + "grad_norm": 10.406245723410606, + "learning_rate": 1.7967205926901625e-05, + "loss": 0.2201, + "step": 7365 + }, + { + "epoch": 2.9955266368442457, + "grad_norm": 7.309916718981865, + "learning_rate": 1.7966593567920618e-05, + "loss": 0.1526, + "step": 7366 + }, + { + "epoch": 2.9959333062220415, + "grad_norm": 7.393819045394557, + "learning_rate": 1.796598112715852e-05, + "loss": 0.1553, + "step": 7367 + }, + { + "epoch": 2.9963399755998372, + "grad_norm": 8.74927210728514, + "learning_rate": 1.7965368604621616e-05, + "loss": 0.3736, + "step": 7368 + }, + { + "epoch": 2.996746644977633, + "grad_norm": 12.171318676324848, + "learning_rate": 1.796475600031619e-05, + "loss": 0.7091, + "step": 7369 + }, + { + "epoch": 2.997153314355429, + "grad_norm": 21.789304861524222, + "learning_rate": 1.7964143314248543e-05, + "loss": 0.6806, + "step": 7370 + }, + { + "epoch": 2.997559983733225, + "grad_norm": 6.281422443744096, + "learning_rate": 1.7963530546424953e-05, + "loss": 0.1986, + "step": 7371 + }, + { + "epoch": 2.9979666531110207, + "grad_norm": 13.80944046610667, + "learning_rate": 1.7962917696851712e-05, + "loss": 0.4303, + "step": 7372 + }, + { + "epoch": 2.9983733224888165, + "grad_norm": 4.680571541410495, + "learning_rate": 1.7962304765535113e-05, + "loss": 0.0812, + "step": 7373 + }, + { + "epoch": 2.9987799918666127, + "grad_norm": 0.4057218375439991, + "learning_rate": 1.796169175248145e-05, + "loss": 0.0057, + "step": 7374 + }, + { + "epoch": 2.9991866612444085, + "grad_norm": 6.8935185415409475, + "learning_rate": 1.7961078657697014e-05, + "loss": 0.238, + "step": 7375 + }, + { + "epoch": 2.9995933306222042, + "grad_norm": 8.433420108811584, + "learning_rate": 1.79604654811881e-05, + "loss": 0.1833, + "step": 7376 + }, + { + "epoch": 3.0, + "grad_norm": 0.9248566264305813, + "learning_rate": 1.7959852222961002e-05, + "loss": 0.0148, + "step": 7377 + }, + { + "epoch": 3.0004066693777958, + "grad_norm": 11.289980880802938, + "learning_rate": 1.795923888302201e-05, + "loss": 0.676, + "step": 7378 + }, + { + "epoch": 3.0008133387555915, + "grad_norm": 15.450189637591249, + "learning_rate": 1.7958625461377433e-05, + "loss": 0.665, + "step": 7379 + }, + { + "epoch": 3.0012200081333877, + "grad_norm": 6.39793566597772, + "learning_rate": 1.7958011958033555e-05, + "loss": 0.1039, + "step": 7380 + }, + { + "epoch": 3.0016266775111835, + "grad_norm": 4.248017084452978, + "learning_rate": 1.7957398372996677e-05, + "loss": 0.237, + "step": 7381 + }, + { + "epoch": 3.0020333468889793, + "grad_norm": 2.8687867358107315, + "learning_rate": 1.7956784706273104e-05, + "loss": 0.0463, + "step": 7382 + }, + { + "epoch": 3.002440016266775, + "grad_norm": 10.205833032603017, + "learning_rate": 1.7956170957869127e-05, + "loss": 0.4661, + "step": 7383 + }, + { + "epoch": 3.002846685644571, + "grad_norm": 0.8335278367151832, + "learning_rate": 1.7955557127791055e-05, + "loss": 0.0121, + "step": 7384 + }, + { + "epoch": 3.003253355022367, + "grad_norm": 7.297745812166856, + "learning_rate": 1.7954943216045183e-05, + "loss": 0.1437, + "step": 7385 + }, + { + "epoch": 3.0036600244001628, + "grad_norm": 5.4942974610163064, + "learning_rate": 1.7954329222637817e-05, + "loss": 0.1505, + "step": 7386 + }, + { + "epoch": 3.0040666937779585, + "grad_norm": 1.891187621955178, + "learning_rate": 1.7953715147575255e-05, + "loss": 0.0447, + "step": 7387 + }, + { + "epoch": 3.0044733631557543, + "grad_norm": 2.31961065682045, + "learning_rate": 
1.7953100990863806e-05, + "loss": 0.0277, + "step": 7388 + }, + { + "epoch": 3.00488003253355, + "grad_norm": 4.6801115442647445, + "learning_rate": 1.7952486752509776e-05, + "loss": 0.0388, + "step": 7389 + }, + { + "epoch": 3.0052867019113463, + "grad_norm": 6.183972864132456, + "learning_rate": 1.795187243251946e-05, + "loss": 0.1104, + "step": 7390 + }, + { + "epoch": 3.005693371289142, + "grad_norm": 17.20267421558834, + "learning_rate": 1.795125803089918e-05, + "loss": 0.6286, + "step": 7391 + }, + { + "epoch": 3.006100040666938, + "grad_norm": 7.385826046810696, + "learning_rate": 1.795064354765523e-05, + "loss": 0.7132, + "step": 7392 + }, + { + "epoch": 3.0065067100447336, + "grad_norm": 2.4765855372114514, + "learning_rate": 1.795002898279392e-05, + "loss": 0.0545, + "step": 7393 + }, + { + "epoch": 3.0069133794225293, + "grad_norm": 4.492695737246924, + "learning_rate": 1.7949414336321563e-05, + "loss": 0.1087, + "step": 7394 + }, + { + "epoch": 3.0073200488003256, + "grad_norm": 5.15085753030416, + "learning_rate": 1.7948799608244467e-05, + "loss": 0.1522, + "step": 7395 + }, + { + "epoch": 3.0077267181781213, + "grad_norm": 8.504143214046834, + "learning_rate": 1.7948184798568942e-05, + "loss": 0.4259, + "step": 7396 + }, + { + "epoch": 3.008133387555917, + "grad_norm": 5.636450345882119, + "learning_rate": 1.7947569907301302e-05, + "loss": 0.2746, + "step": 7397 + }, + { + "epoch": 3.008540056933713, + "grad_norm": 7.623047165402743, + "learning_rate": 1.7946954934447856e-05, + "loss": 0.2477, + "step": 7398 + }, + { + "epoch": 3.0089467263115086, + "grad_norm": 6.231975498378385, + "learning_rate": 1.7946339880014917e-05, + "loss": 0.1253, + "step": 7399 + }, + { + "epoch": 3.0093533956893044, + "grad_norm": 3.651613744828137, + "learning_rate": 1.79457247440088e-05, + "loss": 0.0411, + "step": 7400 + }, + { + "epoch": 3.0097600650671006, + "grad_norm": 5.185160254422055, + "learning_rate": 1.7945109526435822e-05, + "loss": 0.2634, + "step": 7401 + }, + { + "epoch": 3.0101667344448964, + "grad_norm": 2.6648804394942065, + "learning_rate": 1.7944494227302295e-05, + "loss": 0.0399, + "step": 7402 + }, + { + "epoch": 3.010573403822692, + "grad_norm": 5.151236454063398, + "learning_rate": 1.7943878846614536e-05, + "loss": 0.1178, + "step": 7403 + }, + { + "epoch": 3.010980073200488, + "grad_norm": 4.398925237775226, + "learning_rate": 1.7943263384378863e-05, + "loss": 0.0573, + "step": 7404 + }, + { + "epoch": 3.0113867425782836, + "grad_norm": 11.724180288690855, + "learning_rate": 1.794264784060159e-05, + "loss": 0.8142, + "step": 7405 + }, + { + "epoch": 3.01179341195608, + "grad_norm": 3.508841790034821, + "learning_rate": 1.7942032215289042e-05, + "loss": 0.0794, + "step": 7406 + }, + { + "epoch": 3.0122000813338756, + "grad_norm": 4.417217751194621, + "learning_rate": 1.7941416508447537e-05, + "loss": 0.0847, + "step": 7407 + }, + { + "epoch": 3.0126067507116714, + "grad_norm": 10.536387071079375, + "learning_rate": 1.7940800720083392e-05, + "loss": 0.2596, + "step": 7408 + }, + { + "epoch": 3.013013420089467, + "grad_norm": 1.609181239170424, + "learning_rate": 1.7940184850202932e-05, + "loss": 0.0234, + "step": 7409 + }, + { + "epoch": 3.013420089467263, + "grad_norm": 2.251145389479954, + "learning_rate": 1.793956889881248e-05, + "loss": 0.0317, + "step": 7410 + }, + { + "epoch": 3.013826758845059, + "grad_norm": 10.987643094183426, + "learning_rate": 1.7938952865918357e-05, + "loss": 0.2403, + "step": 7411 + }, + { + "epoch": 3.014233428222855, + "grad_norm": 
13.981744518470341, + "learning_rate": 1.7938336751526888e-05, + "loss": 0.5339, + "step": 7412 + }, + { + "epoch": 3.0146400976006507, + "grad_norm": 9.51176058292002, + "learning_rate": 1.7937720555644395e-05, + "loss": 0.6255, + "step": 7413 + }, + { + "epoch": 3.0150467669784464, + "grad_norm": 0.442890949315769, + "learning_rate": 1.7937104278277207e-05, + "loss": 0.0071, + "step": 7414 + }, + { + "epoch": 3.015453436356242, + "grad_norm": 10.071838342635685, + "learning_rate": 1.793648791943165e-05, + "loss": 0.4386, + "step": 7415 + }, + { + "epoch": 3.0158601057340384, + "grad_norm": 0.21115463491920536, + "learning_rate": 1.7935871479114045e-05, + "loss": 0.0067, + "step": 7416 + }, + { + "epoch": 3.016266775111834, + "grad_norm": 12.725650336077955, + "learning_rate": 1.7935254957330733e-05, + "loss": 0.6292, + "step": 7417 + }, + { + "epoch": 3.01667344448963, + "grad_norm": 10.650005564207568, + "learning_rate": 1.793463835408803e-05, + "loss": 0.4757, + "step": 7418 + }, + { + "epoch": 3.0170801138674257, + "grad_norm": 8.824609401223826, + "learning_rate": 1.793402166939227e-05, + "loss": 0.207, + "step": 7419 + }, + { + "epoch": 3.0174867832452215, + "grad_norm": 5.29423028276218, + "learning_rate": 1.7933404903249786e-05, + "loss": 0.0766, + "step": 7420 + }, + { + "epoch": 3.0178934526230177, + "grad_norm": 9.540133134163538, + "learning_rate": 1.793278805566691e-05, + "loss": 0.488, + "step": 7421 + }, + { + "epoch": 3.0183001220008134, + "grad_norm": 3.5271644423895947, + "learning_rate": 1.793217112664997e-05, + "loss": 0.1081, + "step": 7422 + }, + { + "epoch": 3.018706791378609, + "grad_norm": 10.752820195543181, + "learning_rate": 1.79315541162053e-05, + "loss": 0.4214, + "step": 7423 + }, + { + "epoch": 3.019113460756405, + "grad_norm": 1.008041807848053, + "learning_rate": 1.793093702433924e-05, + "loss": 0.0135, + "step": 7424 + }, + { + "epoch": 3.0195201301342007, + "grad_norm": 12.286812724890886, + "learning_rate": 1.7930319851058114e-05, + "loss": 0.1364, + "step": 7425 + }, + { + "epoch": 3.019926799511997, + "grad_norm": 2.616509737810849, + "learning_rate": 1.7929702596368266e-05, + "loss": 0.0558, + "step": 7426 + }, + { + "epoch": 3.0203334688897927, + "grad_norm": 9.986963741713534, + "learning_rate": 1.792908526027603e-05, + "loss": 0.5371, + "step": 7427 + }, + { + "epoch": 3.0207401382675885, + "grad_norm": 9.901891124273073, + "learning_rate": 1.7928467842787743e-05, + "loss": 0.5141, + "step": 7428 + }, + { + "epoch": 3.0211468076453842, + "grad_norm": 0.8990128351305068, + "learning_rate": 1.7927850343909744e-05, + "loss": 0.0171, + "step": 7429 + }, + { + "epoch": 3.02155347702318, + "grad_norm": 0.6094460968923848, + "learning_rate": 1.792723276364837e-05, + "loss": 0.01, + "step": 7430 + }, + { + "epoch": 3.021960146400976, + "grad_norm": 6.912901854317343, + "learning_rate": 1.792661510200996e-05, + "loss": 0.1565, + "step": 7431 + }, + { + "epoch": 3.022366815778772, + "grad_norm": 2.214283687270323, + "learning_rate": 1.7925997359000863e-05, + "loss": 0.0387, + "step": 7432 + }, + { + "epoch": 3.0227734851565677, + "grad_norm": 9.617353047256115, + "learning_rate": 1.792537953462741e-05, + "loss": 0.1211, + "step": 7433 + }, + { + "epoch": 3.0231801545343635, + "grad_norm": 5.382017560679258, + "learning_rate": 1.7924761628895947e-05, + "loss": 0.0723, + "step": 7434 + }, + { + "epoch": 3.0235868239121593, + "grad_norm": 10.242814291085576, + "learning_rate": 1.7924143641812816e-05, + "loss": 0.1919, + "step": 7435 + }, + { + "epoch": 
3.0239934932899555, + "grad_norm": 11.049987197890879, + "learning_rate": 1.7923525573384365e-05, + "loss": 0.5128, + "step": 7436 + }, + { + "epoch": 3.0244001626677512, + "grad_norm": 6.630526642464049, + "learning_rate": 1.7922907423616935e-05, + "loss": 0.2954, + "step": 7437 + }, + { + "epoch": 3.024806832045547, + "grad_norm": 4.465351934047098, + "learning_rate": 1.7922289192516877e-05, + "loss": 0.0955, + "step": 7438 + }, + { + "epoch": 3.0252135014233428, + "grad_norm": 7.964609575430539, + "learning_rate": 1.792167088009053e-05, + "loss": 0.3723, + "step": 7439 + }, + { + "epoch": 3.0256201708011385, + "grad_norm": 5.231007069313041, + "learning_rate": 1.792105248634424e-05, + "loss": 0.1285, + "step": 7440 + }, + { + "epoch": 3.0260268401789343, + "grad_norm": 1.6418502921355826, + "learning_rate": 1.7920434011284364e-05, + "loss": 0.0255, + "step": 7441 + }, + { + "epoch": 3.0264335095567305, + "grad_norm": 10.408212824703494, + "learning_rate": 1.7919815454917247e-05, + "loss": 0.6002, + "step": 7442 + }, + { + "epoch": 3.0268401789345263, + "grad_norm": 9.339171325522384, + "learning_rate": 1.7919196817249234e-05, + "loss": 0.5143, + "step": 7443 + }, + { + "epoch": 3.027246848312322, + "grad_norm": 1.3581370110001478, + "learning_rate": 1.7918578098286686e-05, + "loss": 0.0149, + "step": 7444 + }, + { + "epoch": 3.027653517690118, + "grad_norm": 6.756831567006746, + "learning_rate": 1.7917959298035947e-05, + "loss": 0.2163, + "step": 7445 + }, + { + "epoch": 3.0280601870679136, + "grad_norm": 0.2852012252893311, + "learning_rate": 1.791734041650337e-05, + "loss": 0.0041, + "step": 7446 + }, + { + "epoch": 3.02846685644571, + "grad_norm": 4.467153879784997, + "learning_rate": 1.7916721453695307e-05, + "loss": 0.0683, + "step": 7447 + }, + { + "epoch": 3.0288735258235056, + "grad_norm": 3.3321264974200204, + "learning_rate": 1.7916102409618118e-05, + "loss": 0.0909, + "step": 7448 + }, + { + "epoch": 3.0292801952013013, + "grad_norm": 10.052371786609065, + "learning_rate": 1.791548328427815e-05, + "loss": 0.466, + "step": 7449 + }, + { + "epoch": 3.029686864579097, + "grad_norm": 7.408810562778364, + "learning_rate": 1.7914864077681763e-05, + "loss": 0.3533, + "step": 7450 + }, + { + "epoch": 3.030093533956893, + "grad_norm": 3.144095817087188, + "learning_rate": 1.7914244789835312e-05, + "loss": 0.08, + "step": 7451 + }, + { + "epoch": 3.030500203334689, + "grad_norm": 9.242996545720532, + "learning_rate": 1.7913625420745157e-05, + "loss": 0.3061, + "step": 7452 + }, + { + "epoch": 3.030906872712485, + "grad_norm": 11.642541138584319, + "learning_rate": 1.7913005970417655e-05, + "loss": 0.3421, + "step": 7453 + }, + { + "epoch": 3.0313135420902806, + "grad_norm": 5.213140552137732, + "learning_rate": 1.791238643885916e-05, + "loss": 0.0798, + "step": 7454 + }, + { + "epoch": 3.0317202114680764, + "grad_norm": 7.6743931733264015, + "learning_rate": 1.791176682607604e-05, + "loss": 0.4022, + "step": 7455 + }, + { + "epoch": 3.032126880845872, + "grad_norm": 11.361319107962315, + "learning_rate": 1.7911147132074653e-05, + "loss": 0.132, + "step": 7456 + }, + { + "epoch": 3.0325335502236683, + "grad_norm": 10.496099405762767, + "learning_rate": 1.7910527356861355e-05, + "loss": 0.2431, + "step": 7457 + }, + { + "epoch": 3.032940219601464, + "grad_norm": 2.688919523847179, + "learning_rate": 1.7909907500442517e-05, + "loss": 0.1005, + "step": 7458 + }, + { + "epoch": 3.03334688897926, + "grad_norm": 2.079022558800857, + "learning_rate": 1.7909287562824492e-05, + "loss": 0.0287, + 
"step": 7459 + }, + { + "epoch": 3.0337535583570556, + "grad_norm": 6.896968802324417, + "learning_rate": 1.7908667544013653e-05, + "loss": 0.1747, + "step": 7460 + }, + { + "epoch": 3.0341602277348514, + "grad_norm": 16.650178207765272, + "learning_rate": 1.7908047444016362e-05, + "loss": 0.2383, + "step": 7461 + }, + { + "epoch": 3.0345668971126476, + "grad_norm": 9.333772642657035, + "learning_rate": 1.790742726283898e-05, + "loss": 0.1703, + "step": 7462 + }, + { + "epoch": 3.0349735664904434, + "grad_norm": 9.005682104911703, + "learning_rate": 1.790680700048788e-05, + "loss": 0.3282, + "step": 7463 + }, + { + "epoch": 3.035380235868239, + "grad_norm": 3.1828661858106893, + "learning_rate": 1.7906186656969426e-05, + "loss": 0.1219, + "step": 7464 + }, + { + "epoch": 3.035786905246035, + "grad_norm": 12.471786113358407, + "learning_rate": 1.7905566232289987e-05, + "loss": 0.5584, + "step": 7465 + }, + { + "epoch": 3.0361935746238307, + "grad_norm": 6.177747226750564, + "learning_rate": 1.7904945726455932e-05, + "loss": 0.1639, + "step": 7466 + }, + { + "epoch": 3.036600244001627, + "grad_norm": 8.212069977222802, + "learning_rate": 1.790432513947363e-05, + "loss": 0.2094, + "step": 7467 + }, + { + "epoch": 3.0370069133794226, + "grad_norm": 0.7182574109744732, + "learning_rate": 1.7903704471349454e-05, + "loss": 0.011, + "step": 7468 + }, + { + "epoch": 3.0374135827572184, + "grad_norm": 2.0244183264307978, + "learning_rate": 1.7903083722089775e-05, + "loss": 0.0419, + "step": 7469 + }, + { + "epoch": 3.037820252135014, + "grad_norm": 3.941454642014003, + "learning_rate": 1.790246289170096e-05, + "loss": 0.0753, + "step": 7470 + }, + { + "epoch": 3.03822692151281, + "grad_norm": 5.542654164412738, + "learning_rate": 1.7901841980189387e-05, + "loss": 0.1126, + "step": 7471 + }, + { + "epoch": 3.038633590890606, + "grad_norm": 5.328266303920616, + "learning_rate": 1.790122098756143e-05, + "loss": 0.1454, + "step": 7472 + }, + { + "epoch": 3.039040260268402, + "grad_norm": 8.176230763684886, + "learning_rate": 1.7900599913823464e-05, + "loss": 0.1766, + "step": 7473 + }, + { + "epoch": 3.0394469296461977, + "grad_norm": 8.49835132157862, + "learning_rate": 1.789997875898186e-05, + "loss": 0.1534, + "step": 7474 + }, + { + "epoch": 3.0398535990239934, + "grad_norm": 11.662483655317692, + "learning_rate": 1.7899357523043003e-05, + "loss": 0.289, + "step": 7475 + }, + { + "epoch": 3.040260268401789, + "grad_norm": 17.71816269624062, + "learning_rate": 1.7898736206013267e-05, + "loss": 0.2322, + "step": 7476 + }, + { + "epoch": 3.0406669377795854, + "grad_norm": 4.509976464156638, + "learning_rate": 1.789811480789902e-05, + "loss": 0.2154, + "step": 7477 + }, + { + "epoch": 3.041073607157381, + "grad_norm": 9.810629019563267, + "learning_rate": 1.7897493328706656e-05, + "loss": 0.3113, + "step": 7478 + }, + { + "epoch": 3.041480276535177, + "grad_norm": 12.386784056944, + "learning_rate": 1.789687176844255e-05, + "loss": 0.5428, + "step": 7479 + }, + { + "epoch": 3.0418869459129727, + "grad_norm": 14.440198577478204, + "learning_rate": 1.7896250127113072e-05, + "loss": 0.3504, + "step": 7480 + }, + { + "epoch": 3.0422936152907685, + "grad_norm": 9.824198508099583, + "learning_rate": 1.789562840472462e-05, + "loss": 0.332, + "step": 7481 + }, + { + "epoch": 3.0427002846685642, + "grad_norm": 0.32827321709066076, + "learning_rate": 1.7895006601283567e-05, + "loss": 0.0042, + "step": 7482 + }, + { + "epoch": 3.0431069540463604, + "grad_norm": 5.095295167627074, + "learning_rate": 
1.7894384716796294e-05, + "loss": 0.1855, + "step": 7483 + }, + { + "epoch": 3.043513623424156, + "grad_norm": 13.67296093469979, + "learning_rate": 1.7893762751269195e-05, + "loss": 0.7284, + "step": 7484 + }, + { + "epoch": 3.043920292801952, + "grad_norm": 0.012614437602724125, + "learning_rate": 1.7893140704708643e-05, + "loss": 0.0002, + "step": 7485 + }, + { + "epoch": 3.0443269621797477, + "grad_norm": 0.24453070466211937, + "learning_rate": 1.789251857712103e-05, + "loss": 0.003, + "step": 7486 + }, + { + "epoch": 3.0447336315575435, + "grad_norm": 1.2147574417083467, + "learning_rate": 1.7891896368512746e-05, + "loss": 0.0117, + "step": 7487 + }, + { + "epoch": 3.0451403009353397, + "grad_norm": 0.26965522369176204, + "learning_rate": 1.789127407889017e-05, + "loss": 0.0033, + "step": 7488 + }, + { + "epoch": 3.0455469703131355, + "grad_norm": 17.20070465032024, + "learning_rate": 1.7890651708259695e-05, + "loss": 0.2015, + "step": 7489 + }, + { + "epoch": 3.0459536396909312, + "grad_norm": 7.65152039268005, + "learning_rate": 1.789002925662771e-05, + "loss": 0.3997, + "step": 7490 + }, + { + "epoch": 3.046360309068727, + "grad_norm": 7.573902984342475, + "learning_rate": 1.78894067240006e-05, + "loss": 0.1438, + "step": 7491 + }, + { + "epoch": 3.0467669784465228, + "grad_norm": 12.54057402398419, + "learning_rate": 1.788878411038476e-05, + "loss": 0.3375, + "step": 7492 + }, + { + "epoch": 3.047173647824319, + "grad_norm": 13.863056328931503, + "learning_rate": 1.7888161415786586e-05, + "loss": 0.7057, + "step": 7493 + }, + { + "epoch": 3.0475803172021148, + "grad_norm": 3.3122805377587463, + "learning_rate": 1.788753864021246e-05, + "loss": 0.1345, + "step": 7494 + }, + { + "epoch": 3.0479869865799105, + "grad_norm": 15.82496216948585, + "learning_rate": 1.7886915783668785e-05, + "loss": 0.5438, + "step": 7495 + }, + { + "epoch": 3.0483936559577063, + "grad_norm": 10.030607635541294, + "learning_rate": 1.7886292846161947e-05, + "loss": 0.2771, + "step": 7496 + }, + { + "epoch": 3.048800325335502, + "grad_norm": 11.815788795559305, + "learning_rate": 1.788566982769834e-05, + "loss": 0.2473, + "step": 7497 + }, + { + "epoch": 3.0492069947132983, + "grad_norm": 0.2616025187541233, + "learning_rate": 1.7885046728284368e-05, + "loss": 0.0038, + "step": 7498 + }, + { + "epoch": 3.049613664091094, + "grad_norm": 10.510090546218995, + "learning_rate": 1.7884423547926424e-05, + "loss": 0.4422, + "step": 7499 + }, + { + "epoch": 3.05002033346889, + "grad_norm": 10.611836880059593, + "learning_rate": 1.78838002866309e-05, + "loss": 0.7115, + "step": 7500 + }, + { + "epoch": 3.0504270028466856, + "grad_norm": 0.06278551026815447, + "learning_rate": 1.7883176944404202e-05, + "loss": 0.0012, + "step": 7501 + }, + { + "epoch": 3.0508336722244813, + "grad_norm": 4.958051520157383, + "learning_rate": 1.788255352125272e-05, + "loss": 0.0501, + "step": 7502 + }, + { + "epoch": 3.0512403416022775, + "grad_norm": 11.221568333001345, + "learning_rate": 1.7881930017182864e-05, + "loss": 0.6905, + "step": 7503 + }, + { + "epoch": 3.0516470109800733, + "grad_norm": 12.830187528645805, + "learning_rate": 1.788130643220103e-05, + "loss": 0.6048, + "step": 7504 + }, + { + "epoch": 3.052053680357869, + "grad_norm": 8.405043203882165, + "learning_rate": 1.7880682766313615e-05, + "loss": 0.2194, + "step": 7505 + }, + { + "epoch": 3.052460349735665, + "grad_norm": 9.036850387993118, + "learning_rate": 1.7880059019527026e-05, + "loss": 0.3805, + "step": 7506 + }, + { + "epoch": 3.0528670191134606, + 
"grad_norm": 6.35251666543569, + "learning_rate": 1.7879435191847666e-05, + "loss": 0.2306, + "step": 7507 + }, + { + "epoch": 3.053273688491257, + "grad_norm": 11.329394602991155, + "learning_rate": 1.7878811283281942e-05, + "loss": 0.8071, + "step": 7508 + }, + { + "epoch": 3.0536803578690526, + "grad_norm": 4.159752431949089, + "learning_rate": 1.787818729383625e-05, + "loss": 0.1141, + "step": 7509 + }, + { + "epoch": 3.0540870272468483, + "grad_norm": 7.472766015462523, + "learning_rate": 1.7877563223517e-05, + "loss": 0.3417, + "step": 7510 + }, + { + "epoch": 3.054493696624644, + "grad_norm": 0.3077816183131289, + "learning_rate": 1.78769390723306e-05, + "loss": 0.0041, + "step": 7511 + }, + { + "epoch": 3.05490036600244, + "grad_norm": 8.145130209459246, + "learning_rate": 1.787631484028346e-05, + "loss": 0.3772, + "step": 7512 + }, + { + "epoch": 3.055307035380236, + "grad_norm": 10.204499359875989, + "learning_rate": 1.7875690527381976e-05, + "loss": 0.5998, + "step": 7513 + }, + { + "epoch": 3.055713704758032, + "grad_norm": 13.725512848465744, + "learning_rate": 1.787506613363257e-05, + "loss": 0.3507, + "step": 7514 + }, + { + "epoch": 3.0561203741358276, + "grad_norm": 5.9738520045256225, + "learning_rate": 1.7874441659041647e-05, + "loss": 0.1776, + "step": 7515 + }, + { + "epoch": 3.0565270435136234, + "grad_norm": 12.23677495169518, + "learning_rate": 1.7873817103615617e-05, + "loss": 0.3585, + "step": 7516 + }, + { + "epoch": 3.056933712891419, + "grad_norm": 5.045274519538837, + "learning_rate": 1.7873192467360893e-05, + "loss": 0.0873, + "step": 7517 + }, + { + "epoch": 3.0573403822692153, + "grad_norm": 6.761217724854286, + "learning_rate": 1.787256775028388e-05, + "loss": 0.4044, + "step": 7518 + }, + { + "epoch": 3.057747051647011, + "grad_norm": 34.15465231744469, + "learning_rate": 1.7871942952391004e-05, + "loss": 0.6923, + "step": 7519 + }, + { + "epoch": 3.058153721024807, + "grad_norm": 7.766353125947408, + "learning_rate": 1.7871318073688667e-05, + "loss": 0.4036, + "step": 7520 + }, + { + "epoch": 3.0585603904026026, + "grad_norm": 5.895060448124197, + "learning_rate": 1.787069311418329e-05, + "loss": 0.1815, + "step": 7521 + }, + { + "epoch": 3.0589670597803984, + "grad_norm": 7.969817504251766, + "learning_rate": 1.7870068073881286e-05, + "loss": 0.3748, + "step": 7522 + }, + { + "epoch": 3.059373729158194, + "grad_norm": 1.544016252899977, + "learning_rate": 1.7869442952789073e-05, + "loss": 0.0238, + "step": 7523 + }, + { + "epoch": 3.0597803985359904, + "grad_norm": 5.4238489580443305, + "learning_rate": 1.786881775091307e-05, + "loss": 0.1823, + "step": 7524 + }, + { + "epoch": 3.060187067913786, + "grad_norm": 11.093448750677105, + "learning_rate": 1.7868192468259686e-05, + "loss": 0.2189, + "step": 7525 + }, + { + "epoch": 3.060593737291582, + "grad_norm": 13.143116384542221, + "learning_rate": 1.786756710483535e-05, + "loss": 0.9465, + "step": 7526 + }, + { + "epoch": 3.0610004066693777, + "grad_norm": 14.592310893722335, + "learning_rate": 1.7866941660646483e-05, + "loss": 0.7069, + "step": 7527 + }, + { + "epoch": 3.0614070760471734, + "grad_norm": 1.562146978955314, + "learning_rate": 1.7866316135699495e-05, + "loss": 0.0194, + "step": 7528 + }, + { + "epoch": 3.0618137454249696, + "grad_norm": 17.882519442626414, + "learning_rate": 1.786569053000081e-05, + "loss": 1.0744, + "step": 7529 + }, + { + "epoch": 3.0622204148027654, + "grad_norm": 12.01244208879385, + "learning_rate": 1.786506484355686e-05, + "loss": 0.2251, + "step": 7530 + }, + { + 
"epoch": 3.062627084180561, + "grad_norm": 3.9563612690277803, + "learning_rate": 1.7864439076374055e-05, + "loss": 0.0638, + "step": 7531 + }, + { + "epoch": 3.063033753558357, + "grad_norm": 10.128017785069222, + "learning_rate": 1.7863813228458832e-05, + "loss": 0.3486, + "step": 7532 + }, + { + "epoch": 3.0634404229361527, + "grad_norm": 10.35313086093516, + "learning_rate": 1.78631872998176e-05, + "loss": 0.77, + "step": 7533 + }, + { + "epoch": 3.063847092313949, + "grad_norm": 1.2319794973658886, + "learning_rate": 1.78625612904568e-05, + "loss": 0.0237, + "step": 7534 + }, + { + "epoch": 3.0642537616917447, + "grad_norm": 8.327273724108812, + "learning_rate": 1.7861935200382847e-05, + "loss": 0.3645, + "step": 7535 + }, + { + "epoch": 3.0646604310695404, + "grad_norm": 3.8760394917291174, + "learning_rate": 1.7861309029602174e-05, + "loss": 0.066, + "step": 7536 + }, + { + "epoch": 3.065067100447336, + "grad_norm": 4.453391007689209, + "learning_rate": 1.7860682778121207e-05, + "loss": 0.0788, + "step": 7537 + }, + { + "epoch": 3.065473769825132, + "grad_norm": 9.315864396632291, + "learning_rate": 1.7860056445946372e-05, + "loss": 0.3248, + "step": 7538 + }, + { + "epoch": 3.065880439202928, + "grad_norm": 8.7666911152161, + "learning_rate": 1.7859430033084107e-05, + "loss": 0.115, + "step": 7539 + }, + { + "epoch": 3.066287108580724, + "grad_norm": 4.083676785471791, + "learning_rate": 1.785880353954083e-05, + "loss": 0.1453, + "step": 7540 + }, + { + "epoch": 3.0666937779585197, + "grad_norm": 3.634714684747258, + "learning_rate": 1.7858176965322988e-05, + "loss": 0.117, + "step": 7541 + }, + { + "epoch": 3.0671004473363155, + "grad_norm": 1.8789137724225933, + "learning_rate": 1.7857550310437e-05, + "loss": 0.0304, + "step": 7542 + }, + { + "epoch": 3.0675071167141112, + "grad_norm": 10.14047535870293, + "learning_rate": 1.7856923574889306e-05, + "loss": 0.3317, + "step": 7543 + }, + { + "epoch": 3.0679137860919075, + "grad_norm": 11.111516992901997, + "learning_rate": 1.7856296758686333e-05, + "loss": 0.2444, + "step": 7544 + }, + { + "epoch": 3.0683204554697032, + "grad_norm": 10.774618663848837, + "learning_rate": 1.785566986183452e-05, + "loss": 0.3267, + "step": 7545 + }, + { + "epoch": 3.068727124847499, + "grad_norm": 1.9167444282220503, + "learning_rate": 1.7855042884340306e-05, + "loss": 0.0361, + "step": 7546 + }, + { + "epoch": 3.0691337942252948, + "grad_norm": 50.05743904821063, + "learning_rate": 1.785441582621012e-05, + "loss": 0.4688, + "step": 7547 + }, + { + "epoch": 3.0695404636030905, + "grad_norm": 4.9504744239992675, + "learning_rate": 1.7853788687450407e-05, + "loss": 0.0692, + "step": 7548 + }, + { + "epoch": 3.0699471329808867, + "grad_norm": 10.436105267965608, + "learning_rate": 1.78531614680676e-05, + "loss": 0.3942, + "step": 7549 + }, + { + "epoch": 3.0703538023586825, + "grad_norm": 8.758859028304448, + "learning_rate": 1.785253416806813e-05, + "loss": 0.2771, + "step": 7550 + }, + { + "epoch": 3.0707604717364783, + "grad_norm": 2.2903586907251654, + "learning_rate": 1.785190678745845e-05, + "loss": 0.0329, + "step": 7551 + }, + { + "epoch": 3.071167141114274, + "grad_norm": 20.617445789091043, + "learning_rate": 1.7851279326244998e-05, + "loss": 0.2277, + "step": 7552 + }, + { + "epoch": 3.07157381049207, + "grad_norm": 16.501506801472303, + "learning_rate": 1.7850651784434208e-05, + "loss": 1.0164, + "step": 7553 + }, + { + "epoch": 3.071980479869866, + "grad_norm": 3.338578777804294, + "learning_rate": 1.785002416203253e-05, + "loss": 0.0513, + 
"step": 7554 + }, + { + "epoch": 3.0723871492476618, + "grad_norm": 10.66355989257746, + "learning_rate": 1.78493964590464e-05, + "loss": 0.2055, + "step": 7555 + }, + { + "epoch": 3.0727938186254575, + "grad_norm": 7.1941869377757826, + "learning_rate": 1.7848768675482267e-05, + "loss": 0.1844, + "step": 7556 + }, + { + "epoch": 3.0732004880032533, + "grad_norm": 12.852823567388198, + "learning_rate": 1.7848140811346574e-05, + "loss": 0.5172, + "step": 7557 + }, + { + "epoch": 3.073607157381049, + "grad_norm": 0.9061515679244602, + "learning_rate": 1.7847512866645762e-05, + "loss": 0.0107, + "step": 7558 + }, + { + "epoch": 3.0740138267588453, + "grad_norm": 4.448772505795109, + "learning_rate": 1.784688484138628e-05, + "loss": 0.0556, + "step": 7559 + }, + { + "epoch": 3.074420496136641, + "grad_norm": 13.217515459703254, + "learning_rate": 1.784625673557458e-05, + "loss": 0.6212, + "step": 7560 + }, + { + "epoch": 3.074827165514437, + "grad_norm": 10.937041553443168, + "learning_rate": 1.7845628549217106e-05, + "loss": 0.2975, + "step": 7561 + }, + { + "epoch": 3.0752338348922326, + "grad_norm": 1.4418830217601042, + "learning_rate": 1.7845000282320302e-05, + "loss": 0.0241, + "step": 7562 + }, + { + "epoch": 3.0756405042700283, + "grad_norm": 10.348643426699809, + "learning_rate": 1.7844371934890625e-05, + "loss": 0.459, + "step": 7563 + }, + { + "epoch": 3.076047173647824, + "grad_norm": 0.3306119864385659, + "learning_rate": 1.7843743506934522e-05, + "loss": 0.0061, + "step": 7564 + }, + { + "epoch": 3.0764538430256203, + "grad_norm": 1.4018866183757033, + "learning_rate": 1.784311499845844e-05, + "loss": 0.0221, + "step": 7565 + }, + { + "epoch": 3.076860512403416, + "grad_norm": 9.1118546134467, + "learning_rate": 1.784248640946884e-05, + "loss": 0.2844, + "step": 7566 + }, + { + "epoch": 3.077267181781212, + "grad_norm": 16.425202916518007, + "learning_rate": 1.784185773997217e-05, + "loss": 1.2129, + "step": 7567 + }, + { + "epoch": 3.0776738511590076, + "grad_norm": 8.537310855630937, + "learning_rate": 1.7841228989974885e-05, + "loss": 0.3156, + "step": 7568 + }, + { + "epoch": 3.0780805205368034, + "grad_norm": 8.409665751365559, + "learning_rate": 1.7840600159483436e-05, + "loss": 0.2696, + "step": 7569 + }, + { + "epoch": 3.0784871899145996, + "grad_norm": 0.2950014489583783, + "learning_rate": 1.7839971248504277e-05, + "loss": 0.0042, + "step": 7570 + }, + { + "epoch": 3.0788938592923953, + "grad_norm": 9.006943818895643, + "learning_rate": 1.783934225704387e-05, + "loss": 0.7465, + "step": 7571 + }, + { + "epoch": 3.079300528670191, + "grad_norm": 5.6354802974240785, + "learning_rate": 1.783871318510867e-05, + "loss": 0.1299, + "step": 7572 + }, + { + "epoch": 3.079707198047987, + "grad_norm": 2.9939374173844815, + "learning_rate": 1.7838084032705132e-05, + "loss": 0.0471, + "step": 7573 + }, + { + "epoch": 3.0801138674257826, + "grad_norm": 12.97137095712827, + "learning_rate": 1.783745479983972e-05, + "loss": 0.3595, + "step": 7574 + }, + { + "epoch": 3.080520536803579, + "grad_norm": 0.18413034676998857, + "learning_rate": 1.7836825486518886e-05, + "loss": 0.0036, + "step": 7575 + }, + { + "epoch": 3.0809272061813746, + "grad_norm": 6.676289153804329, + "learning_rate": 1.7836196092749098e-05, + "loss": 0.1493, + "step": 7576 + }, + { + "epoch": 3.0813338755591704, + "grad_norm": 9.87185117544567, + "learning_rate": 1.783556661853681e-05, + "loss": 0.4593, + "step": 7577 + }, + { + "epoch": 3.081740544936966, + "grad_norm": 10.088612977243429, + "learning_rate": 
1.783493706388849e-05, + "loss": 0.4093, + "step": 7578 + }, + { + "epoch": 3.082147214314762, + "grad_norm": 9.114954423150778, + "learning_rate": 1.7834307428810594e-05, + "loss": 0.593, + "step": 7579 + }, + { + "epoch": 3.082553883692558, + "grad_norm": 3.851684517782282, + "learning_rate": 1.7833677713309595e-05, + "loss": 0.0483, + "step": 7580 + }, + { + "epoch": 3.082960553070354, + "grad_norm": 1.3745587288599315, + "learning_rate": 1.7833047917391944e-05, + "loss": 0.0261, + "step": 7581 + }, + { + "epoch": 3.0833672224481496, + "grad_norm": 5.7411522952736505, + "learning_rate": 1.7832418041064118e-05, + "loss": 0.071, + "step": 7582 + }, + { + "epoch": 3.0837738918259454, + "grad_norm": 5.497148775805715, + "learning_rate": 1.783178808433258e-05, + "loss": 0.2903, + "step": 7583 + }, + { + "epoch": 3.084180561203741, + "grad_norm": 8.672906146894118, + "learning_rate": 1.7831158047203792e-05, + "loss": 0.1947, + "step": 7584 + }, + { + "epoch": 3.0845872305815374, + "grad_norm": 5.33778372342442, + "learning_rate": 1.7830527929684232e-05, + "loss": 0.1113, + "step": 7585 + }, + { + "epoch": 3.084993899959333, + "grad_norm": 1.071247064910352, + "learning_rate": 1.7829897731780358e-05, + "loss": 0.023, + "step": 7586 + }, + { + "epoch": 3.085400569337129, + "grad_norm": 0.7284818200326502, + "learning_rate": 1.782926745349864e-05, + "loss": 0.0119, + "step": 7587 + }, + { + "epoch": 3.0858072387149247, + "grad_norm": 0.8488938335342666, + "learning_rate": 1.7828637094845554e-05, + "loss": 0.0146, + "step": 7588 + }, + { + "epoch": 3.0862139080927204, + "grad_norm": 7.433653261006115, + "learning_rate": 1.782800665582757e-05, + "loss": 0.2516, + "step": 7589 + }, + { + "epoch": 3.0866205774705167, + "grad_norm": 8.604000613661867, + "learning_rate": 1.7827376136451152e-05, + "loss": 0.1754, + "step": 7590 + }, + { + "epoch": 3.0870272468483124, + "grad_norm": 5.2989339914583615, + "learning_rate": 1.7826745536722786e-05, + "loss": 0.1128, + "step": 7591 + }, + { + "epoch": 3.087433916226108, + "grad_norm": 3.2526694364820905, + "learning_rate": 1.7826114856648933e-05, + "loss": 0.0619, + "step": 7592 + }, + { + "epoch": 3.087840585603904, + "grad_norm": 8.057381199611633, + "learning_rate": 1.7825484096236075e-05, + "loss": 0.1979, + "step": 7593 + }, + { + "epoch": 3.0882472549816997, + "grad_norm": 7.682711336175831, + "learning_rate": 1.782485325549068e-05, + "loss": 0.3076, + "step": 7594 + }, + { + "epoch": 3.088653924359496, + "grad_norm": 18.632963978053695, + "learning_rate": 1.782422233441923e-05, + "loss": 0.4887, + "step": 7595 + }, + { + "epoch": 3.0890605937372917, + "grad_norm": 2.1952055898907723, + "learning_rate": 1.7823591333028207e-05, + "loss": 0.0259, + "step": 7596 + }, + { + "epoch": 3.0894672631150875, + "grad_norm": 4.417088465024285, + "learning_rate": 1.782296025132407e-05, + "loss": 0.114, + "step": 7597 + }, + { + "epoch": 3.0898739324928832, + "grad_norm": 20.77587870112115, + "learning_rate": 1.782232908931332e-05, + "loss": 0.4419, + "step": 7598 + }, + { + "epoch": 3.090280601870679, + "grad_norm": 13.232801203903636, + "learning_rate": 1.7821697847002414e-05, + "loss": 0.574, + "step": 7599 + }, + { + "epoch": 3.090687271248475, + "grad_norm": 9.099103867719478, + "learning_rate": 1.782106652439785e-05, + "loss": 0.3353, + "step": 7600 + }, + { + "epoch": 3.091093940626271, + "grad_norm": 11.37998814829923, + "learning_rate": 1.78204351215061e-05, + "loss": 0.428, + "step": 7601 + }, + { + "epoch": 3.0915006100040667, + "grad_norm": 
15.679171821018874, + "learning_rate": 1.7819803638333648e-05, + "loss": 0.2893, + "step": 7602 + }, + { + "epoch": 3.0919072793818625, + "grad_norm": 3.074479305945526, + "learning_rate": 1.7819172074886976e-05, + "loss": 0.0615, + "step": 7603 + }, + { + "epoch": 3.0923139487596583, + "grad_norm": 17.346925691744612, + "learning_rate": 1.7818540431172566e-05, + "loss": 1.0605, + "step": 7604 + }, + { + "epoch": 3.092720618137454, + "grad_norm": 6.134524124769817, + "learning_rate": 1.7817908707196904e-05, + "loss": 0.2473, + "step": 7605 + }, + { + "epoch": 3.0931272875152502, + "grad_norm": 12.1117475282002, + "learning_rate": 1.7817276902966477e-05, + "loss": 0.4859, + "step": 7606 + }, + { + "epoch": 3.093533956893046, + "grad_norm": 1.4115844877688213, + "learning_rate": 1.7816645018487766e-05, + "loss": 0.0252, + "step": 7607 + }, + { + "epoch": 3.0939406262708418, + "grad_norm": 2.8921416001092046, + "learning_rate": 1.781601305376726e-05, + "loss": 0.0501, + "step": 7608 + }, + { + "epoch": 3.0943472956486375, + "grad_norm": 2.131351890850271, + "learning_rate": 1.7815381008811443e-05, + "loss": 0.0324, + "step": 7609 + }, + { + "epoch": 3.0947539650264333, + "grad_norm": 2.327365147596956, + "learning_rate": 1.7814748883626812e-05, + "loss": 0.024, + "step": 7610 + }, + { + "epoch": 3.0951606344042295, + "grad_norm": 5.496249338711844, + "learning_rate": 1.781411667821985e-05, + "loss": 0.0434, + "step": 7611 + }, + { + "epoch": 3.0955673037820253, + "grad_norm": 10.725445286287824, + "learning_rate": 1.781348439259704e-05, + "loss": 0.3283, + "step": 7612 + }, + { + "epoch": 3.095973973159821, + "grad_norm": 7.3957353009698314, + "learning_rate": 1.781285202676489e-05, + "loss": 0.3904, + "step": 7613 + }, + { + "epoch": 3.096380642537617, + "grad_norm": 10.125186387711551, + "learning_rate": 1.7812219580729875e-05, + "loss": 0.2918, + "step": 7614 + }, + { + "epoch": 3.0967873119154126, + "grad_norm": 4.144399392035338, + "learning_rate": 1.78115870544985e-05, + "loss": 0.0583, + "step": 7615 + }, + { + "epoch": 3.0971939812932088, + "grad_norm": 9.250174342672931, + "learning_rate": 1.781095444807725e-05, + "loss": 0.1875, + "step": 7616 + }, + { + "epoch": 3.0976006506710045, + "grad_norm": 11.641069228781495, + "learning_rate": 1.781032176147262e-05, + "loss": 0.3175, + "step": 7617 + }, + { + "epoch": 3.0980073200488003, + "grad_norm": 6.047695039947801, + "learning_rate": 1.780968899469111e-05, + "loss": 0.1806, + "step": 7618 + }, + { + "epoch": 3.098413989426596, + "grad_norm": 6.725917932491228, + "learning_rate": 1.7809056147739207e-05, + "loss": 0.2311, + "step": 7619 + }, + { + "epoch": 3.098820658804392, + "grad_norm": 4.096944667143028, + "learning_rate": 1.7808423220623416e-05, + "loss": 0.2757, + "step": 7620 + }, + { + "epoch": 3.099227328182188, + "grad_norm": 5.001648007127928, + "learning_rate": 1.7807790213350227e-05, + "loss": 0.1197, + "step": 7621 + }, + { + "epoch": 3.099633997559984, + "grad_norm": 5.456356049546759, + "learning_rate": 1.7807157125926146e-05, + "loss": 0.1718, + "step": 7622 + }, + { + "epoch": 3.1000406669377796, + "grad_norm": 10.155486395079778, + "learning_rate": 1.7806523958357663e-05, + "loss": 0.6183, + "step": 7623 + }, + { + "epoch": 3.1004473363155753, + "grad_norm": 2.0848546910560715, + "learning_rate": 1.780589071065129e-05, + "loss": 0.0469, + "step": 7624 + }, + { + "epoch": 3.100854005693371, + "grad_norm": 8.387118865686398, + "learning_rate": 1.7805257382813514e-05, + "loss": 0.2131, + "step": 7625 + }, + { + "epoch": 
3.1012606750711673, + "grad_norm": 6.630917750873761, + "learning_rate": 1.7804623974850844e-05, + "loss": 0.2389, + "step": 7626 + }, + { + "epoch": 3.101667344448963, + "grad_norm": 11.392560606023673, + "learning_rate": 1.780399048676978e-05, + "loss": 0.6073, + "step": 7627 + }, + { + "epoch": 3.102074013826759, + "grad_norm": 12.530607628700821, + "learning_rate": 1.7803356918576832e-05, + "loss": 0.4917, + "step": 7628 + }, + { + "epoch": 3.1024806832045546, + "grad_norm": 3.161733141169246, + "learning_rate": 1.7802723270278494e-05, + "loss": 0.0614, + "step": 7629 + }, + { + "epoch": 3.1028873525823504, + "grad_norm": 0.35175449458298896, + "learning_rate": 1.7802089541881275e-05, + "loss": 0.01, + "step": 7630 + }, + { + "epoch": 3.1032940219601466, + "grad_norm": 7.041969619067303, + "learning_rate": 1.7801455733391676e-05, + "loss": 0.2394, + "step": 7631 + }, + { + "epoch": 3.1037006913379424, + "grad_norm": 1.3325853385258022, + "learning_rate": 1.780082184481621e-05, + "loss": 0.0234, + "step": 7632 + }, + { + "epoch": 3.104107360715738, + "grad_norm": 0.2907791551063418, + "learning_rate": 1.7800187876161384e-05, + "loss": 0.0044, + "step": 7633 + }, + { + "epoch": 3.104514030093534, + "grad_norm": 2.6080162185331215, + "learning_rate": 1.7799553827433702e-05, + "loss": 0.0406, + "step": 7634 + }, + { + "epoch": 3.1049206994713296, + "grad_norm": 4.266499890907933, + "learning_rate": 1.7798919698639675e-05, + "loss": 0.1359, + "step": 7635 + }, + { + "epoch": 3.105327368849126, + "grad_norm": 11.706426799623395, + "learning_rate": 1.779828548978581e-05, + "loss": 0.4342, + "step": 7636 + }, + { + "epoch": 3.1057340382269216, + "grad_norm": 9.037260986792433, + "learning_rate": 1.7797651200878624e-05, + "loss": 0.4035, + "step": 7637 + }, + { + "epoch": 3.1061407076047174, + "grad_norm": 6.521489445837745, + "learning_rate": 1.779701683192462e-05, + "loss": 0.1512, + "step": 7638 + }, + { + "epoch": 3.106547376982513, + "grad_norm": 10.860218279157655, + "learning_rate": 1.7796382382930316e-05, + "loss": 0.4632, + "step": 7639 + }, + { + "epoch": 3.106954046360309, + "grad_norm": 11.110510163915785, + "learning_rate": 1.779574785390222e-05, + "loss": 0.4508, + "step": 7640 + }, + { + "epoch": 3.107360715738105, + "grad_norm": 6.709627349865816, + "learning_rate": 1.7795113244846855e-05, + "loss": 0.1583, + "step": 7641 + }, + { + "epoch": 3.107767385115901, + "grad_norm": 0.8545472232346311, + "learning_rate": 1.779447855577072e-05, + "loss": 0.0096, + "step": 7642 + }, + { + "epoch": 3.1081740544936967, + "grad_norm": 2.8460670674474726, + "learning_rate": 1.779384378668035e-05, + "loss": 0.0339, + "step": 7643 + }, + { + "epoch": 3.1085807238714924, + "grad_norm": 8.567566272681155, + "learning_rate": 1.7793208937582246e-05, + "loss": 0.1902, + "step": 7644 + }, + { + "epoch": 3.108987393249288, + "grad_norm": 11.462080568259784, + "learning_rate": 1.779257400848293e-05, + "loss": 0.427, + "step": 7645 + }, + { + "epoch": 3.109394062627084, + "grad_norm": 10.567203647119486, + "learning_rate": 1.779193899938892e-05, + "loss": 0.4205, + "step": 7646 + }, + { + "epoch": 3.10980073200488, + "grad_norm": 4.704451128795149, + "learning_rate": 1.7791303910306734e-05, + "loss": 0.1589, + "step": 7647 + }, + { + "epoch": 3.110207401382676, + "grad_norm": 6.185075264168403, + "learning_rate": 1.7790668741242894e-05, + "loss": 0.3241, + "step": 7648 + }, + { + "epoch": 3.1106140707604717, + "grad_norm": 18.86923587097067, + "learning_rate": 1.7790033492203918e-05, + "loss": 0.2838, + 
"step": 7649 + }, + { + "epoch": 3.1110207401382675, + "grad_norm": 2.2388630245066126, + "learning_rate": 1.7789398163196324e-05, + "loss": 0.0189, + "step": 7650 + }, + { + "epoch": 3.1114274095160637, + "grad_norm": 12.06998749259918, + "learning_rate": 1.778876275422664e-05, + "loss": 0.6961, + "step": 7651 + }, + { + "epoch": 3.1118340788938594, + "grad_norm": 2.5733220925854727, + "learning_rate": 1.7788127265301385e-05, + "loss": 0.1594, + "step": 7652 + }, + { + "epoch": 3.112240748271655, + "grad_norm": 9.834984298657856, + "learning_rate": 1.778749169642709e-05, + "loss": 0.278, + "step": 7653 + }, + { + "epoch": 3.112647417649451, + "grad_norm": 8.231485867035522, + "learning_rate": 1.778685604761027e-05, + "loss": 0.2225, + "step": 7654 + }, + { + "epoch": 3.1130540870272467, + "grad_norm": 14.24597225550565, + "learning_rate": 1.7786220318857453e-05, + "loss": 1.1178, + "step": 7655 + }, + { + "epoch": 3.1134607564050425, + "grad_norm": 11.554404789373827, + "learning_rate": 1.7785584510175162e-05, + "loss": 0.7999, + "step": 7656 + }, + { + "epoch": 3.1138674257828387, + "grad_norm": 13.676521476820055, + "learning_rate": 1.7784948621569932e-05, + "loss": 0.9371, + "step": 7657 + }, + { + "epoch": 3.1142740951606345, + "grad_norm": 9.03948148681308, + "learning_rate": 1.7784312653048285e-05, + "loss": 0.4844, + "step": 7658 + }, + { + "epoch": 3.1146807645384302, + "grad_norm": 18.718608890963164, + "learning_rate": 1.778367660461675e-05, + "loss": 1.667, + "step": 7659 + }, + { + "epoch": 3.115087433916226, + "grad_norm": 7.772034155867205, + "learning_rate": 1.7783040476281858e-05, + "loss": 0.3104, + "step": 7660 + }, + { + "epoch": 3.1154941032940218, + "grad_norm": 3.5459369179743585, + "learning_rate": 1.778240426805014e-05, + "loss": 0.0795, + "step": 7661 + }, + { + "epoch": 3.115900772671818, + "grad_norm": 9.713331257413651, + "learning_rate": 1.7781767979928127e-05, + "loss": 0.2978, + "step": 7662 + }, + { + "epoch": 3.1163074420496137, + "grad_norm": 9.447443815995255, + "learning_rate": 1.778113161192234e-05, + "loss": 0.2456, + "step": 7663 + }, + { + "epoch": 3.1167141114274095, + "grad_norm": 8.533541645955319, + "learning_rate": 1.778049516403933e-05, + "loss": 0.2013, + "step": 7664 + }, + { + "epoch": 3.1171207808052053, + "grad_norm": 9.23617030436636, + "learning_rate": 1.7779858636285618e-05, + "loss": 0.3296, + "step": 7665 + }, + { + "epoch": 3.117527450183001, + "grad_norm": 10.709180111690348, + "learning_rate": 1.7779222028667743e-05, + "loss": 0.4395, + "step": 7666 + }, + { + "epoch": 3.1179341195607972, + "grad_norm": 9.52659252810403, + "learning_rate": 1.777858534119224e-05, + "loss": 0.2484, + "step": 7667 + }, + { + "epoch": 3.118340788938593, + "grad_norm": 16.10110572192947, + "learning_rate": 1.7777948573865638e-05, + "loss": 1.1018, + "step": 7668 + }, + { + "epoch": 3.1187474583163888, + "grad_norm": 9.288434609266004, + "learning_rate": 1.7777311726694485e-05, + "loss": 0.3109, + "step": 7669 + }, + { + "epoch": 3.1191541276941845, + "grad_norm": 12.05185631921254, + "learning_rate": 1.777667479968531e-05, + "loss": 0.4922, + "step": 7670 + }, + { + "epoch": 3.1195607970719803, + "grad_norm": 6.73455039693331, + "learning_rate": 1.7776037792844653e-05, + "loss": 0.233, + "step": 7671 + }, + { + "epoch": 3.1199674664497765, + "grad_norm": 0.4372125696957869, + "learning_rate": 1.777540070617906e-05, + "loss": 0.0066, + "step": 7672 + }, + { + "epoch": 3.1203741358275723, + "grad_norm": 12.535719252912587, + "learning_rate": 
1.777476353969506e-05, + "loss": 0.2812, + "step": 7673 + }, + { + "epoch": 3.120780805205368, + "grad_norm": 3.4439607824696785, + "learning_rate": 1.77741262933992e-05, + "loss": 0.0502, + "step": 7674 + }, + { + "epoch": 3.121187474583164, + "grad_norm": 11.712664786001978, + "learning_rate": 1.777348896729802e-05, + "loss": 0.43, + "step": 7675 + }, + { + "epoch": 3.1215941439609596, + "grad_norm": 5.6442435815379275, + "learning_rate": 1.7772851561398065e-05, + "loss": 0.1467, + "step": 7676 + }, + { + "epoch": 3.122000813338756, + "grad_norm": 5.4919394013601925, + "learning_rate": 1.777221407570588e-05, + "loss": 0.2513, + "step": 7677 + }, + { + "epoch": 3.1224074827165516, + "grad_norm": 1.2859795300960477, + "learning_rate": 1.7771576510228e-05, + "loss": 0.0178, + "step": 7678 + }, + { + "epoch": 3.1228141520943473, + "grad_norm": 5.650837929584577, + "learning_rate": 1.777093886497098e-05, + "loss": 0.1881, + "step": 7679 + }, + { + "epoch": 3.123220821472143, + "grad_norm": 5.469706864943859, + "learning_rate": 1.777030113994136e-05, + "loss": 0.3397, + "step": 7680 + }, + { + "epoch": 3.123627490849939, + "grad_norm": 5.854608983003498, + "learning_rate": 1.7769663335145688e-05, + "loss": 0.3031, + "step": 7681 + }, + { + "epoch": 3.124034160227735, + "grad_norm": 10.393234105534836, + "learning_rate": 1.7769025450590514e-05, + "loss": 0.2638, + "step": 7682 + }, + { + "epoch": 3.124440829605531, + "grad_norm": 7.892937371482119, + "learning_rate": 1.7768387486282383e-05, + "loss": 0.1276, + "step": 7683 + }, + { + "epoch": 3.1248474989833266, + "grad_norm": 16.0995285437546, + "learning_rate": 1.776774944222784e-05, + "loss": 0.9339, + "step": 7684 + }, + { + "epoch": 3.1252541683611224, + "grad_norm": 9.118930498857221, + "learning_rate": 1.7767111318433442e-05, + "loss": 0.1465, + "step": 7685 + }, + { + "epoch": 3.125660837738918, + "grad_norm": 8.471333102741355, + "learning_rate": 1.776647311490574e-05, + "loss": 0.2025, + "step": 7686 + }, + { + "epoch": 3.126067507116714, + "grad_norm": 6.492058986236494, + "learning_rate": 1.776583483165128e-05, + "loss": 0.1528, + "step": 7687 + }, + { + "epoch": 3.12647417649451, + "grad_norm": 7.152990084502177, + "learning_rate": 1.7765196468676618e-05, + "loss": 0.2259, + "step": 7688 + }, + { + "epoch": 3.126880845872306, + "grad_norm": 8.942537517090486, + "learning_rate": 1.7764558025988307e-05, + "loss": 0.2804, + "step": 7689 + }, + { + "epoch": 3.1272875152501016, + "grad_norm": 2.69296726325709, + "learning_rate": 1.7763919503592898e-05, + "loss": 0.0409, + "step": 7690 + }, + { + "epoch": 3.1276941846278974, + "grad_norm": 11.552689663307945, + "learning_rate": 1.7763280901496947e-05, + "loss": 0.3595, + "step": 7691 + }, + { + "epoch": 3.1281008540056936, + "grad_norm": 0.15269648969285624, + "learning_rate": 1.7762642219707014e-05, + "loss": 0.0024, + "step": 7692 + }, + { + "epoch": 3.1285075233834894, + "grad_norm": 12.644364249240558, + "learning_rate": 1.7762003458229648e-05, + "loss": 0.3316, + "step": 7693 + }, + { + "epoch": 3.128914192761285, + "grad_norm": 11.631894813108682, + "learning_rate": 1.7761364617071414e-05, + "loss": 0.3951, + "step": 7694 + }, + { + "epoch": 3.129320862139081, + "grad_norm": 7.197488898401046, + "learning_rate": 1.776072569623886e-05, + "loss": 0.2222, + "step": 7695 + }, + { + "epoch": 3.1297275315168767, + "grad_norm": 6.2859150337824765, + "learning_rate": 1.7760086695738556e-05, + "loss": 0.288, + "step": 7696 + }, + { + "epoch": 3.1301342008946724, + "grad_norm": 
2.7189938314964666, + "learning_rate": 1.7759447615577054e-05, + "loss": 0.0502, + "step": 7697 + }, + { + "epoch": 3.1305408702724686, + "grad_norm": 8.134306629839651, + "learning_rate": 1.775880845576092e-05, + "loss": 0.1903, + "step": 7698 + }, + { + "epoch": 3.1309475396502644, + "grad_norm": 9.867034623614654, + "learning_rate": 1.7758169216296708e-05, + "loss": 0.4399, + "step": 7699 + }, + { + "epoch": 3.13135420902806, + "grad_norm": 3.527870827332846, + "learning_rate": 1.775752989719099e-05, + "loss": 0.059, + "step": 7700 + }, + { + "epoch": 3.131760878405856, + "grad_norm": 9.043701449644901, + "learning_rate": 1.775689049845032e-05, + "loss": 0.5224, + "step": 7701 + }, + { + "epoch": 3.1321675477836517, + "grad_norm": 4.719349374921405, + "learning_rate": 1.7756251020081265e-05, + "loss": 0.1075, + "step": 7702 + }, + { + "epoch": 3.132574217161448, + "grad_norm": 8.45232921657138, + "learning_rate": 1.775561146209039e-05, + "loss": 0.2738, + "step": 7703 + }, + { + "epoch": 3.1329808865392437, + "grad_norm": 6.09523421792656, + "learning_rate": 1.775497182448426e-05, + "loss": 0.143, + "step": 7704 + }, + { + "epoch": 3.1333875559170394, + "grad_norm": 4.170886051317886, + "learning_rate": 1.775433210726944e-05, + "loss": 0.164, + "step": 7705 + }, + { + "epoch": 3.133794225294835, + "grad_norm": 0.25494587669348967, + "learning_rate": 1.7753692310452497e-05, + "loss": 0.0034, + "step": 7706 + }, + { + "epoch": 3.134200894672631, + "grad_norm": 6.171428181674743, + "learning_rate": 1.7753052434040004e-05, + "loss": 0.0955, + "step": 7707 + }, + { + "epoch": 3.134607564050427, + "grad_norm": 1.3942251239675347, + "learning_rate": 1.7752412478038523e-05, + "loss": 0.0215, + "step": 7708 + }, + { + "epoch": 3.135014233428223, + "grad_norm": 5.831261695598254, + "learning_rate": 1.7751772442454628e-05, + "loss": 0.0907, + "step": 7709 + }, + { + "epoch": 3.1354209028060187, + "grad_norm": 9.973534436489148, + "learning_rate": 1.7751132327294884e-05, + "loss": 0.1095, + "step": 7710 + }, + { + "epoch": 3.1358275721838145, + "grad_norm": 13.390610258796483, + "learning_rate": 1.7750492132565868e-05, + "loss": 0.4597, + "step": 7711 + }, + { + "epoch": 3.1362342415616102, + "grad_norm": 8.503910857462435, + "learning_rate": 1.7749851858274152e-05, + "loss": 0.5209, + "step": 7712 + }, + { + "epoch": 3.1366409109394064, + "grad_norm": 7.934777087525071, + "learning_rate": 1.7749211504426304e-05, + "loss": 0.1391, + "step": 7713 + }, + { + "epoch": 3.137047580317202, + "grad_norm": 7.114442727464963, + "learning_rate": 1.77485710710289e-05, + "loss": 0.1812, + "step": 7714 + }, + { + "epoch": 3.137454249694998, + "grad_norm": 7.476323198656305, + "learning_rate": 1.7747930558088513e-05, + "loss": 0.0756, + "step": 7715 + }, + { + "epoch": 3.1378609190727937, + "grad_norm": 11.849957546050442, + "learning_rate": 1.774728996561172e-05, + "loss": 0.502, + "step": 7716 + }, + { + "epoch": 3.1382675884505895, + "grad_norm": 0.02668792215317923, + "learning_rate": 1.7746649293605097e-05, + "loss": 0.0005, + "step": 7717 + }, + { + "epoch": 3.1386742578283857, + "grad_norm": 0.22893847204675827, + "learning_rate": 1.7746008542075222e-05, + "loss": 0.005, + "step": 7718 + }, + { + "epoch": 3.1390809272061815, + "grad_norm": 11.576010479239624, + "learning_rate": 1.7745367711028666e-05, + "loss": 0.5453, + "step": 7719 + }, + { + "epoch": 3.1394875965839772, + "grad_norm": 4.636555141201342, + "learning_rate": 1.7744726800472018e-05, + "loss": 0.0855, + "step": 7720 + }, + { + "epoch": 
3.139894265961773, + "grad_norm": 6.148856918202598, + "learning_rate": 1.7744085810411848e-05, + "loss": 0.081, + "step": 7721 + }, + { + "epoch": 3.1403009353395688, + "grad_norm": 0.06768273080302054, + "learning_rate": 1.774344474085474e-05, + "loss": 0.0009, + "step": 7722 + }, + { + "epoch": 3.140707604717365, + "grad_norm": 2.7599975546809685, + "learning_rate": 1.7742803591807274e-05, + "loss": 0.0475, + "step": 7723 + }, + { + "epoch": 3.1411142740951608, + "grad_norm": 0.0319241562117418, + "learning_rate": 1.7742162363276033e-05, + "loss": 0.0004, + "step": 7724 + }, + { + "epoch": 3.1415209434729565, + "grad_norm": 1.9559792811120853, + "learning_rate": 1.77415210552676e-05, + "loss": 0.0166, + "step": 7725 + }, + { + "epoch": 3.1419276128507523, + "grad_norm": 10.439891700658418, + "learning_rate": 1.7740879667788555e-05, + "loss": 0.36, + "step": 7726 + }, + { + "epoch": 3.142334282228548, + "grad_norm": 7.33502675127111, + "learning_rate": 1.7740238200845485e-05, + "loss": 0.2968, + "step": 7727 + }, + { + "epoch": 3.142740951606344, + "grad_norm": 12.621101970678785, + "learning_rate": 1.7739596654444975e-05, + "loss": 0.4039, + "step": 7728 + }, + { + "epoch": 3.14314762098414, + "grad_norm": 8.850127275088502, + "learning_rate": 1.773895502859361e-05, + "loss": 0.2333, + "step": 7729 + }, + { + "epoch": 3.143554290361936, + "grad_norm": 13.06943280331572, + "learning_rate": 1.7738313323297976e-05, + "loss": 0.4517, + "step": 7730 + }, + { + "epoch": 3.1439609597397316, + "grad_norm": 0.07758347057744393, + "learning_rate": 1.773767153856466e-05, + "loss": 0.0013, + "step": 7731 + }, + { + "epoch": 3.1443676291175273, + "grad_norm": 8.654506884191777, + "learning_rate": 1.7737029674400256e-05, + "loss": 0.3907, + "step": 7732 + }, + { + "epoch": 3.1447742984953235, + "grad_norm": 3.7248313258790846, + "learning_rate": 1.7736387730811343e-05, + "loss": 0.1755, + "step": 7733 + }, + { + "epoch": 3.1451809678731193, + "grad_norm": 1.3062613222580415, + "learning_rate": 1.773574570780452e-05, + "loss": 0.0316, + "step": 7734 + }, + { + "epoch": 3.145587637250915, + "grad_norm": 1.567648848976446, + "learning_rate": 1.773510360538637e-05, + "loss": 0.0253, + "step": 7735 + }, + { + "epoch": 3.145994306628711, + "grad_norm": 7.03678093128391, + "learning_rate": 1.7734461423563495e-05, + "loss": 0.3892, + "step": 7736 + }, + { + "epoch": 3.1464009760065066, + "grad_norm": 15.329253383090492, + "learning_rate": 1.7733819162342475e-05, + "loss": 0.7475, + "step": 7737 + }, + { + "epoch": 3.1468076453843024, + "grad_norm": 9.065247965056349, + "learning_rate": 1.773317682172991e-05, + "loss": 0.1268, + "step": 7738 + }, + { + "epoch": 3.1472143147620986, + "grad_norm": 3.5793292256816382, + "learning_rate": 1.7732534401732395e-05, + "loss": 0.0341, + "step": 7739 + }, + { + "epoch": 3.1476209841398943, + "grad_norm": 0.95350783621517, + "learning_rate": 1.7731891902356522e-05, + "loss": 0.0119, + "step": 7740 + }, + { + "epoch": 3.14802765351769, + "grad_norm": 6.7933578690981316, + "learning_rate": 1.7731249323608888e-05, + "loss": 0.0683, + "step": 7741 + }, + { + "epoch": 3.148434322895486, + "grad_norm": 7.385497763020459, + "learning_rate": 1.7730606665496086e-05, + "loss": 0.399, + "step": 7742 + }, + { + "epoch": 3.1488409922732816, + "grad_norm": 15.461778947105762, + "learning_rate": 1.7729963928024717e-05, + "loss": 0.3543, + "step": 7743 + }, + { + "epoch": 3.149247661651078, + "grad_norm": 6.466360838281579, + "learning_rate": 1.7729321111201376e-05, + "loss": 0.1791, + 
"step": 7744 + }, + { + "epoch": 3.1496543310288736, + "grad_norm": 0.5434014417195049, + "learning_rate": 1.7728678215032668e-05, + "loss": 0.0085, + "step": 7745 + }, + { + "epoch": 3.1500610004066694, + "grad_norm": 9.244815868367136, + "learning_rate": 1.7728035239525186e-05, + "loss": 0.6967, + "step": 7746 + }, + { + "epoch": 3.150467669784465, + "grad_norm": 2.3194268567309373, + "learning_rate": 1.7727392184685533e-05, + "loss": 0.0407, + "step": 7747 + }, + { + "epoch": 3.150874339162261, + "grad_norm": 14.473638313580167, + "learning_rate": 1.772674905052031e-05, + "loss": 0.7725, + "step": 7748 + }, + { + "epoch": 3.151281008540057, + "grad_norm": 1.3245229363454252, + "learning_rate": 1.772610583703612e-05, + "loss": 0.0234, + "step": 7749 + }, + { + "epoch": 3.151687677917853, + "grad_norm": 10.239653613618025, + "learning_rate": 1.772546254423956e-05, + "loss": 0.4131, + "step": 7750 + }, + { + "epoch": 3.1520943472956486, + "grad_norm": 2.162754595331667, + "learning_rate": 1.7724819172137246e-05, + "loss": 0.0455, + "step": 7751 + }, + { + "epoch": 3.1525010166734444, + "grad_norm": 5.603176301127207, + "learning_rate": 1.772417572073577e-05, + "loss": 0.1394, + "step": 7752 + }, + { + "epoch": 3.15290768605124, + "grad_norm": 10.17089965646785, + "learning_rate": 1.7723532190041747e-05, + "loss": 0.468, + "step": 7753 + }, + { + "epoch": 3.1533143554290364, + "grad_norm": 3.967996442928391, + "learning_rate": 1.772288858006178e-05, + "loss": 0.1087, + "step": 7754 + }, + { + "epoch": 3.153721024806832, + "grad_norm": 0.23384821107280304, + "learning_rate": 1.7722244890802472e-05, + "loss": 0.0032, + "step": 7755 + }, + { + "epoch": 3.154127694184628, + "grad_norm": 13.720253746574631, + "learning_rate": 1.7721601122270432e-05, + "loss": 0.3882, + "step": 7756 + }, + { + "epoch": 3.1545343635624237, + "grad_norm": 1.6848045073295324, + "learning_rate": 1.7720957274472273e-05, + "loss": 0.0235, + "step": 7757 + }, + { + "epoch": 3.1549410329402194, + "grad_norm": 11.930040785277434, + "learning_rate": 1.7720313347414598e-05, + "loss": 0.3811, + "step": 7758 + }, + { + "epoch": 3.1553477023180156, + "grad_norm": 5.5807117561218424, + "learning_rate": 1.7719669341104025e-05, + "loss": 0.3176, + "step": 7759 + }, + { + "epoch": 3.1557543716958114, + "grad_norm": 9.485620931757206, + "learning_rate": 1.771902525554716e-05, + "loss": 0.4463, + "step": 7760 + }, + { + "epoch": 3.156161041073607, + "grad_norm": 1.921921311009254, + "learning_rate": 1.7718381090750613e-05, + "loss": 0.0247, + "step": 7761 + }, + { + "epoch": 3.156567710451403, + "grad_norm": 3.9350143053341298, + "learning_rate": 1.7717736846721005e-05, + "loss": 0.0701, + "step": 7762 + }, + { + "epoch": 3.1569743798291987, + "grad_norm": 7.424206747758676, + "learning_rate": 1.771709252346494e-05, + "loss": 0.3805, + "step": 7763 + }, + { + "epoch": 3.157381049206995, + "grad_norm": 8.305555894215722, + "learning_rate": 1.7716448120989038e-05, + "loss": 0.2514, + "step": 7764 + }, + { + "epoch": 3.1577877185847907, + "grad_norm": 5.967620016924989, + "learning_rate": 1.771580363929991e-05, + "loss": 0.2178, + "step": 7765 + }, + { + "epoch": 3.1581943879625864, + "grad_norm": 5.961691389633372, + "learning_rate": 1.7715159078404176e-05, + "loss": 0.2735, + "step": 7766 + }, + { + "epoch": 3.158601057340382, + "grad_norm": 4.553548994769989, + "learning_rate": 1.7714514438308453e-05, + "loss": 0.0873, + "step": 7767 + }, + { + "epoch": 3.159007726718178, + "grad_norm": 0.7650547732615781, + "learning_rate": 
1.7713869719019355e-05, + "loss": 0.0107, + "step": 7768 + }, + { + "epoch": 3.1594143960959737, + "grad_norm": 7.302445428427107, + "learning_rate": 1.7713224920543502e-05, + "loss": 0.4144, + "step": 7769 + }, + { + "epoch": 3.15982106547377, + "grad_norm": 7.98367251185286, + "learning_rate": 1.7712580042887513e-05, + "loss": 0.2701, + "step": 7770 + }, + { + "epoch": 3.1602277348515657, + "grad_norm": 8.868435160943136, + "learning_rate": 1.7711935086058007e-05, + "loss": 0.2872, + "step": 7771 + }, + { + "epoch": 3.1606344042293615, + "grad_norm": 1.556429654646809, + "learning_rate": 1.7711290050061606e-05, + "loss": 0.0203, + "step": 7772 + }, + { + "epoch": 3.1610410736071572, + "grad_norm": 9.236758253258598, + "learning_rate": 1.7710644934904934e-05, + "loss": 0.2635, + "step": 7773 + }, + { + "epoch": 3.1614477429849535, + "grad_norm": 2.2034826691272644, + "learning_rate": 1.770999974059461e-05, + "loss": 0.0685, + "step": 7774 + }, + { + "epoch": 3.1618544123627492, + "grad_norm": 4.404270138455859, + "learning_rate": 1.770935446713726e-05, + "loss": 0.1258, + "step": 7775 + }, + { + "epoch": 3.162261081740545, + "grad_norm": 3.424974468303513, + "learning_rate": 1.7708709114539505e-05, + "loss": 0.0524, + "step": 7776 + }, + { + "epoch": 3.1626677511183408, + "grad_norm": 7.639094878731896, + "learning_rate": 1.770806368280797e-05, + "loss": 0.3463, + "step": 7777 + }, + { + "epoch": 3.1630744204961365, + "grad_norm": 0.166451500311736, + "learning_rate": 1.7707418171949286e-05, + "loss": 0.0029, + "step": 7778 + }, + { + "epoch": 3.1634810898739323, + "grad_norm": 4.809308957584649, + "learning_rate": 1.7706772581970068e-05, + "loss": 0.0633, + "step": 7779 + }, + { + "epoch": 3.1638877592517285, + "grad_norm": 1.542736276673625, + "learning_rate": 1.7706126912876956e-05, + "loss": 0.0203, + "step": 7780 + }, + { + "epoch": 3.1642944286295243, + "grad_norm": 12.565641399709985, + "learning_rate": 1.770548116467657e-05, + "loss": 0.5893, + "step": 7781 + }, + { + "epoch": 3.16470109800732, + "grad_norm": 2.2170933975515554, + "learning_rate": 1.7704835337375543e-05, + "loss": 0.042, + "step": 7782 + }, + { + "epoch": 3.165107767385116, + "grad_norm": 3.665751210181096, + "learning_rate": 1.7704189430980507e-05, + "loss": 0.1203, + "step": 7783 + }, + { + "epoch": 3.1655144367629116, + "grad_norm": 5.33562596570254, + "learning_rate": 1.7703543445498085e-05, + "loss": 0.0768, + "step": 7784 + }, + { + "epoch": 3.1659211061407078, + "grad_norm": 10.230852620925587, + "learning_rate": 1.7702897380934912e-05, + "loss": 0.8155, + "step": 7785 + }, + { + "epoch": 3.1663277755185035, + "grad_norm": 2.183419873782716, + "learning_rate": 1.7702251237297623e-05, + "loss": 0.027, + "step": 7786 + }, + { + "epoch": 3.1667344448962993, + "grad_norm": 14.9391143756433, + "learning_rate": 1.7701605014592847e-05, + "loss": 0.3336, + "step": 7787 + }, + { + "epoch": 3.167141114274095, + "grad_norm": 0.9958935267571802, + "learning_rate": 1.770095871282722e-05, + "loss": 0.0071, + "step": 7788 + }, + { + "epoch": 3.167547783651891, + "grad_norm": 0.2547248193789554, + "learning_rate": 1.7700312332007378e-05, + "loss": 0.0029, + "step": 7789 + }, + { + "epoch": 3.167954453029687, + "grad_norm": 6.956376100485248, + "learning_rate": 1.769966587213995e-05, + "loss": 0.1622, + "step": 7790 + }, + { + "epoch": 3.168361122407483, + "grad_norm": 0.05484481660539601, + "learning_rate": 1.769901933323158e-05, + "loss": 0.001, + "step": 7791 + }, + { + "epoch": 3.1687677917852786, + "grad_norm": 
7.794917411662236, + "learning_rate": 1.7698372715288902e-05, + "loss": 0.7733, + "step": 7792 + }, + { + "epoch": 3.1691744611630743, + "grad_norm": 7.119723638244151, + "learning_rate": 1.7697726018318554e-05, + "loss": 0.416, + "step": 7793 + }, + { + "epoch": 3.16958113054087, + "grad_norm": 35.58046854080017, + "learning_rate": 1.769707924232717e-05, + "loss": 0.5267, + "step": 7794 + }, + { + "epoch": 3.1699877999186663, + "grad_norm": 7.810950610495625, + "learning_rate": 1.7696432387321397e-05, + "loss": 0.1467, + "step": 7795 + }, + { + "epoch": 3.170394469296462, + "grad_norm": 7.629532739165649, + "learning_rate": 1.769578545330787e-05, + "loss": 0.217, + "step": 7796 + }, + { + "epoch": 3.170801138674258, + "grad_norm": 8.357954507531684, + "learning_rate": 1.7695138440293235e-05, + "loss": 0.3664, + "step": 7797 + }, + { + "epoch": 3.1712078080520536, + "grad_norm": 6.380319618337995, + "learning_rate": 1.769449134828413e-05, + "loss": 0.3219, + "step": 7798 + }, + { + "epoch": 3.1716144774298494, + "grad_norm": 6.322923419679469, + "learning_rate": 1.7693844177287198e-05, + "loss": 0.1275, + "step": 7799 + }, + { + "epoch": 3.1720211468076456, + "grad_norm": 10.618124620674335, + "learning_rate": 1.7693196927309083e-05, + "loss": 0.3934, + "step": 7800 + }, + { + "epoch": 3.1724278161854413, + "grad_norm": 10.926240722252432, + "learning_rate": 1.7692549598356428e-05, + "loss": 0.3455, + "step": 7801 + }, + { + "epoch": 3.172834485563237, + "grad_norm": 2.998639748898656, + "learning_rate": 1.7691902190435885e-05, + "loss": 0.054, + "step": 7802 + }, + { + "epoch": 3.173241154941033, + "grad_norm": 5.267909952851079, + "learning_rate": 1.769125470355409e-05, + "loss": 0.2463, + "step": 7803 + }, + { + "epoch": 3.1736478243188286, + "grad_norm": 14.458146924989302, + "learning_rate": 1.7690607137717697e-05, + "loss": 0.3826, + "step": 7804 + }, + { + "epoch": 3.174054493696625, + "grad_norm": 11.408164333361801, + "learning_rate": 1.768995949293335e-05, + "loss": 0.2223, + "step": 7805 + }, + { + "epoch": 3.1744611630744206, + "grad_norm": 25.961437357137797, + "learning_rate": 1.76893117692077e-05, + "loss": 0.2582, + "step": 7806 + }, + { + "epoch": 3.1748678324522164, + "grad_norm": 10.910661354359263, + "learning_rate": 1.7688663966547393e-05, + "loss": 0.5293, + "step": 7807 + }, + { + "epoch": 3.175274501830012, + "grad_norm": 4.006630215888631, + "learning_rate": 1.768801608495908e-05, + "loss": 0.0757, + "step": 7808 + }, + { + "epoch": 3.175681171207808, + "grad_norm": 7.906349975940723, + "learning_rate": 1.7687368124449416e-05, + "loss": 0.1828, + "step": 7809 + }, + { + "epoch": 3.1760878405856037, + "grad_norm": 2.743494705164825, + "learning_rate": 1.7686720085025048e-05, + "loss": 0.0351, + "step": 7810 + }, + { + "epoch": 3.1764945099634, + "grad_norm": 6.609633589904094, + "learning_rate": 1.7686071966692625e-05, + "loss": 0.4791, + "step": 7811 + }, + { + "epoch": 3.1769011793411956, + "grad_norm": 14.61634087385333, + "learning_rate": 1.768542376945881e-05, + "loss": 0.7114, + "step": 7812 + }, + { + "epoch": 3.1773078487189914, + "grad_norm": 1.0841985290384484, + "learning_rate": 1.7684775493330253e-05, + "loss": 0.019, + "step": 7813 + }, + { + "epoch": 3.177714518096787, + "grad_norm": 2.277210000639423, + "learning_rate": 1.7684127138313605e-05, + "loss": 0.0382, + "step": 7814 + }, + { + "epoch": 3.1781211874745834, + "grad_norm": 15.930341294521323, + "learning_rate": 1.7683478704415527e-05, + "loss": 0.9607, + "step": 7815 + }, + { + "epoch": 
3.178527856852379, + "grad_norm": 2.4746820010560513, + "learning_rate": 1.768283019164267e-05, + "loss": 0.0701, + "step": 7816 + }, + { + "epoch": 3.178934526230175, + "grad_norm": 43.24367214504069, + "learning_rate": 1.7682181600001697e-05, + "loss": 0.8945, + "step": 7817 + }, + { + "epoch": 3.1793411956079707, + "grad_norm": 7.896977139938383, + "learning_rate": 1.768153292949926e-05, + "loss": 0.3017, + "step": 7818 + }, + { + "epoch": 3.1797478649857664, + "grad_norm": 8.940220341452868, + "learning_rate": 1.7680884180142022e-05, + "loss": 0.4541, + "step": 7819 + }, + { + "epoch": 3.180154534363562, + "grad_norm": 11.145849715207627, + "learning_rate": 1.7680235351936646e-05, + "loss": 0.1787, + "step": 7820 + }, + { + "epoch": 3.1805612037413584, + "grad_norm": 8.912961528265503, + "learning_rate": 1.7679586444889786e-05, + "loss": 0.2781, + "step": 7821 + }, + { + "epoch": 3.180967873119154, + "grad_norm": 7.186984107399038, + "learning_rate": 1.767893745900811e-05, + "loss": 0.1784, + "step": 7822 + }, + { + "epoch": 3.18137454249695, + "grad_norm": 13.85254392707855, + "learning_rate": 1.7678288394298272e-05, + "loss": 0.5569, + "step": 7823 + }, + { + "epoch": 3.1817812118747457, + "grad_norm": 5.241857116321645, + "learning_rate": 1.767763925076694e-05, + "loss": 0.1468, + "step": 7824 + }, + { + "epoch": 3.1821878812525415, + "grad_norm": 12.071861856519085, + "learning_rate": 1.7676990028420777e-05, + "loss": 0.5814, + "step": 7825 + }, + { + "epoch": 3.1825945506303377, + "grad_norm": 1.509203143447494, + "learning_rate": 1.7676340727266447e-05, + "loss": 0.0297, + "step": 7826 + }, + { + "epoch": 3.1830012200081335, + "grad_norm": 9.985751708227783, + "learning_rate": 1.7675691347310616e-05, + "loss": 0.2302, + "step": 7827 + }, + { + "epoch": 3.1834078893859292, + "grad_norm": 6.34490094558801, + "learning_rate": 1.7675041888559952e-05, + "loss": 0.2076, + "step": 7828 + }, + { + "epoch": 3.183814558763725, + "grad_norm": 4.022301034682014, + "learning_rate": 1.7674392351021122e-05, + "loss": 0.1037, + "step": 7829 + }, + { + "epoch": 3.1842212281415208, + "grad_norm": 2.309468552704612, + "learning_rate": 1.7673742734700788e-05, + "loss": 0.0525, + "step": 7830 + }, + { + "epoch": 3.184627897519317, + "grad_norm": 10.870705021988924, + "learning_rate": 1.7673093039605624e-05, + "loss": 0.8088, + "step": 7831 + }, + { + "epoch": 3.1850345668971127, + "grad_norm": 10.803234023761696, + "learning_rate": 1.7672443265742297e-05, + "loss": 0.8586, + "step": 7832 + }, + { + "epoch": 3.1854412362749085, + "grad_norm": 2.1661760604458076, + "learning_rate": 1.767179341311748e-05, + "loss": 0.0584, + "step": 7833 + }, + { + "epoch": 3.1858479056527043, + "grad_norm": 3.731467637547126, + "learning_rate": 1.7671143481737844e-05, + "loss": 0.1015, + "step": 7834 + }, + { + "epoch": 3.1862545750305, + "grad_norm": 7.468692152252436, + "learning_rate": 1.7670493471610056e-05, + "loss": 0.1771, + "step": 7835 + }, + { + "epoch": 3.1866612444082962, + "grad_norm": 7.11531676423353, + "learning_rate": 1.7669843382740795e-05, + "loss": 0.544, + "step": 7836 + }, + { + "epoch": 3.187067913786092, + "grad_norm": 1.4724110744913386, + "learning_rate": 1.7669193215136727e-05, + "loss": 0.0259, + "step": 7837 + }, + { + "epoch": 3.1874745831638878, + "grad_norm": 3.9566793766779385, + "learning_rate": 1.7668542968804535e-05, + "loss": 0.1684, + "step": 7838 + }, + { + "epoch": 3.1878812525416835, + "grad_norm": 2.2391263963722094, + "learning_rate": 1.766789264375089e-05, + "loss": 0.0531, + 
"step": 7839 + }, + { + "epoch": 3.1882879219194793, + "grad_norm": 1.863949390065228, + "learning_rate": 1.7667242239982466e-05, + "loss": 0.0303, + "step": 7840 + }, + { + "epoch": 3.1886945912972755, + "grad_norm": 3.3840777845083316, + "learning_rate": 1.7666591757505946e-05, + "loss": 0.0668, + "step": 7841 + }, + { + "epoch": 3.1891012606750713, + "grad_norm": 1.5274287555197035, + "learning_rate": 1.7665941196328e-05, + "loss": 0.0285, + "step": 7842 + }, + { + "epoch": 3.189507930052867, + "grad_norm": 9.835238955722172, + "learning_rate": 1.7665290556455304e-05, + "loss": 0.0401, + "step": 7843 + }, + { + "epoch": 3.189914599430663, + "grad_norm": 16.721921229632752, + "learning_rate": 1.766463983789455e-05, + "loss": 0.4867, + "step": 7844 + }, + { + "epoch": 3.1903212688084586, + "grad_norm": 7.254766320869601, + "learning_rate": 1.7663989040652407e-05, + "loss": 0.1814, + "step": 7845 + }, + { + "epoch": 3.1907279381862548, + "grad_norm": 1.6172064515744868, + "learning_rate": 1.766333816473556e-05, + "loss": 0.0796, + "step": 7846 + }, + { + "epoch": 3.1911346075640505, + "grad_norm": 18.863859734558872, + "learning_rate": 1.766268721015069e-05, + "loss": 0.3129, + "step": 7847 + }, + { + "epoch": 3.1915412769418463, + "grad_norm": 15.327073399097886, + "learning_rate": 1.7662036176904477e-05, + "loss": 0.1699, + "step": 7848 + }, + { + "epoch": 3.191947946319642, + "grad_norm": 15.263295119816673, + "learning_rate": 1.766138506500361e-05, + "loss": 0.5606, + "step": 7849 + }, + { + "epoch": 3.192354615697438, + "grad_norm": 13.161662300341137, + "learning_rate": 1.766073387445477e-05, + "loss": 0.6473, + "step": 7850 + }, + { + "epoch": 3.1927612850752336, + "grad_norm": 10.048042191002088, + "learning_rate": 1.766008260526464e-05, + "loss": 0.3533, + "step": 7851 + }, + { + "epoch": 3.19316795445303, + "grad_norm": 4.1727628804127, + "learning_rate": 1.7659431257439907e-05, + "loss": 0.0836, + "step": 7852 + }, + { + "epoch": 3.1935746238308256, + "grad_norm": 8.535110577170864, + "learning_rate": 1.7658779830987253e-05, + "loss": 0.181, + "step": 7853 + }, + { + "epoch": 3.1939812932086213, + "grad_norm": 8.044984088058877, + "learning_rate": 1.7658128325913375e-05, + "loss": 0.2629, + "step": 7854 + }, + { + "epoch": 3.194387962586417, + "grad_norm": 9.20809404563598, + "learning_rate": 1.7657476742224955e-05, + "loss": 0.724, + "step": 7855 + }, + { + "epoch": 3.1947946319642133, + "grad_norm": 2.152285352746995, + "learning_rate": 1.765682507992868e-05, + "loss": 0.0308, + "step": 7856 + }, + { + "epoch": 3.195201301342009, + "grad_norm": 5.425412926623589, + "learning_rate": 1.765617333903124e-05, + "loss": 0.1582, + "step": 7857 + }, + { + "epoch": 3.195607970719805, + "grad_norm": 0.5125224243701051, + "learning_rate": 1.7655521519539332e-05, + "loss": 0.0099, + "step": 7858 + }, + { + "epoch": 3.1960146400976006, + "grad_norm": 11.829197650721657, + "learning_rate": 1.765486962145964e-05, + "loss": 0.5248, + "step": 7859 + }, + { + "epoch": 3.1964213094753964, + "grad_norm": 6.405360302565649, + "learning_rate": 1.7654217644798862e-05, + "loss": 0.3215, + "step": 7860 + }, + { + "epoch": 3.196827978853192, + "grad_norm": 33.368913394644395, + "learning_rate": 1.765356558956368e-05, + "loss": 2.239, + "step": 7861 + }, + { + "epoch": 3.1972346482309884, + "grad_norm": 5.763421070568327, + "learning_rate": 1.7652913455760803e-05, + "loss": 0.1352, + "step": 7862 + }, + { + "epoch": 3.197641317608784, + "grad_norm": 11.67564695567561, + "learning_rate": 
1.7652261243396917e-05, + "loss": 0.2666, + "step": 7863 + }, + { + "epoch": 3.19804798698658, + "grad_norm": 4.685556388627078, + "learning_rate": 1.7651608952478714e-05, + "loss": 0.0879, + "step": 7864 + }, + { + "epoch": 3.1984546563643756, + "grad_norm": 16.759839918298454, + "learning_rate": 1.7650956583012897e-05, + "loss": 0.6873, + "step": 7865 + }, + { + "epoch": 3.1988613257421714, + "grad_norm": 18.617125393507372, + "learning_rate": 1.765030413500616e-05, + "loss": 1.4025, + "step": 7866 + }, + { + "epoch": 3.1992679951199676, + "grad_norm": 12.268264491213856, + "learning_rate": 1.76496516084652e-05, + "loss": 0.7824, + "step": 7867 + }, + { + "epoch": 3.1996746644977634, + "grad_norm": 4.405344418647483, + "learning_rate": 1.7648999003396714e-05, + "loss": 0.1416, + "step": 7868 + }, + { + "epoch": 3.200081333875559, + "grad_norm": 4.343374949722041, + "learning_rate": 1.7648346319807407e-05, + "loss": 0.079, + "step": 7869 + }, + { + "epoch": 3.200488003253355, + "grad_norm": 14.91582417124843, + "learning_rate": 1.7647693557703973e-05, + "loss": 1.1703, + "step": 7870 + }, + { + "epoch": 3.2008946726311507, + "grad_norm": 5.680877666750139, + "learning_rate": 1.7647040717093118e-05, + "loss": 0.2668, + "step": 7871 + }, + { + "epoch": 3.201301342008947, + "grad_norm": 8.806562697682764, + "learning_rate": 1.764638779798154e-05, + "loss": 0.348, + "step": 7872 + }, + { + "epoch": 3.2017080113867427, + "grad_norm": 7.7708930345796885, + "learning_rate": 1.7645734800375942e-05, + "loss": 0.2578, + "step": 7873 + }, + { + "epoch": 3.2021146807645384, + "grad_norm": 10.363557067373716, + "learning_rate": 1.764508172428303e-05, + "loss": 0.7141, + "step": 7874 + }, + { + "epoch": 3.202521350142334, + "grad_norm": 10.128426683845866, + "learning_rate": 1.7644428569709505e-05, + "loss": 0.5633, + "step": 7875 + }, + { + "epoch": 3.20292801952013, + "grad_norm": 14.800132678357857, + "learning_rate": 1.7643775336662078e-05, + "loss": 0.621, + "step": 7876 + }, + { + "epoch": 3.203334688897926, + "grad_norm": 7.927673967731142, + "learning_rate": 1.7643122025147447e-05, + "loss": 0.2471, + "step": 7877 + }, + { + "epoch": 3.203741358275722, + "grad_norm": 7.66740072457049, + "learning_rate": 1.764246863517232e-05, + "loss": 0.1916, + "step": 7878 + }, + { + "epoch": 3.2041480276535177, + "grad_norm": 8.317583344489739, + "learning_rate": 1.7641815166743405e-05, + "loss": 0.3007, + "step": 7879 + }, + { + "epoch": 3.2045546970313135, + "grad_norm": 9.923752551700415, + "learning_rate": 1.7641161619867415e-05, + "loss": 0.6627, + "step": 7880 + }, + { + "epoch": 3.2049613664091092, + "grad_norm": 5.4098684913491315, + "learning_rate": 1.7640507994551054e-05, + "loss": 0.1268, + "step": 7881 + }, + { + "epoch": 3.2053680357869054, + "grad_norm": 6.476045428910049, + "learning_rate": 1.7639854290801032e-05, + "loss": 0.2697, + "step": 7882 + }, + { + "epoch": 3.205774705164701, + "grad_norm": 11.662564099642667, + "learning_rate": 1.7639200508624063e-05, + "loss": 0.4705, + "step": 7883 + }, + { + "epoch": 3.206181374542497, + "grad_norm": 15.125968663483688, + "learning_rate": 1.7638546648026852e-05, + "loss": 0.9624, + "step": 7884 + }, + { + "epoch": 3.2065880439202927, + "grad_norm": 10.879123051056581, + "learning_rate": 1.763789270901612e-05, + "loss": 0.42, + "step": 7885 + }, + { + "epoch": 3.2069947132980885, + "grad_norm": 33.324155063141255, + "learning_rate": 1.763723869159857e-05, + "loss": 0.2745, + "step": 7886 + }, + { + "epoch": 3.2074013826758847, + "grad_norm": 
6.841267672424812, + "learning_rate": 1.7636584595780925e-05, + "loss": 0.0575, + "step": 7887 + }, + { + "epoch": 3.2078080520536805, + "grad_norm": 11.504923375117045, + "learning_rate": 1.7635930421569897e-05, + "loss": 0.55, + "step": 7888 + }, + { + "epoch": 3.2082147214314762, + "grad_norm": 7.719518455412343, + "learning_rate": 1.76352761689722e-05, + "loss": 0.4618, + "step": 7889 + }, + { + "epoch": 3.208621390809272, + "grad_norm": 10.921774486835087, + "learning_rate": 1.7634621837994546e-05, + "loss": 0.1425, + "step": 7890 + }, + { + "epoch": 3.2090280601870678, + "grad_norm": 1.3620559563694465, + "learning_rate": 1.763396742864366e-05, + "loss": 0.0301, + "step": 7891 + }, + { + "epoch": 3.2094347295648635, + "grad_norm": 13.34731139320025, + "learning_rate": 1.7633312940926254e-05, + "loss": 0.45, + "step": 7892 + }, + { + "epoch": 3.2098413989426597, + "grad_norm": 1.091546513824041, + "learning_rate": 1.763265837484905e-05, + "loss": 0.0163, + "step": 7893 + }, + { + "epoch": 3.2102480683204555, + "grad_norm": 14.806465388414514, + "learning_rate": 1.763200373041877e-05, + "loss": 0.1825, + "step": 7894 + }, + { + "epoch": 3.2106547376982513, + "grad_norm": 18.028076178487424, + "learning_rate": 1.7631349007642123e-05, + "loss": 1.054, + "step": 7895 + }, + { + "epoch": 3.211061407076047, + "grad_norm": 0.5449980662938789, + "learning_rate": 1.7630694206525844e-05, + "loss": 0.0166, + "step": 7896 + }, + { + "epoch": 3.2114680764538432, + "grad_norm": 10.577731982650354, + "learning_rate": 1.7630039327076645e-05, + "loss": 0.2271, + "step": 7897 + }, + { + "epoch": 3.211874745831639, + "grad_norm": 10.251833254032864, + "learning_rate": 1.7629384369301253e-05, + "loss": 0.2988, + "step": 7898 + }, + { + "epoch": 3.2122814152094348, + "grad_norm": 5.545733577019673, + "learning_rate": 1.762872933320639e-05, + "loss": 0.159, + "step": 7899 + }, + { + "epoch": 3.2126880845872305, + "grad_norm": 15.11917419427403, + "learning_rate": 1.7628074218798782e-05, + "loss": 0.247, + "step": 7900 + }, + { + "epoch": 3.2130947539650263, + "grad_norm": 10.33270540498833, + "learning_rate": 1.7627419026085154e-05, + "loss": 0.2952, + "step": 7901 + }, + { + "epoch": 3.213501423342822, + "grad_norm": 19.576138422249592, + "learning_rate": 1.7626763755072228e-05, + "loss": 0.8588, + "step": 7902 + }, + { + "epoch": 3.2139080927206183, + "grad_norm": 10.527461399843512, + "learning_rate": 1.7626108405766738e-05, + "loss": 0.1873, + "step": 7903 + }, + { + "epoch": 3.214314762098414, + "grad_norm": 4.023383189812493, + "learning_rate": 1.7625452978175404e-05, + "loss": 0.0612, + "step": 7904 + }, + { + "epoch": 3.21472143147621, + "grad_norm": 6.840258475327582, + "learning_rate": 1.7624797472304958e-05, + "loss": 0.2137, + "step": 7905 + }, + { + "epoch": 3.2151281008540056, + "grad_norm": 3.439142839228691, + "learning_rate": 1.7624141888162128e-05, + "loss": 0.0841, + "step": 7906 + }, + { + "epoch": 3.2155347702318013, + "grad_norm": 9.764730805275583, + "learning_rate": 1.7623486225753647e-05, + "loss": 0.1251, + "step": 7907 + }, + { + "epoch": 3.2159414396095976, + "grad_norm": 11.51082149624577, + "learning_rate": 1.7622830485086237e-05, + "loss": 0.5352, + "step": 7908 + }, + { + "epoch": 3.2163481089873933, + "grad_norm": 1.7188785889597236, + "learning_rate": 1.762217466616664e-05, + "loss": 0.0565, + "step": 7909 + }, + { + "epoch": 3.216754778365189, + "grad_norm": 10.841798119879067, + "learning_rate": 1.7621518769001582e-05, + "loss": 0.5103, + "step": 7910 + }, + { + "epoch": 
3.217161447742985, + "grad_norm": 3.4432745576615535, + "learning_rate": 1.7620862793597798e-05, + "loss": 0.0715, + "step": 7911 + }, + { + "epoch": 3.2175681171207806, + "grad_norm": 6.039554472660082, + "learning_rate": 1.7620206739962027e-05, + "loss": 0.1918, + "step": 7912 + }, + { + "epoch": 3.217974786498577, + "grad_norm": 1.851662245707939, + "learning_rate": 1.7619550608100992e-05, + "loss": 0.0322, + "step": 7913 + }, + { + "epoch": 3.2183814558763726, + "grad_norm": 0.5271352781704375, + "learning_rate": 1.7618894398021438e-05, + "loss": 0.009, + "step": 7914 + }, + { + "epoch": 3.2187881252541684, + "grad_norm": 8.75253936812513, + "learning_rate": 1.76182381097301e-05, + "loss": 0.3601, + "step": 7915 + }, + { + "epoch": 3.219194794631964, + "grad_norm": 14.393020997343164, + "learning_rate": 1.761758174323371e-05, + "loss": 0.6763, + "step": 7916 + }, + { + "epoch": 3.21960146400976, + "grad_norm": 6.505243340079402, + "learning_rate": 1.7616925298539014e-05, + "loss": 0.3797, + "step": 7917 + }, + { + "epoch": 3.220008133387556, + "grad_norm": 9.589306190918082, + "learning_rate": 1.7616268775652744e-05, + "loss": 0.5277, + "step": 7918 + }, + { + "epoch": 3.220414802765352, + "grad_norm": 12.646191020420513, + "learning_rate": 1.761561217458164e-05, + "loss": 0.728, + "step": 7919 + }, + { + "epoch": 3.2208214721431476, + "grad_norm": 10.101192378892168, + "learning_rate": 1.7614955495332447e-05, + "loss": 0.2608, + "step": 7920 + }, + { + "epoch": 3.2212281415209434, + "grad_norm": 10.400539738656231, + "learning_rate": 1.7614298737911905e-05, + "loss": 0.5693, + "step": 7921 + }, + { + "epoch": 3.221634810898739, + "grad_norm": 9.152941912746744, + "learning_rate": 1.761364190232675e-05, + "loss": 0.3463, + "step": 7922 + }, + { + "epoch": 3.2220414802765354, + "grad_norm": 4.551442804394795, + "learning_rate": 1.761298498858373e-05, + "loss": 0.1641, + "step": 7923 + }, + { + "epoch": 3.222448149654331, + "grad_norm": 2.2123050257993513, + "learning_rate": 1.7612327996689586e-05, + "loss": 0.0878, + "step": 7924 + }, + { + "epoch": 3.222854819032127, + "grad_norm": 7.498583327934977, + "learning_rate": 1.7611670926651067e-05, + "loss": 0.4286, + "step": 7925 + }, + { + "epoch": 3.2232614884099227, + "grad_norm": 10.935237444824327, + "learning_rate": 1.7611013778474914e-05, + "loss": 0.3699, + "step": 7926 + }, + { + "epoch": 3.2236681577877184, + "grad_norm": 4.026941677796932, + "learning_rate": 1.7610356552167873e-05, + "loss": 0.0758, + "step": 7927 + }, + { + "epoch": 3.2240748271655146, + "grad_norm": 10.256947464487775, + "learning_rate": 1.7609699247736696e-05, + "loss": 0.5458, + "step": 7928 + }, + { + "epoch": 3.2244814965433104, + "grad_norm": 4.539599125474001, + "learning_rate": 1.7609041865188122e-05, + "loss": 0.1896, + "step": 7929 + }, + { + "epoch": 3.224888165921106, + "grad_norm": 6.029760823778765, + "learning_rate": 1.7608384404528904e-05, + "loss": 0.2166, + "step": 7930 + }, + { + "epoch": 3.225294835298902, + "grad_norm": 0.7226594227422356, + "learning_rate": 1.760772686576579e-05, + "loss": 0.0098, + "step": 7931 + }, + { + "epoch": 3.2257015046766977, + "grad_norm": 8.737687189826726, + "learning_rate": 1.7607069248905536e-05, + "loss": 0.2113, + "step": 7932 + }, + { + "epoch": 3.226108174054494, + "grad_norm": 14.559472303407961, + "learning_rate": 1.7606411553954884e-05, + "loss": 0.921, + "step": 7933 + }, + { + "epoch": 3.2265148434322897, + "grad_norm": 9.958858544650191, + "learning_rate": 1.760575378092059e-05, + "loss": 0.4551, + 
"step": 7934 + }, + { + "epoch": 3.2269215128100854, + "grad_norm": 10.658689322804102, + "learning_rate": 1.7605095929809403e-05, + "loss": 0.5518, + "step": 7935 + }, + { + "epoch": 3.227328182187881, + "grad_norm": 3.4466224437056647, + "learning_rate": 1.7604438000628083e-05, + "loss": 0.0676, + "step": 7936 + }, + { + "epoch": 3.227734851565677, + "grad_norm": 3.911488102046245, + "learning_rate": 1.7603779993383376e-05, + "loss": 0.0774, + "step": 7937 + }, + { + "epoch": 3.228141520943473, + "grad_norm": 1.5537761676028687, + "learning_rate": 1.760312190808204e-05, + "loss": 0.0253, + "step": 7938 + }, + { + "epoch": 3.228548190321269, + "grad_norm": 4.85563400449403, + "learning_rate": 1.7602463744730834e-05, + "loss": 0.1303, + "step": 7939 + }, + { + "epoch": 3.2289548596990647, + "grad_norm": 1.8990529326897942, + "learning_rate": 1.760180550333651e-05, + "loss": 0.0394, + "step": 7940 + }, + { + "epoch": 3.2293615290768605, + "grad_norm": 9.583841353498723, + "learning_rate": 1.7601147183905823e-05, + "loss": 0.1578, + "step": 7941 + }, + { + "epoch": 3.2297681984546562, + "grad_norm": 10.317152453665893, + "learning_rate": 1.7600488786445544e-05, + "loss": 0.2197, + "step": 7942 + }, + { + "epoch": 3.230174867832452, + "grad_norm": 14.388327687401842, + "learning_rate": 1.7599830310962413e-05, + "loss": 0.6223, + "step": 7943 + }, + { + "epoch": 3.230581537210248, + "grad_norm": 0.9060470768923645, + "learning_rate": 1.7599171757463204e-05, + "loss": 0.0228, + "step": 7944 + }, + { + "epoch": 3.230988206588044, + "grad_norm": 11.126343422313104, + "learning_rate": 1.7598513125954673e-05, + "loss": 0.4242, + "step": 7945 + }, + { + "epoch": 3.2313948759658397, + "grad_norm": 1.688271094856376, + "learning_rate": 1.7597854416443576e-05, + "loss": 0.0247, + "step": 7946 + }, + { + "epoch": 3.2318015453436355, + "grad_norm": 8.792203980411434, + "learning_rate": 1.7597195628936683e-05, + "loss": 0.3285, + "step": 7947 + }, + { + "epoch": 3.2322082147214313, + "grad_norm": 3.0245599669554197, + "learning_rate": 1.759653676344075e-05, + "loss": 0.0375, + "step": 7948 + }, + { + "epoch": 3.2326148840992275, + "grad_norm": 7.628819211584428, + "learning_rate": 1.759587781996255e-05, + "loss": 0.3349, + "step": 7949 + }, + { + "epoch": 3.2330215534770232, + "grad_norm": 8.872172634550845, + "learning_rate": 1.7595218798508835e-05, + "loss": 0.4005, + "step": 7950 + }, + { + "epoch": 3.233428222854819, + "grad_norm": 7.9750849264642145, + "learning_rate": 1.759455969908638e-05, + "loss": 0.5053, + "step": 7951 + }, + { + "epoch": 3.2338348922326148, + "grad_norm": 16.550957946653437, + "learning_rate": 1.7593900521701945e-05, + "loss": 0.732, + "step": 7952 + }, + { + "epoch": 3.2342415616104105, + "grad_norm": 5.185346594048047, + "learning_rate": 1.7593241266362302e-05, + "loss": 0.1875, + "step": 7953 + }, + { + "epoch": 3.2346482309882068, + "grad_norm": 4.986326228749541, + "learning_rate": 1.7592581933074215e-05, + "loss": 0.1937, + "step": 7954 + }, + { + "epoch": 3.2350549003660025, + "grad_norm": 6.775842470528941, + "learning_rate": 1.7591922521844453e-05, + "loss": 0.1363, + "step": 7955 + }, + { + "epoch": 3.2354615697437983, + "grad_norm": 4.5735478018545805, + "learning_rate": 1.7591263032679782e-05, + "loss": 0.0846, + "step": 7956 + }, + { + "epoch": 3.235868239121594, + "grad_norm": 8.026162183093252, + "learning_rate": 1.759060346558698e-05, + "loss": 0.2155, + "step": 7957 + }, + { + "epoch": 3.23627490849939, + "grad_norm": 12.697514779029142, + "learning_rate": 
1.758994382057281e-05, + "loss": 0.329, + "step": 7958 + }, + { + "epoch": 3.236681577877186, + "grad_norm": 0.19145572169598749, + "learning_rate": 1.7589284097644048e-05, + "loss": 0.0031, + "step": 7959 + }, + { + "epoch": 3.237088247254982, + "grad_norm": 0.4197071667293955, + "learning_rate": 1.7588624296807465e-05, + "loss": 0.0073, + "step": 7960 + }, + { + "epoch": 3.2374949166327776, + "grad_norm": 7.267321327763089, + "learning_rate": 1.7587964418069835e-05, + "loss": 0.3122, + "step": 7961 + }, + { + "epoch": 3.2379015860105733, + "grad_norm": 14.28643101706856, + "learning_rate": 1.758730446143793e-05, + "loss": 1.1358, + "step": 7962 + }, + { + "epoch": 3.238308255388369, + "grad_norm": 14.107514981521488, + "learning_rate": 1.758664442691853e-05, + "loss": 0.8681, + "step": 7963 + }, + { + "epoch": 3.2387149247661653, + "grad_norm": 5.304301231137767, + "learning_rate": 1.7585984314518404e-05, + "loss": 0.1071, + "step": 7964 + }, + { + "epoch": 3.239121594143961, + "grad_norm": 9.19692185923565, + "learning_rate": 1.7585324124244326e-05, + "loss": 0.2287, + "step": 7965 + }, + { + "epoch": 3.239528263521757, + "grad_norm": 5.201877878234525, + "learning_rate": 1.7584663856103084e-05, + "loss": 0.3043, + "step": 7966 + }, + { + "epoch": 3.2399349328995526, + "grad_norm": 13.611418896113943, + "learning_rate": 1.758400351010145e-05, + "loss": 0.8154, + "step": 7967 + }, + { + "epoch": 3.2403416022773484, + "grad_norm": 5.285228350731516, + "learning_rate": 1.75833430862462e-05, + "loss": 0.0927, + "step": 7968 + }, + { + "epoch": 3.2407482716551446, + "grad_norm": 5.252794955118374, + "learning_rate": 1.7582682584544123e-05, + "loss": 0.0894, + "step": 7969 + }, + { + "epoch": 3.2411549410329403, + "grad_norm": 7.900320858943522, + "learning_rate": 1.7582022005001986e-05, + "loss": 0.2167, + "step": 7970 + }, + { + "epoch": 3.241561610410736, + "grad_norm": 12.639664103725892, + "learning_rate": 1.758136134762658e-05, + "loss": 0.603, + "step": 7971 + }, + { + "epoch": 3.241968279788532, + "grad_norm": 3.617543221880537, + "learning_rate": 1.7580700612424685e-05, + "loss": 0.1139, + "step": 7972 + }, + { + "epoch": 3.2423749491663276, + "grad_norm": 1.8249403040699481, + "learning_rate": 1.7580039799403083e-05, + "loss": 0.0349, + "step": 7973 + }, + { + "epoch": 3.242781618544124, + "grad_norm": 4.975497323052892, + "learning_rate": 1.7579378908568556e-05, + "loss": 0.1356, + "step": 7974 + }, + { + "epoch": 3.2431882879219196, + "grad_norm": 2.5720411373402814, + "learning_rate": 1.7578717939927892e-05, + "loss": 0.0384, + "step": 7975 + }, + { + "epoch": 3.2435949572997154, + "grad_norm": 12.514915607531147, + "learning_rate": 1.7578056893487873e-05, + "loss": 0.5768, + "step": 7976 + }, + { + "epoch": 3.244001626677511, + "grad_norm": 17.470506274264167, + "learning_rate": 1.7577395769255287e-05, + "loss": 0.6409, + "step": 7977 + }, + { + "epoch": 3.244408296055307, + "grad_norm": 14.776205618518759, + "learning_rate": 1.7576734567236915e-05, + "loss": 0.3663, + "step": 7978 + }, + { + "epoch": 3.244814965433103, + "grad_norm": 5.9130428492191305, + "learning_rate": 1.7576073287439555e-05, + "loss": 0.2071, + "step": 7979 + }, + { + "epoch": 3.245221634810899, + "grad_norm": 12.015144669353454, + "learning_rate": 1.757541192986999e-05, + "loss": 0.5765, + "step": 7980 + }, + { + "epoch": 3.2456283041886946, + "grad_norm": 6.192438668897148, + "learning_rate": 1.7574750494535002e-05, + "loss": 0.1973, + "step": 7981 + }, + { + "epoch": 3.2460349735664904, + "grad_norm": 
4.805659071695488, + "learning_rate": 1.7574088981441395e-05, + "loss": 0.1609, + "step": 7982 + }, + { + "epoch": 3.246441642944286, + "grad_norm": 6.689128881239593, + "learning_rate": 1.7573427390595946e-05, + "loss": 0.41, + "step": 7983 + }, + { + "epoch": 3.246848312322082, + "grad_norm": 8.646283713816729, + "learning_rate": 1.757276572200546e-05, + "loss": 0.5006, + "step": 7984 + }, + { + "epoch": 3.247254981699878, + "grad_norm": 1.005645349507287, + "learning_rate": 1.757210397567672e-05, + "loss": 0.0172, + "step": 7985 + }, + { + "epoch": 3.247661651077674, + "grad_norm": 9.943834886512265, + "learning_rate": 1.757144215161652e-05, + "loss": 0.4379, + "step": 7986 + }, + { + "epoch": 3.2480683204554697, + "grad_norm": 12.363848131751855, + "learning_rate": 1.757078024983166e-05, + "loss": 0.5241, + "step": 7987 + }, + { + "epoch": 3.2484749898332654, + "grad_norm": 1.970161626763589, + "learning_rate": 1.7570118270328924e-05, + "loss": 0.0782, + "step": 7988 + }, + { + "epoch": 3.248881659211061, + "grad_norm": 11.500354157490381, + "learning_rate": 1.7569456213115117e-05, + "loss": 0.4196, + "step": 7989 + }, + { + "epoch": 3.2492883285888574, + "grad_norm": 14.704455076015241, + "learning_rate": 1.7568794078197035e-05, + "loss": 0.5453, + "step": 7990 + }, + { + "epoch": 3.249694997966653, + "grad_norm": 7.536738731277444, + "learning_rate": 1.756813186558147e-05, + "loss": 0.6905, + "step": 7991 + }, + { + "epoch": 3.250101667344449, + "grad_norm": 6.261440411749393, + "learning_rate": 1.7567469575275222e-05, + "loss": 0.4406, + "step": 7992 + }, + { + "epoch": 3.2505083367222447, + "grad_norm": 24.183264888017916, + "learning_rate": 1.756680720728509e-05, + "loss": 1.376, + "step": 7993 + }, + { + "epoch": 3.2509150061000405, + "grad_norm": 5.108257712593172, + "learning_rate": 1.7566144761617875e-05, + "loss": 0.3592, + "step": 7994 + }, + { + "epoch": 3.2513216754778367, + "grad_norm": 0.16780152056492179, + "learning_rate": 1.7565482238280377e-05, + "loss": 0.0042, + "step": 7995 + }, + { + "epoch": 3.2517283448556324, + "grad_norm": 1.1893874419692596, + "learning_rate": 1.7564819637279394e-05, + "loss": 0.0169, + "step": 7996 + }, + { + "epoch": 3.252135014233428, + "grad_norm": 17.573977189567085, + "learning_rate": 1.756415695862173e-05, + "loss": 0.3681, + "step": 7997 + }, + { + "epoch": 3.252541683611224, + "grad_norm": 13.370872966234632, + "learning_rate": 1.756349420231419e-05, + "loss": 0.6352, + "step": 7998 + }, + { + "epoch": 3.2529483529890197, + "grad_norm": 9.040093198525344, + "learning_rate": 1.7562831368363575e-05, + "loss": 0.2888, + "step": 7999 + }, + { + "epoch": 3.253355022366816, + "grad_norm": 6.381028775993469, + "learning_rate": 1.7562168456776688e-05, + "loss": 0.2937, + "step": 8000 + }, + { + "epoch": 3.2537616917446117, + "grad_norm": 3.8718501078296765, + "learning_rate": 1.756150546756034e-05, + "loss": 0.0537, + "step": 8001 + }, + { + "epoch": 3.2541683611224075, + "grad_norm": 9.06089994835666, + "learning_rate": 1.756084240072133e-05, + "loss": 0.3692, + "step": 8002 + }, + { + "epoch": 3.2545750305002032, + "grad_norm": 6.941136089440679, + "learning_rate": 1.7560179256266465e-05, + "loss": 0.1633, + "step": 8003 + }, + { + "epoch": 3.254981699877999, + "grad_norm": 1.648645862074334, + "learning_rate": 1.7559516034202557e-05, + "loss": 0.0232, + "step": 8004 + }, + { + "epoch": 3.2553883692557952, + "grad_norm": 8.440126830597432, + "learning_rate": 1.7558852734536412e-05, + "loss": 0.3688, + "step": 8005 + }, + { + "epoch": 
3.255795038633591, + "grad_norm": 5.319034077663899, + "learning_rate": 1.7558189357274842e-05, + "loss": 0.1769, + "step": 8006 + }, + { + "epoch": 3.2562017080113868, + "grad_norm": 16.210830388577048, + "learning_rate": 1.755752590242465e-05, + "loss": 1.2487, + "step": 8007 + }, + { + "epoch": 3.2566083773891825, + "grad_norm": 0.3833802842929907, + "learning_rate": 1.7556862369992655e-05, + "loss": 0.006, + "step": 8008 + }, + { + "epoch": 3.2570150467669783, + "grad_norm": 16.329823775193223, + "learning_rate": 1.7556198759985663e-05, + "loss": 0.8813, + "step": 8009 + }, + { + "epoch": 3.2574217161447745, + "grad_norm": 2.297771671534045, + "learning_rate": 1.7555535072410486e-05, + "loss": 0.0859, + "step": 8010 + }, + { + "epoch": 3.2578283855225703, + "grad_norm": 12.606244724435227, + "learning_rate": 1.755487130727394e-05, + "loss": 0.1756, + "step": 8011 + }, + { + "epoch": 3.258235054900366, + "grad_norm": 8.941981383254797, + "learning_rate": 1.7554207464582838e-05, + "loss": 0.4374, + "step": 8012 + }, + { + "epoch": 3.258641724278162, + "grad_norm": 7.051445252121881, + "learning_rate": 1.7553543544343994e-05, + "loss": 0.167, + "step": 8013 + }, + { + "epoch": 3.2590483936559576, + "grad_norm": 9.820023759665196, + "learning_rate": 1.7552879546564227e-05, + "loss": 0.3226, + "step": 8014 + }, + { + "epoch": 3.2594550630337533, + "grad_norm": 4.865064641699658, + "learning_rate": 1.7552215471250343e-05, + "loss": 0.2092, + "step": 8015 + }, + { + "epoch": 3.2598617324115495, + "grad_norm": 8.09264579673211, + "learning_rate": 1.755155131840917e-05, + "loss": 0.4323, + "step": 8016 + }, + { + "epoch": 3.2602684017893453, + "grad_norm": 4.553125635038968, + "learning_rate": 1.7550887088047526e-05, + "loss": 0.0793, + "step": 8017 + }, + { + "epoch": 3.260675071167141, + "grad_norm": 7.1691049670530544, + "learning_rate": 1.7550222780172222e-05, + "loss": 0.1912, + "step": 8018 + }, + { + "epoch": 3.261081740544937, + "grad_norm": 9.06870005984078, + "learning_rate": 1.754955839479008e-05, + "loss": 0.3794, + "step": 8019 + }, + { + "epoch": 3.261488409922733, + "grad_norm": 12.455219005842578, + "learning_rate": 1.7548893931907924e-05, + "loss": 0.828, + "step": 8020 + }, + { + "epoch": 3.261895079300529, + "grad_norm": 2.7202662314716988, + "learning_rate": 1.7548229391532572e-05, + "loss": 0.0558, + "step": 8021 + }, + { + "epoch": 3.2623017486783246, + "grad_norm": 11.941020983616443, + "learning_rate": 1.7547564773670847e-05, + "loss": 0.4588, + "step": 8022 + }, + { + "epoch": 3.2627084180561203, + "grad_norm": 7.587824503906329, + "learning_rate": 1.7546900078329568e-05, + "loss": 0.1804, + "step": 8023 + }, + { + "epoch": 3.263115087433916, + "grad_norm": 0.6259423840683389, + "learning_rate": 1.7546235305515566e-05, + "loss": 0.0093, + "step": 8024 + }, + { + "epoch": 3.263521756811712, + "grad_norm": 4.982480648758449, + "learning_rate": 1.7545570455235657e-05, + "loss": 0.139, + "step": 8025 + }, + { + "epoch": 3.263928426189508, + "grad_norm": 0.8922000301268744, + "learning_rate": 1.754490552749667e-05, + "loss": 0.0122, + "step": 8026 + }, + { + "epoch": 3.264335095567304, + "grad_norm": 8.136578808765503, + "learning_rate": 1.754424052230543e-05, + "loss": 0.3204, + "step": 8027 + }, + { + "epoch": 3.2647417649450996, + "grad_norm": 16.083458884108225, + "learning_rate": 1.7543575439668766e-05, + "loss": 1.5349, + "step": 8028 + }, + { + "epoch": 3.2651484343228954, + "grad_norm": 3.19596420688754, + "learning_rate": 1.7542910279593502e-05, + "loss": 0.0829, + 
"step": 8029 + }, + { + "epoch": 3.2655551037006916, + "grad_norm": 9.282229127236402, + "learning_rate": 1.754224504208647e-05, + "loss": 0.3248, + "step": 8030 + }, + { + "epoch": 3.2659617730784873, + "grad_norm": 1.5137871304449682, + "learning_rate": 1.7541579727154493e-05, + "loss": 0.0733, + "step": 8031 + }, + { + "epoch": 3.266368442456283, + "grad_norm": 11.495722804078113, + "learning_rate": 1.7540914334804407e-05, + "loss": 0.4722, + "step": 8032 + }, + { + "epoch": 3.266775111834079, + "grad_norm": 4.765596391052611, + "learning_rate": 1.7540248865043043e-05, + "loss": 0.1419, + "step": 8033 + }, + { + "epoch": 3.2671817812118746, + "grad_norm": 3.1959398841533577, + "learning_rate": 1.7539583317877226e-05, + "loss": 0.041, + "step": 8034 + }, + { + "epoch": 3.2675884505896704, + "grad_norm": 5.7359252713555255, + "learning_rate": 1.753891769331379e-05, + "loss": 0.1543, + "step": 8035 + }, + { + "epoch": 3.2679951199674666, + "grad_norm": 8.19035616680572, + "learning_rate": 1.7538251991359573e-05, + "loss": 0.2526, + "step": 8036 + }, + { + "epoch": 3.2684017893452624, + "grad_norm": 0.42656965518148177, + "learning_rate": 1.7537586212021405e-05, + "loss": 0.0048, + "step": 8037 + }, + { + "epoch": 3.268808458723058, + "grad_norm": 1.8627723255238546, + "learning_rate": 1.753692035530612e-05, + "loss": 0.042, + "step": 8038 + }, + { + "epoch": 3.269215128100854, + "grad_norm": 5.74212185680649, + "learning_rate": 1.7536254421220555e-05, + "loss": 0.2673, + "step": 8039 + }, + { + "epoch": 3.2696217974786497, + "grad_norm": 7.33540001888061, + "learning_rate": 1.7535588409771544e-05, + "loss": 0.2356, + "step": 8040 + }, + { + "epoch": 3.270028466856446, + "grad_norm": 2.834734503860175, + "learning_rate": 1.7534922320965924e-05, + "loss": 0.1048, + "step": 8041 + }, + { + "epoch": 3.2704351362342416, + "grad_norm": 5.913573700375109, + "learning_rate": 1.7534256154810537e-05, + "loss": 0.1485, + "step": 8042 + }, + { + "epoch": 3.2708418056120374, + "grad_norm": 9.899300757872956, + "learning_rate": 1.7533589911312215e-05, + "loss": 0.2303, + "step": 8043 + }, + { + "epoch": 3.271248474989833, + "grad_norm": 7.633068579388836, + "learning_rate": 1.7532923590477807e-05, + "loss": 0.1631, + "step": 8044 + }, + { + "epoch": 3.271655144367629, + "grad_norm": 7.779400742321317, + "learning_rate": 1.7532257192314145e-05, + "loss": 0.2465, + "step": 8045 + }, + { + "epoch": 3.272061813745425, + "grad_norm": 1.2757236097507956, + "learning_rate": 1.7531590716828072e-05, + "loss": 0.0262, + "step": 8046 + }, + { + "epoch": 3.272468483123221, + "grad_norm": 4.465346024169604, + "learning_rate": 1.7530924164026427e-05, + "loss": 0.0925, + "step": 8047 + }, + { + "epoch": 3.2728751525010167, + "grad_norm": 9.222525975290015, + "learning_rate": 1.7530257533916057e-05, + "loss": 0.3063, + "step": 8048 + }, + { + "epoch": 3.2732818218788124, + "grad_norm": 3.0356039925344738, + "learning_rate": 1.75295908265038e-05, + "loss": 0.161, + "step": 8049 + }, + { + "epoch": 3.273688491256608, + "grad_norm": 5.15342208391285, + "learning_rate": 1.752892404179651e-05, + "loss": 0.1122, + "step": 8050 + }, + { + "epoch": 3.2740951606344044, + "grad_norm": 8.371483503458842, + "learning_rate": 1.7528257179801024e-05, + "loss": 0.478, + "step": 8051 + }, + { + "epoch": 3.2745018300122, + "grad_norm": 3.390091849144437, + "learning_rate": 1.7527590240524185e-05, + "loss": 0.0853, + "step": 8052 + }, + { + "epoch": 3.274908499389996, + "grad_norm": 1.4060358137070368, + "learning_rate": 
1.7526923223972848e-05, + "loss": 0.0243, + "step": 8053 + }, + { + "epoch": 3.2753151687677917, + "grad_norm": 2.913211081712197, + "learning_rate": 1.7526256130153854e-05, + "loss": 0.0401, + "step": 8054 + }, + { + "epoch": 3.2757218381455875, + "grad_norm": 11.893680659485304, + "learning_rate": 1.7525588959074055e-05, + "loss": 0.5657, + "step": 8055 + }, + { + "epoch": 3.2761285075233832, + "grad_norm": 1.4256629406493742, + "learning_rate": 1.7524921710740294e-05, + "loss": 0.0268, + "step": 8056 + }, + { + "epoch": 3.2765351769011795, + "grad_norm": 8.300873678702576, + "learning_rate": 1.7524254385159425e-05, + "loss": 0.2905, + "step": 8057 + }, + { + "epoch": 3.2769418462789752, + "grad_norm": 8.864047239613388, + "learning_rate": 1.75235869823383e-05, + "loss": 0.2536, + "step": 8058 + }, + { + "epoch": 3.277348515656771, + "grad_norm": 3.641103109835448, + "learning_rate": 1.7522919502283765e-05, + "loss": 0.2323, + "step": 8059 + }, + { + "epoch": 3.2777551850345668, + "grad_norm": 7.060919090935716, + "learning_rate": 1.7522251945002678e-05, + "loss": 0.2782, + "step": 8060 + }, + { + "epoch": 3.278161854412363, + "grad_norm": 1.6615087282417111, + "learning_rate": 1.752158431050189e-05, + "loss": 0.0309, + "step": 8061 + }, + { + "epoch": 3.2785685237901587, + "grad_norm": 4.140027752995717, + "learning_rate": 1.7520916598788252e-05, + "loss": 0.0956, + "step": 8062 + }, + { + "epoch": 3.2789751931679545, + "grad_norm": 0.4794290105213678, + "learning_rate": 1.7520248809868618e-05, + "loss": 0.0087, + "step": 8063 + }, + { + "epoch": 3.2793818625457503, + "grad_norm": 11.537240284260507, + "learning_rate": 1.751958094374985e-05, + "loss": 0.3619, + "step": 8064 + }, + { + "epoch": 3.279788531923546, + "grad_norm": 1.0072697235488706, + "learning_rate": 1.751891300043879e-05, + "loss": 0.0161, + "step": 8065 + }, + { + "epoch": 3.280195201301342, + "grad_norm": 12.280948212743125, + "learning_rate": 1.7518244979942315e-05, + "loss": 0.7626, + "step": 8066 + }, + { + "epoch": 3.280601870679138, + "grad_norm": 0.3362958658416696, + "learning_rate": 1.7517576882267266e-05, + "loss": 0.0055, + "step": 8067 + }, + { + "epoch": 3.2810085400569338, + "grad_norm": 11.596301452792876, + "learning_rate": 1.7516908707420505e-05, + "loss": 0.3875, + "step": 8068 + }, + { + "epoch": 3.2814152094347295, + "grad_norm": 6.376733159803311, + "learning_rate": 1.7516240455408893e-05, + "loss": 0.1525, + "step": 8069 + }, + { + "epoch": 3.2818218788125253, + "grad_norm": 8.61200046640009, + "learning_rate": 1.7515572126239292e-05, + "loss": 0.1597, + "step": 8070 + }, + { + "epoch": 3.2822285481903215, + "grad_norm": 7.026905313049566, + "learning_rate": 1.7514903719918563e-05, + "loss": 0.1943, + "step": 8071 + }, + { + "epoch": 3.2826352175681173, + "grad_norm": 6.830540161766171, + "learning_rate": 1.7514235236453562e-05, + "loss": 0.3015, + "step": 8072 + }, + { + "epoch": 3.283041886945913, + "grad_norm": 9.30269642116758, + "learning_rate": 1.7513566675851158e-05, + "loss": 0.2981, + "step": 8073 + }, + { + "epoch": 3.283448556323709, + "grad_norm": 0.8739305309126217, + "learning_rate": 1.7512898038118208e-05, + "loss": 0.0143, + "step": 8074 + }, + { + "epoch": 3.2838552257015046, + "grad_norm": 16.217021040783898, + "learning_rate": 1.7512229323261577e-05, + "loss": 0.6417, + "step": 8075 + }, + { + "epoch": 3.2842618950793003, + "grad_norm": 0.17068657574670396, + "learning_rate": 1.751156053128813e-05, + "loss": 0.0025, + "step": 8076 + }, + { + "epoch": 3.2846685644570965, + 
"grad_norm": 0.3768215475832869, + "learning_rate": 1.7510891662204738e-05, + "loss": 0.0062, + "step": 8077 + }, + { + "epoch": 3.2850752338348923, + "grad_norm": 8.873102462093982, + "learning_rate": 1.751022271601826e-05, + "loss": 0.4388, + "step": 8078 + }, + { + "epoch": 3.285481903212688, + "grad_norm": 10.251653787898064, + "learning_rate": 1.7509553692735566e-05, + "loss": 0.1522, + "step": 8079 + }, + { + "epoch": 3.285888572590484, + "grad_norm": 3.1428198407109798, + "learning_rate": 1.7508884592363524e-05, + "loss": 0.1068, + "step": 8080 + }, + { + "epoch": 3.2862952419682796, + "grad_norm": 3.2207628662283745, + "learning_rate": 1.7508215414909003e-05, + "loss": 0.058, + "step": 8081 + }, + { + "epoch": 3.286701911346076, + "grad_norm": 7.483807559878674, + "learning_rate": 1.7507546160378872e-05, + "loss": 0.1212, + "step": 8082 + }, + { + "epoch": 3.2871085807238716, + "grad_norm": 2.7461799543206076, + "learning_rate": 1.750687682878e-05, + "loss": 0.1059, + "step": 8083 + }, + { + "epoch": 3.2875152501016673, + "grad_norm": 8.091509357342225, + "learning_rate": 1.7506207420119263e-05, + "loss": 0.2026, + "step": 8084 + }, + { + "epoch": 3.287921919479463, + "grad_norm": 4.075704416033545, + "learning_rate": 1.7505537934403527e-05, + "loss": 0.0585, + "step": 8085 + }, + { + "epoch": 3.288328588857259, + "grad_norm": 6.5563993478298945, + "learning_rate": 1.7504868371639668e-05, + "loss": 0.1152, + "step": 8086 + }, + { + "epoch": 3.288735258235055, + "grad_norm": 0.3244178791004831, + "learning_rate": 1.7504198731834554e-05, + "loss": 0.0029, + "step": 8087 + }, + { + "epoch": 3.289141927612851, + "grad_norm": 3.5227061259174106, + "learning_rate": 1.7503529014995067e-05, + "loss": 0.0641, + "step": 8088 + }, + { + "epoch": 3.2895485969906466, + "grad_norm": 2.119824330964727, + "learning_rate": 1.7502859221128074e-05, + "loss": 0.034, + "step": 8089 + }, + { + "epoch": 3.2899552663684424, + "grad_norm": 8.413876088254861, + "learning_rate": 1.750218935024046e-05, + "loss": 0.3712, + "step": 8090 + }, + { + "epoch": 3.290361935746238, + "grad_norm": 0.3986862744942007, + "learning_rate": 1.7501519402339092e-05, + "loss": 0.0131, + "step": 8091 + }, + { + "epoch": 3.2907686051240344, + "grad_norm": 7.713786421042301, + "learning_rate": 1.7500849377430855e-05, + "loss": 0.3499, + "step": 8092 + }, + { + "epoch": 3.29117527450183, + "grad_norm": 9.470021541753738, + "learning_rate": 1.7500179275522624e-05, + "loss": 0.5659, + "step": 8093 + }, + { + "epoch": 3.291581943879626, + "grad_norm": 6.830806967093104, + "learning_rate": 1.7499509096621277e-05, + "loss": 0.1837, + "step": 8094 + }, + { + "epoch": 3.2919886132574216, + "grad_norm": 13.052690140162472, + "learning_rate": 1.7498838840733695e-05, + "loss": 0.3517, + "step": 8095 + }, + { + "epoch": 3.2923952826352174, + "grad_norm": 9.980211487905187, + "learning_rate": 1.749816850786676e-05, + "loss": 0.2249, + "step": 8096 + }, + { + "epoch": 3.292801952013013, + "grad_norm": 4.561197277760017, + "learning_rate": 1.7497498098027348e-05, + "loss": 0.0913, + "step": 8097 + }, + { + "epoch": 3.2932086213908094, + "grad_norm": 3.6279559316831453, + "learning_rate": 1.7496827611222348e-05, + "loss": 0.1531, + "step": 8098 + }, + { + "epoch": 3.293615290768605, + "grad_norm": 10.16348705735449, + "learning_rate": 1.7496157047458635e-05, + "loss": 0.6357, + "step": 8099 + }, + { + "epoch": 3.294021960146401, + "grad_norm": 9.683386118599776, + "learning_rate": 1.74954864067431e-05, + "loss": 0.2763, + "step": 8100 + }, + { + 
"epoch": 3.2944286295241967, + "grad_norm": 6.3511710548511795, + "learning_rate": 1.7494815689082626e-05, + "loss": 0.2715, + "step": 8101 + }, + { + "epoch": 3.294835298901993, + "grad_norm": 6.031463004882871, + "learning_rate": 1.7494144894484095e-05, + "loss": 0.1461, + "step": 8102 + }, + { + "epoch": 3.2952419682797887, + "grad_norm": 2.462692453223137, + "learning_rate": 1.7493474022954396e-05, + "loss": 0.0418, + "step": 8103 + }, + { + "epoch": 3.2956486376575844, + "grad_norm": 13.20166057374044, + "learning_rate": 1.749280307450041e-05, + "loss": 0.6786, + "step": 8104 + }, + { + "epoch": 3.29605530703538, + "grad_norm": 5.121138081887927, + "learning_rate": 1.7492132049129038e-05, + "loss": 0.225, + "step": 8105 + }, + { + "epoch": 3.296461976413176, + "grad_norm": 1.24197623688883, + "learning_rate": 1.7491460946847155e-05, + "loss": 0.0226, + "step": 8106 + }, + { + "epoch": 3.2968686457909717, + "grad_norm": 11.207486484119716, + "learning_rate": 1.7490789767661652e-05, + "loss": 0.4254, + "step": 8107 + }, + { + "epoch": 3.297275315168768, + "grad_norm": 49.1855596149178, + "learning_rate": 1.749011851157943e-05, + "loss": 0.7166, + "step": 8108 + }, + { + "epoch": 3.2976819845465637, + "grad_norm": 0.1838419043644193, + "learning_rate": 1.7489447178607367e-05, + "loss": 0.003, + "step": 8109 + }, + { + "epoch": 3.2980886539243595, + "grad_norm": 9.53611702671102, + "learning_rate": 1.7488775768752357e-05, + "loss": 0.2739, + "step": 8110 + }, + { + "epoch": 3.2984953233021552, + "grad_norm": 3.121200892965003, + "learning_rate": 1.7488104282021297e-05, + "loss": 0.1382, + "step": 8111 + }, + { + "epoch": 3.2989019926799514, + "grad_norm": 1.401086629955277, + "learning_rate": 1.748743271842108e-05, + "loss": 0.0456, + "step": 8112 + }, + { + "epoch": 3.299308662057747, + "grad_norm": 8.293286264125237, + "learning_rate": 1.7486761077958595e-05, + "loss": 0.2753, + "step": 8113 + }, + { + "epoch": 3.299715331435543, + "grad_norm": 9.610930776647983, + "learning_rate": 1.748608936064074e-05, + "loss": 0.3589, + "step": 8114 + }, + { + "epoch": 3.3001220008133387, + "grad_norm": 3.539709439602479, + "learning_rate": 1.7485417566474412e-05, + "loss": 0.0616, + "step": 8115 + }, + { + "epoch": 3.3005286701911345, + "grad_norm": 4.7903605908182, + "learning_rate": 1.7484745695466503e-05, + "loss": 0.0838, + "step": 8116 + }, + { + "epoch": 3.3009353395689303, + "grad_norm": 2.699494885407656, + "learning_rate": 1.7484073747623914e-05, + "loss": 0.0421, + "step": 8117 + }, + { + "epoch": 3.3013420089467265, + "grad_norm": 7.149597833283133, + "learning_rate": 1.748340172295354e-05, + "loss": 0.2701, + "step": 8118 + }, + { + "epoch": 3.3017486783245222, + "grad_norm": 6.764175914403763, + "learning_rate": 1.7482729621462284e-05, + "loss": 0.1774, + "step": 8119 + }, + { + "epoch": 3.302155347702318, + "grad_norm": 6.738907944191546, + "learning_rate": 1.7482057443157035e-05, + "loss": 0.199, + "step": 8120 + }, + { + "epoch": 3.3025620170801138, + "grad_norm": 11.326658068820478, + "learning_rate": 1.7481385188044706e-05, + "loss": 0.4987, + "step": 8121 + }, + { + "epoch": 3.3029686864579095, + "grad_norm": 15.627728159674854, + "learning_rate": 1.7480712856132195e-05, + "loss": 0.8265, + "step": 8122 + }, + { + "epoch": 3.3033753558357057, + "grad_norm": 10.728730901318416, + "learning_rate": 1.74800404474264e-05, + "loss": 0.1431, + "step": 8123 + }, + { + "epoch": 3.3037820252135015, + "grad_norm": 6.605192233455981, + "learning_rate": 1.747936796193422e-05, + "loss": 
0.1941, + "step": 8124 + }, + { + "epoch": 3.3041886945912973, + "grad_norm": 0.010491388005373574, + "learning_rate": 1.7478695399662568e-05, + "loss": 0.0002, + "step": 8125 + }, + { + "epoch": 3.304595363969093, + "grad_norm": 2.9178607155788203, + "learning_rate": 1.7478022760618346e-05, + "loss": 0.0542, + "step": 8126 + }, + { + "epoch": 3.305002033346889, + "grad_norm": 6.998109504491551, + "learning_rate": 1.7477350044808452e-05, + "loss": 0.2258, + "step": 8127 + }, + { + "epoch": 3.305408702724685, + "grad_norm": 9.698517932972388, + "learning_rate": 1.74766772522398e-05, + "loss": 0.422, + "step": 8128 + }, + { + "epoch": 3.3058153721024808, + "grad_norm": 2.9872480346477315, + "learning_rate": 1.7476004382919293e-05, + "loss": 0.0598, + "step": 8129 + }, + { + "epoch": 3.3062220414802765, + "grad_norm": 0.9028166331772418, + "learning_rate": 1.7475331436853837e-05, + "loss": 0.013, + "step": 8130 + }, + { + "epoch": 3.3066287108580723, + "grad_norm": 9.48551289733336, + "learning_rate": 1.7474658414050344e-05, + "loss": 0.2602, + "step": 8131 + }, + { + "epoch": 3.307035380235868, + "grad_norm": 9.923914406624139, + "learning_rate": 1.7473985314515715e-05, + "loss": 0.3843, + "step": 8132 + }, + { + "epoch": 3.3074420496136643, + "grad_norm": 0.37427527571306096, + "learning_rate": 1.7473312138256872e-05, + "loss": 0.0049, + "step": 8133 + }, + { + "epoch": 3.30784871899146, + "grad_norm": 10.665729377061488, + "learning_rate": 1.7472638885280713e-05, + "loss": 0.3929, + "step": 8134 + }, + { + "epoch": 3.308255388369256, + "grad_norm": 3.962840067483152, + "learning_rate": 1.7471965555594157e-05, + "loss": 0.0682, + "step": 8135 + }, + { + "epoch": 3.3086620577470516, + "grad_norm": 7.813360956197597, + "learning_rate": 1.7471292149204113e-05, + "loss": 0.1471, + "step": 8136 + }, + { + "epoch": 3.3090687271248473, + "grad_norm": 7.144517894823835, + "learning_rate": 1.7470618666117495e-05, + "loss": 0.3299, + "step": 8137 + }, + { + "epoch": 3.309475396502643, + "grad_norm": 7.875830624396597, + "learning_rate": 1.746994510634122e-05, + "loss": 0.3134, + "step": 8138 + }, + { + "epoch": 3.3098820658804393, + "grad_norm": 11.055626226306007, + "learning_rate": 1.7469271469882193e-05, + "loss": 0.2535, + "step": 8139 + }, + { + "epoch": 3.310288735258235, + "grad_norm": 24.539378759106583, + "learning_rate": 1.7468597756747342e-05, + "loss": 0.3789, + "step": 8140 + }, + { + "epoch": 3.310695404636031, + "grad_norm": 6.079568919941707, + "learning_rate": 1.7467923966943572e-05, + "loss": 0.1681, + "step": 8141 + }, + { + "epoch": 3.3111020740138266, + "grad_norm": 5.991875254370634, + "learning_rate": 1.7467250100477804e-05, + "loss": 0.221, + "step": 8142 + }, + { + "epoch": 3.311508743391623, + "grad_norm": 15.222587744852595, + "learning_rate": 1.7466576157356957e-05, + "loss": 0.761, + "step": 8143 + }, + { + "epoch": 3.3119154127694186, + "grad_norm": 6.608288476858356, + "learning_rate": 1.7465902137587946e-05, + "loss": 0.3199, + "step": 8144 + }, + { + "epoch": 3.3123220821472144, + "grad_norm": 3.037211708203111, + "learning_rate": 1.7465228041177694e-05, + "loss": 0.0496, + "step": 8145 + }, + { + "epoch": 3.31272875152501, + "grad_norm": 1.3405887186591408, + "learning_rate": 1.7464553868133118e-05, + "loss": 0.0189, + "step": 8146 + }, + { + "epoch": 3.313135420902806, + "grad_norm": 6.488192666935131, + "learning_rate": 1.746387961846114e-05, + "loss": 0.1736, + "step": 8147 + }, + { + "epoch": 3.3135420902806016, + "grad_norm": 9.231166963466784, + 
"learning_rate": 1.746320529216868e-05, + "loss": 0.4151, + "step": 8148 + }, + { + "epoch": 3.313948759658398, + "grad_norm": 10.89233456091883, + "learning_rate": 1.7462530889262662e-05, + "loss": 0.3221, + "step": 8149 + }, + { + "epoch": 3.3143554290361936, + "grad_norm": 0.9688347776187306, + "learning_rate": 1.746185640975001e-05, + "loss": 0.014, + "step": 8150 + }, + { + "epoch": 3.3147620984139894, + "grad_norm": 4.496596724997455, + "learning_rate": 1.7461181853637645e-05, + "loss": 0.1412, + "step": 8151 + }, + { + "epoch": 3.315168767791785, + "grad_norm": 1.0861638300779048, + "learning_rate": 1.7460507220932497e-05, + "loss": 0.0248, + "step": 8152 + }, + { + "epoch": 3.3155754371695814, + "grad_norm": 7.3130179627030065, + "learning_rate": 1.7459832511641483e-05, + "loss": 0.2998, + "step": 8153 + }, + { + "epoch": 3.315982106547377, + "grad_norm": 6.852589222372908, + "learning_rate": 1.7459157725771537e-05, + "loss": 0.1618, + "step": 8154 + }, + { + "epoch": 3.316388775925173, + "grad_norm": 11.519955513425062, + "learning_rate": 1.745848286332958e-05, + "loss": 0.4517, + "step": 8155 + }, + { + "epoch": 3.3167954453029687, + "grad_norm": 11.181962011103828, + "learning_rate": 1.7457807924322542e-05, + "loss": 0.482, + "step": 8156 + }, + { + "epoch": 3.3172021146807644, + "grad_norm": 0.6834966177616193, + "learning_rate": 1.7457132908757356e-05, + "loss": 0.0095, + "step": 8157 + }, + { + "epoch": 3.31760878405856, + "grad_norm": 8.057549608798123, + "learning_rate": 1.7456457816640945e-05, + "loss": 0.3721, + "step": 8158 + }, + { + "epoch": 3.3180154534363564, + "grad_norm": 12.28362669012026, + "learning_rate": 1.7455782647980242e-05, + "loss": 0.4698, + "step": 8159 + }, + { + "epoch": 3.318422122814152, + "grad_norm": 3.422728455879846, + "learning_rate": 1.7455107402782175e-05, + "loss": 0.0816, + "step": 8160 + }, + { + "epoch": 3.318828792191948, + "grad_norm": 4.661416222564954, + "learning_rate": 1.745443208105368e-05, + "loss": 0.3822, + "step": 8161 + }, + { + "epoch": 3.3192354615697437, + "grad_norm": 0.39384509960000225, + "learning_rate": 1.7453756682801694e-05, + "loss": 0.0074, + "step": 8162 + }, + { + "epoch": 3.3196421309475395, + "grad_norm": 7.679380540737893, + "learning_rate": 1.7453081208033137e-05, + "loss": 0.2661, + "step": 8163 + }, + { + "epoch": 3.3200488003253357, + "grad_norm": 14.955837301355102, + "learning_rate": 1.7452405656754954e-05, + "loss": 0.8033, + "step": 8164 + }, + { + "epoch": 3.3204554697031314, + "grad_norm": 8.333382263446621, + "learning_rate": 1.7451730028974076e-05, + "loss": 0.2776, + "step": 8165 + }, + { + "epoch": 3.320862139080927, + "grad_norm": 8.431568744612381, + "learning_rate": 1.7451054324697436e-05, + "loss": 0.2213, + "step": 8166 + }, + { + "epoch": 3.321268808458723, + "grad_norm": 10.768371372041168, + "learning_rate": 1.7450378543931976e-05, + "loss": 0.506, + "step": 8167 + }, + { + "epoch": 3.3216754778365187, + "grad_norm": 1.9045775321066798, + "learning_rate": 1.7449702686684632e-05, + "loss": 0.0293, + "step": 8168 + }, + { + "epoch": 3.322082147214315, + "grad_norm": 6.160600656481813, + "learning_rate": 1.744902675296234e-05, + "loss": 0.0901, + "step": 8169 + }, + { + "epoch": 3.3224888165921107, + "grad_norm": 48.22715833706192, + "learning_rate": 1.7448350742772035e-05, + "loss": 0.4928, + "step": 8170 + }, + { + "epoch": 3.3228954859699065, + "grad_norm": 3.236998513448969, + "learning_rate": 1.7447674656120663e-05, + "loss": 0.0609, + "step": 8171 + }, + { + "epoch": 
3.3233021553477022, + "grad_norm": 4.2419546651758155, + "learning_rate": 1.7446998493015163e-05, + "loss": 0.0709, + "step": 8172 + }, + { + "epoch": 3.323708824725498, + "grad_norm": 6.324763920275315, + "learning_rate": 1.7446322253462476e-05, + "loss": 0.1344, + "step": 8173 + }, + { + "epoch": 3.324115494103294, + "grad_norm": 7.558861921417675, + "learning_rate": 1.744564593746954e-05, + "loss": 0.3166, + "step": 8174 + }, + { + "epoch": 3.32452216348109, + "grad_norm": 3.718109084081702, + "learning_rate": 1.7444969545043308e-05, + "loss": 0.0195, + "step": 8175 + }, + { + "epoch": 3.3249288328588857, + "grad_norm": 15.31324669683154, + "learning_rate": 1.744429307619071e-05, + "loss": 0.2387, + "step": 8176 + }, + { + "epoch": 3.3253355022366815, + "grad_norm": 7.749512980621276, + "learning_rate": 1.74436165309187e-05, + "loss": 0.3749, + "step": 8177 + }, + { + "epoch": 3.3257421716144773, + "grad_norm": 4.723803499793123, + "learning_rate": 1.744293990923422e-05, + "loss": 0.1689, + "step": 8178 + }, + { + "epoch": 3.326148840992273, + "grad_norm": 6.704454737190227, + "learning_rate": 1.7442263211144215e-05, + "loss": 0.3079, + "step": 8179 + }, + { + "epoch": 3.3265555103700692, + "grad_norm": 1.2109708932821437, + "learning_rate": 1.7441586436655636e-05, + "loss": 0.0224, + "step": 8180 + }, + { + "epoch": 3.326962179747865, + "grad_norm": 4.77485492413395, + "learning_rate": 1.7440909585775422e-05, + "loss": 0.1191, + "step": 8181 + }, + { + "epoch": 3.3273688491256608, + "grad_norm": 6.567702118581124, + "learning_rate": 1.744023265851053e-05, + "loss": 0.1823, + "step": 8182 + }, + { + "epoch": 3.3277755185034565, + "grad_norm": 6.335398173291952, + "learning_rate": 1.7439555654867902e-05, + "loss": 0.1919, + "step": 8183 + }, + { + "epoch": 3.3281821878812528, + "grad_norm": 22.511559622018655, + "learning_rate": 1.7438878574854495e-05, + "loss": 0.8866, + "step": 8184 + }, + { + "epoch": 3.3285888572590485, + "grad_norm": 3.510507566095694, + "learning_rate": 1.7438201418477252e-05, + "loss": 0.101, + "step": 8185 + }, + { + "epoch": 3.3289955266368443, + "grad_norm": 4.258863074424512, + "learning_rate": 1.743752418574313e-05, + "loss": 0.0895, + "step": 8186 + }, + { + "epoch": 3.32940219601464, + "grad_norm": 5.384602830925737, + "learning_rate": 1.7436846876659075e-05, + "loss": 0.1474, + "step": 8187 + }, + { + "epoch": 3.329808865392436, + "grad_norm": 29.630027405530708, + "learning_rate": 1.7436169491232048e-05, + "loss": 0.2773, + "step": 8188 + }, + { + "epoch": 3.3302155347702316, + "grad_norm": 7.672537672512554, + "learning_rate": 1.7435492029468995e-05, + "loss": 0.3517, + "step": 8189 + }, + { + "epoch": 3.330622204148028, + "grad_norm": 23.059175798337574, + "learning_rate": 1.7434814491376877e-05, + "loss": 0.7675, + "step": 8190 + }, + { + "epoch": 3.3310288735258236, + "grad_norm": 8.115772960649924, + "learning_rate": 1.7434136876962648e-05, + "loss": 0.4774, + "step": 8191 + }, + { + "epoch": 3.3314355429036193, + "grad_norm": 0.20119211827592617, + "learning_rate": 1.7433459186233258e-05, + "loss": 0.003, + "step": 8192 + }, + { + "epoch": 3.331842212281415, + "grad_norm": 3.5373681662201193, + "learning_rate": 1.743278141919567e-05, + "loss": 0.0489, + "step": 8193 + }, + { + "epoch": 3.3322488816592113, + "grad_norm": 7.183689364224285, + "learning_rate": 1.7432103575856844e-05, + "loss": 0.372, + "step": 8194 + }, + { + "epoch": 3.332655551037007, + "grad_norm": 7.262122820527722, + "learning_rate": 1.7431425656223728e-05, + "loss": 0.2606, + 
"step": 8195 + }, + { + "epoch": 3.333062220414803, + "grad_norm": 8.252538253534599, + "learning_rate": 1.743074766030329e-05, + "loss": 0.1962, + "step": 8196 + }, + { + "epoch": 3.3334688897925986, + "grad_norm": 13.31096697918905, + "learning_rate": 1.7430069588102486e-05, + "loss": 0.3527, + "step": 8197 + }, + { + "epoch": 3.3338755591703944, + "grad_norm": 2.1970210307729428, + "learning_rate": 1.7429391439628282e-05, + "loss": 0.0471, + "step": 8198 + }, + { + "epoch": 3.33428222854819, + "grad_norm": 1.9054496596785209, + "learning_rate": 1.7428713214887634e-05, + "loss": 0.0265, + "step": 8199 + }, + { + "epoch": 3.3346888979259863, + "grad_norm": 7.664036733449, + "learning_rate": 1.7428034913887502e-05, + "loss": 0.3267, + "step": 8200 + }, + { + "epoch": 3.335095567303782, + "grad_norm": 43.97050748971414, + "learning_rate": 1.742735653663486e-05, + "loss": 1.2828, + "step": 8201 + }, + { + "epoch": 3.335502236681578, + "grad_norm": 0.5433659495276938, + "learning_rate": 1.7426678083136663e-05, + "loss": 0.009, + "step": 8202 + }, + { + "epoch": 3.3359089060593736, + "grad_norm": 17.13914299613533, + "learning_rate": 1.7425999553399875e-05, + "loss": 0.2335, + "step": 8203 + }, + { + "epoch": 3.3363155754371694, + "grad_norm": 3.4945775026997024, + "learning_rate": 1.7425320947431466e-05, + "loss": 0.0687, + "step": 8204 + }, + { + "epoch": 3.3367222448149656, + "grad_norm": 4.676364603482221, + "learning_rate": 1.7424642265238397e-05, + "loss": 0.0877, + "step": 8205 + }, + { + "epoch": 3.3371289141927614, + "grad_norm": 15.02706014675068, + "learning_rate": 1.7423963506827643e-05, + "loss": 0.9394, + "step": 8206 + }, + { + "epoch": 3.337535583570557, + "grad_norm": 9.544504608338574, + "learning_rate": 1.7423284672206166e-05, + "loss": 0.5191, + "step": 8207 + }, + { + "epoch": 3.337942252948353, + "grad_norm": 8.301013602763401, + "learning_rate": 1.7422605761380937e-05, + "loss": 0.23, + "step": 8208 + }, + { + "epoch": 3.3383489223261487, + "grad_norm": 14.514565819283987, + "learning_rate": 1.742192677435892e-05, + "loss": 0.594, + "step": 8209 + }, + { + "epoch": 3.338755591703945, + "grad_norm": 2.8343500854525097, + "learning_rate": 1.7421247711147094e-05, + "loss": 0.1129, + "step": 8210 + }, + { + "epoch": 3.3391622610817406, + "grad_norm": 6.122300114527204, + "learning_rate": 1.7420568571752423e-05, + "loss": 0.2435, + "step": 8211 + }, + { + "epoch": 3.3395689304595364, + "grad_norm": 2.858172255565166, + "learning_rate": 1.741988935618188e-05, + "loss": 0.052, + "step": 8212 + }, + { + "epoch": 3.339975599837332, + "grad_norm": 6.48893403193827, + "learning_rate": 1.741921006444244e-05, + "loss": 0.2722, + "step": 8213 + }, + { + "epoch": 3.340382269215128, + "grad_norm": 4.86049833913469, + "learning_rate": 1.7418530696541073e-05, + "loss": 0.1567, + "step": 8214 + }, + { + "epoch": 3.340788938592924, + "grad_norm": 6.52756068031228, + "learning_rate": 1.7417851252484757e-05, + "loss": 0.1969, + "step": 8215 + }, + { + "epoch": 3.34119560797072, + "grad_norm": 4.212207145947725, + "learning_rate": 1.741717173228046e-05, + "loss": 0.1161, + "step": 8216 + }, + { + "epoch": 3.3416022773485157, + "grad_norm": 2.6864849421569503, + "learning_rate": 1.741649213593517e-05, + "loss": 0.0519, + "step": 8217 + }, + { + "epoch": 3.3420089467263114, + "grad_norm": 15.576403291950317, + "learning_rate": 1.741581246345585e-05, + "loss": 0.8419, + "step": 8218 + }, + { + "epoch": 3.342415616104107, + "grad_norm": 6.12167924394695, + "learning_rate": 1.7415132714849484e-05, 
+ "loss": 0.1219, + "step": 8219 + }, + { + "epoch": 3.342822285481903, + "grad_norm": 9.163450579912638, + "learning_rate": 1.7414452890123048e-05, + "loss": 0.4874, + "step": 8220 + }, + { + "epoch": 3.343228954859699, + "grad_norm": 9.104146517472653, + "learning_rate": 1.741377298928352e-05, + "loss": 0.2035, + "step": 8221 + }, + { + "epoch": 3.343635624237495, + "grad_norm": 12.852091333703497, + "learning_rate": 1.7413093012337884e-05, + "loss": 0.4817, + "step": 8222 + }, + { + "epoch": 3.3440422936152907, + "grad_norm": 2.5167372185902694, + "learning_rate": 1.741241295929312e-05, + "loss": 0.0846, + "step": 8223 + }, + { + "epoch": 3.3444489629930865, + "grad_norm": 4.993896192494353, + "learning_rate": 1.74117328301562e-05, + "loss": 0.0956, + "step": 8224 + }, + { + "epoch": 3.3448556323708827, + "grad_norm": 5.764587027889722, + "learning_rate": 1.7411052624934117e-05, + "loss": 0.143, + "step": 8225 + }, + { + "epoch": 3.3452623017486784, + "grad_norm": 7.797486402624187, + "learning_rate": 1.7410372343633845e-05, + "loss": 0.4417, + "step": 8226 + }, + { + "epoch": 3.345668971126474, + "grad_norm": 14.969937914249035, + "learning_rate": 1.7409691986262374e-05, + "loss": 0.9093, + "step": 8227 + }, + { + "epoch": 3.34607564050427, + "grad_norm": 10.767501229133229, + "learning_rate": 1.7409011552826684e-05, + "loss": 0.2817, + "step": 8228 + }, + { + "epoch": 3.3464823098820657, + "grad_norm": 12.815892540389108, + "learning_rate": 1.740833104333376e-05, + "loss": 0.3356, + "step": 8229 + }, + { + "epoch": 3.3468889792598615, + "grad_norm": 1.2527302724912632, + "learning_rate": 1.7407650457790594e-05, + "loss": 0.011, + "step": 8230 + }, + { + "epoch": 3.3472956486376577, + "grad_norm": 7.651819944101468, + "learning_rate": 1.7406969796204167e-05, + "loss": 0.3742, + "step": 8231 + }, + { + "epoch": 3.3477023180154535, + "grad_norm": 7.844293364845823, + "learning_rate": 1.7406289058581466e-05, + "loss": 0.2875, + "step": 8232 + }, + { + "epoch": 3.3481089873932492, + "grad_norm": 9.081487171978079, + "learning_rate": 1.740560824492948e-05, + "loss": 0.4868, + "step": 8233 + }, + { + "epoch": 3.348515656771045, + "grad_norm": 3.753462010066665, + "learning_rate": 1.7404927355255197e-05, + "loss": 0.0721, + "step": 8234 + }, + { + "epoch": 3.348922326148841, + "grad_norm": 3.0501541726227086, + "learning_rate": 1.740424638956561e-05, + "loss": 0.0773, + "step": 8235 + }, + { + "epoch": 3.349328995526637, + "grad_norm": 7.914604710689892, + "learning_rate": 1.7403565347867705e-05, + "loss": 0.2252, + "step": 8236 + }, + { + "epoch": 3.3497356649044328, + "grad_norm": 0.9802401080912617, + "learning_rate": 1.7402884230168476e-05, + "loss": 0.019, + "step": 8237 + }, + { + "epoch": 3.3501423342822285, + "grad_norm": 3.2272507021129986, + "learning_rate": 1.7402203036474915e-05, + "loss": 0.0348, + "step": 8238 + }, + { + "epoch": 3.3505490036600243, + "grad_norm": 1.5850191325643372, + "learning_rate": 1.7401521766794018e-05, + "loss": 0.0241, + "step": 8239 + }, + { + "epoch": 3.35095567303782, + "grad_norm": 12.336520365600242, + "learning_rate": 1.740084042113277e-05, + "loss": 0.5187, + "step": 8240 + }, + { + "epoch": 3.3513623424156163, + "grad_norm": 8.99275346531765, + "learning_rate": 1.740015899949817e-05, + "loss": 0.1495, + "step": 8241 + }, + { + "epoch": 3.351769011793412, + "grad_norm": 11.3858961939406, + "learning_rate": 1.739947750189722e-05, + "loss": 0.5519, + "step": 8242 + }, + { + "epoch": 3.352175681171208, + "grad_norm": 3.7472508578816375, + 
"learning_rate": 1.7398795928336905e-05, + "loss": 0.2016, + "step": 8243 + }, + { + "epoch": 3.3525823505490036, + "grad_norm": 14.136328919575728, + "learning_rate": 1.7398114278824226e-05, + "loss": 0.8148, + "step": 8244 + }, + { + "epoch": 3.3529890199267993, + "grad_norm": 1.417224239820238, + "learning_rate": 1.739743255336618e-05, + "loss": 0.0197, + "step": 8245 + }, + { + "epoch": 3.3533956893045955, + "grad_norm": 7.0324449374993785, + "learning_rate": 1.7396750751969764e-05, + "loss": 0.1227, + "step": 8246 + }, + { + "epoch": 3.3538023586823913, + "grad_norm": 10.046502633891928, + "learning_rate": 1.7396068874641983e-05, + "loss": 0.0653, + "step": 8247 + }, + { + "epoch": 3.354209028060187, + "grad_norm": 13.734975357680595, + "learning_rate": 1.739538692138983e-05, + "loss": 0.8085, + "step": 8248 + }, + { + "epoch": 3.354615697437983, + "grad_norm": 16.081332478434987, + "learning_rate": 1.7394704892220307e-05, + "loss": 0.9972, + "step": 8249 + }, + { + "epoch": 3.3550223668157786, + "grad_norm": 0.2265721223193592, + "learning_rate": 1.739402278714042e-05, + "loss": 0.0031, + "step": 8250 + }, + { + "epoch": 3.355429036193575, + "grad_norm": 1.7697936252355686, + "learning_rate": 1.7393340606157168e-05, + "loss": 0.0345, + "step": 8251 + }, + { + "epoch": 3.3558357055713706, + "grad_norm": 10.89326123967029, + "learning_rate": 1.739265834927755e-05, + "loss": 0.4279, + "step": 8252 + }, + { + "epoch": 3.3562423749491663, + "grad_norm": 3.2990782529553164, + "learning_rate": 1.7391976016508577e-05, + "loss": 0.0842, + "step": 8253 + }, + { + "epoch": 3.356649044326962, + "grad_norm": 1.0697836432543906, + "learning_rate": 1.7391293607857248e-05, + "loss": 0.0132, + "step": 8254 + }, + { + "epoch": 3.357055713704758, + "grad_norm": 8.171658487305438, + "learning_rate": 1.739061112333057e-05, + "loss": 0.4504, + "step": 8255 + }, + { + "epoch": 3.357462383082554, + "grad_norm": 3.3860003473628515, + "learning_rate": 1.738992856293555e-05, + "loss": 0.0284, + "step": 8256 + }, + { + "epoch": 3.35786905246035, + "grad_norm": 1.6408083258748851, + "learning_rate": 1.7389245926679194e-05, + "loss": 0.0317, + "step": 8257 + }, + { + "epoch": 3.3582757218381456, + "grad_norm": 7.309771295356824, + "learning_rate": 1.738856321456851e-05, + "loss": 0.1216, + "step": 8258 + }, + { + "epoch": 3.3586823912159414, + "grad_norm": 7.606732513034543, + "learning_rate": 1.7387880426610504e-05, + "loss": 0.1742, + "step": 8259 + }, + { + "epoch": 3.359089060593737, + "grad_norm": 2.446226955289755, + "learning_rate": 1.738719756281219e-05, + "loss": 0.0709, + "step": 8260 + }, + { + "epoch": 3.359495729971533, + "grad_norm": 5.234485733093964, + "learning_rate": 1.7386514623180573e-05, + "loss": 0.1358, + "step": 8261 + }, + { + "epoch": 3.359902399349329, + "grad_norm": 7.779005226804534, + "learning_rate": 1.7385831607722667e-05, + "loss": 0.2898, + "step": 8262 + }, + { + "epoch": 3.360309068727125, + "grad_norm": 11.607693383638615, + "learning_rate": 1.7385148516445483e-05, + "loss": 0.4647, + "step": 8263 + }, + { + "epoch": 3.3607157381049206, + "grad_norm": 7.527966882943227, + "learning_rate": 1.738446534935603e-05, + "loss": 0.2225, + "step": 8264 + }, + { + "epoch": 3.3611224074827164, + "grad_norm": 30.74899205973022, + "learning_rate": 1.7383782106461327e-05, + "loss": 0.4158, + "step": 8265 + }, + { + "epoch": 3.3615290768605126, + "grad_norm": 23.750109358272248, + "learning_rate": 1.738309878776838e-05, + "loss": 1.1154, + "step": 8266 + }, + { + "epoch": 3.3619357462383084, 
+ "grad_norm": 5.098475495548899, + "learning_rate": 1.738241539328421e-05, + "loss": 0.2749, + "step": 8267 + }, + { + "epoch": 3.362342415616104, + "grad_norm": 10.349841345444096, + "learning_rate": 1.7381731923015836e-05, + "loss": 0.3869, + "step": 8268 + }, + { + "epoch": 3.3627490849939, + "grad_norm": 11.551967805660833, + "learning_rate": 1.738104837697026e-05, + "loss": 0.4961, + "step": 8269 + }, + { + "epoch": 3.3631557543716957, + "grad_norm": 1.5823140213666829, + "learning_rate": 1.7380364755154513e-05, + "loss": 0.0242, + "step": 8270 + }, + { + "epoch": 3.3635624237494914, + "grad_norm": 4.646931814932854, + "learning_rate": 1.7379681057575605e-05, + "loss": 0.1018, + "step": 8271 + }, + { + "epoch": 3.3639690931272876, + "grad_norm": 13.451460745207594, + "learning_rate": 1.737899728424056e-05, + "loss": 0.622, + "step": 8272 + }, + { + "epoch": 3.3643757625050834, + "grad_norm": 12.852997097859015, + "learning_rate": 1.7378313435156392e-05, + "loss": 0.6146, + "step": 8273 + }, + { + "epoch": 3.364782431882879, + "grad_norm": 10.71462178400203, + "learning_rate": 1.737762951033012e-05, + "loss": 0.54, + "step": 8274 + }, + { + "epoch": 3.365189101260675, + "grad_norm": 22.046163673621862, + "learning_rate": 1.737694550976877e-05, + "loss": 0.6157, + "step": 8275 + }, + { + "epoch": 3.365595770638471, + "grad_norm": 8.953087173439426, + "learning_rate": 1.7376261433479365e-05, + "loss": 0.354, + "step": 8276 + }, + { + "epoch": 3.366002440016267, + "grad_norm": 18.243484893668267, + "learning_rate": 1.737557728146892e-05, + "loss": 0.4345, + "step": 8277 + }, + { + "epoch": 3.3664091093940627, + "grad_norm": 5.769666856065661, + "learning_rate": 1.7374893053744467e-05, + "loss": 0.2959, + "step": 8278 + }, + { + "epoch": 3.3668157787718584, + "grad_norm": 0.020694046115041442, + "learning_rate": 1.737420875031302e-05, + "loss": 0.0002, + "step": 8279 + }, + { + "epoch": 3.367222448149654, + "grad_norm": 9.777250811843436, + "learning_rate": 1.737352437118161e-05, + "loss": 0.2643, + "step": 8280 + }, + { + "epoch": 3.36762911752745, + "grad_norm": 13.877446547923004, + "learning_rate": 1.7372839916357262e-05, + "loss": 0.4711, + "step": 8281 + }, + { + "epoch": 3.368035786905246, + "grad_norm": 1.9644679902255677, + "learning_rate": 1.7372155385847004e-05, + "loss": 0.0316, + "step": 8282 + }, + { + "epoch": 3.368442456283042, + "grad_norm": 0.7702394567532018, + "learning_rate": 1.737147077965786e-05, + "loss": 0.0169, + "step": 8283 + }, + { + "epoch": 3.3688491256608377, + "grad_norm": 5.086869353670764, + "learning_rate": 1.7370786097796853e-05, + "loss": 0.1713, + "step": 8284 + }, + { + "epoch": 3.3692557950386335, + "grad_norm": 12.581759124244337, + "learning_rate": 1.7370101340271022e-05, + "loss": 0.6801, + "step": 8285 + }, + { + "epoch": 3.3696624644164297, + "grad_norm": 2.9708125212175234, + "learning_rate": 1.7369416507087387e-05, + "loss": 0.1211, + "step": 8286 + }, + { + "epoch": 3.3700691337942255, + "grad_norm": 1.513430336227525, + "learning_rate": 1.7368731598252987e-05, + "loss": 0.0187, + "step": 8287 + }, + { + "epoch": 3.3704758031720212, + "grad_norm": 5.236446546273663, + "learning_rate": 1.736804661377485e-05, + "loss": 0.0908, + "step": 8288 + }, + { + "epoch": 3.370882472549817, + "grad_norm": 14.02340233195366, + "learning_rate": 1.7367361553660002e-05, + "loss": 0.4023, + "step": 8289 + }, + { + "epoch": 3.3712891419276128, + "grad_norm": 7.408438175432804, + "learning_rate": 1.736667641791548e-05, + "loss": 0.2551, + "step": 8290 + }, + { 
+ "epoch": 3.3716958113054085, + "grad_norm": 19.130088945641656, + "learning_rate": 1.736599120654832e-05, + "loss": 0.6081, + "step": 8291 + }, + { + "epoch": 3.3721024806832047, + "grad_norm": 3.196446608954204, + "learning_rate": 1.7365305919565552e-05, + "loss": 0.0669, + "step": 8292 + }, + { + "epoch": 3.3725091500610005, + "grad_norm": 2.137139661238369, + "learning_rate": 1.736462055697421e-05, + "loss": 0.0318, + "step": 8293 + }, + { + "epoch": 3.3729158194387963, + "grad_norm": 15.605653401978453, + "learning_rate": 1.736393511878133e-05, + "loss": 0.6945, + "step": 8294 + }, + { + "epoch": 3.373322488816592, + "grad_norm": 4.287002900393749, + "learning_rate": 1.7363249604993954e-05, + "loss": 0.2225, + "step": 8295 + }, + { + "epoch": 3.373729158194388, + "grad_norm": 12.12445897304462, + "learning_rate": 1.7362564015619115e-05, + "loss": 0.6503, + "step": 8296 + }, + { + "epoch": 3.374135827572184, + "grad_norm": 4.290603959705245, + "learning_rate": 1.7361878350663848e-05, + "loss": 0.1035, + "step": 8297 + }, + { + "epoch": 3.3745424969499798, + "grad_norm": 8.12304184980666, + "learning_rate": 1.7361192610135197e-05, + "loss": 0.4387, + "step": 8298 + }, + { + "epoch": 3.3749491663277755, + "grad_norm": 8.79347131681259, + "learning_rate": 1.7360506794040195e-05, + "loss": 0.3963, + "step": 8299 + }, + { + "epoch": 3.3753558357055713, + "grad_norm": 9.784144969681172, + "learning_rate": 1.735982090238589e-05, + "loss": 0.182, + "step": 8300 + }, + { + "epoch": 3.375762505083367, + "grad_norm": 3.7282365350544624, + "learning_rate": 1.735913493517932e-05, + "loss": 0.0693, + "step": 8301 + }, + { + "epoch": 3.376169174461163, + "grad_norm": 1.1862930478591753, + "learning_rate": 1.7358448892427524e-05, + "loss": 0.0167, + "step": 8302 + }, + { + "epoch": 3.376575843838959, + "grad_norm": 5.428564491919781, + "learning_rate": 1.7357762774137547e-05, + "loss": 0.1624, + "step": 8303 + }, + { + "epoch": 3.376982513216755, + "grad_norm": 11.263676355961993, + "learning_rate": 1.7357076580316432e-05, + "loss": 0.3163, + "step": 8304 + }, + { + "epoch": 3.3773891825945506, + "grad_norm": 9.872163851101721, + "learning_rate": 1.7356390310971224e-05, + "loss": 0.3408, + "step": 8305 + }, + { + "epoch": 3.3777958519723463, + "grad_norm": 9.391995250234475, + "learning_rate": 1.7355703966108966e-05, + "loss": 0.4906, + "step": 8306 + }, + { + "epoch": 3.3782025213501425, + "grad_norm": 0.18973671804489645, + "learning_rate": 1.7355017545736706e-05, + "loss": 0.0042, + "step": 8307 + }, + { + "epoch": 3.3786091907279383, + "grad_norm": 6.389617854186101, + "learning_rate": 1.7354331049861488e-05, + "loss": 0.3416, + "step": 8308 + }, + { + "epoch": 3.379015860105734, + "grad_norm": 20.047024256536382, + "learning_rate": 1.7353644478490362e-05, + "loss": 0.7717, + "step": 8309 + }, + { + "epoch": 3.37942252948353, + "grad_norm": 13.473208432150932, + "learning_rate": 1.7352957831630372e-05, + "loss": 0.9292, + "step": 8310 + }, + { + "epoch": 3.3798291988613256, + "grad_norm": 2.2151097976641774, + "learning_rate": 1.735227110928857e-05, + "loss": 0.0419, + "step": 8311 + }, + { + "epoch": 3.3802358682391214, + "grad_norm": 12.950991807639939, + "learning_rate": 1.7351584311472004e-05, + "loss": 0.6244, + "step": 8312 + }, + { + "epoch": 3.3806425376169176, + "grad_norm": 2.158836246245831, + "learning_rate": 1.7350897438187727e-05, + "loss": 0.044, + "step": 8313 + }, + { + "epoch": 3.3810492069947133, + "grad_norm": 4.903215195466037, + "learning_rate": 1.735021048944279e-05, + 
"loss": 0.0908, + "step": 8314 + }, + { + "epoch": 3.381455876372509, + "grad_norm": 33.857903843330064, + "learning_rate": 1.734952346524424e-05, + "loss": 0.8265, + "step": 8315 + }, + { + "epoch": 3.381862545750305, + "grad_norm": 13.738912718108384, + "learning_rate": 1.7348836365599133e-05, + "loss": 0.7545, + "step": 8316 + }, + { + "epoch": 3.382269215128101, + "grad_norm": 0.3128942536377677, + "learning_rate": 1.734814919051452e-05, + "loss": 0.0064, + "step": 8317 + }, + { + "epoch": 3.382675884505897, + "grad_norm": 15.709762028008596, + "learning_rate": 1.7347461939997462e-05, + "loss": 0.2409, + "step": 8318 + }, + { + "epoch": 3.3830825538836926, + "grad_norm": 12.92809351217494, + "learning_rate": 1.7346774614055007e-05, + "loss": 0.4794, + "step": 8319 + }, + { + "epoch": 3.3834892232614884, + "grad_norm": 1.1364497169607, + "learning_rate": 1.7346087212694213e-05, + "loss": 0.0218, + "step": 8320 + }, + { + "epoch": 3.383895892639284, + "grad_norm": 4.019439927353694, + "learning_rate": 1.7345399735922136e-05, + "loss": 0.1915, + "step": 8321 + }, + { + "epoch": 3.38430256201708, + "grad_norm": 11.910816344332185, + "learning_rate": 1.7344712183745837e-05, + "loss": 0.2361, + "step": 8322 + }, + { + "epoch": 3.384709231394876, + "grad_norm": 0.3727861667480053, + "learning_rate": 1.7344024556172364e-05, + "loss": 0.0088, + "step": 8323 + }, + { + "epoch": 3.385115900772672, + "grad_norm": 9.111765646676862, + "learning_rate": 1.734333685320879e-05, + "loss": 0.2548, + "step": 8324 + }, + { + "epoch": 3.3855225701504676, + "grad_norm": 8.065600722928718, + "learning_rate": 1.7342649074862164e-05, + "loss": 0.0694, + "step": 8325 + }, + { + "epoch": 3.3859292395282634, + "grad_norm": 1.7865032246904855, + "learning_rate": 1.7341961221139548e-05, + "loss": 0.0351, + "step": 8326 + }, + { + "epoch": 3.3863359089060596, + "grad_norm": 6.91454974221566, + "learning_rate": 1.7341273292048008e-05, + "loss": 0.4121, + "step": 8327 + }, + { + "epoch": 3.3867425782838554, + "grad_norm": 22.618310322421845, + "learning_rate": 1.7340585287594605e-05, + "loss": 0.785, + "step": 8328 + }, + { + "epoch": 3.387149247661651, + "grad_norm": 4.49974601023593, + "learning_rate": 1.7339897207786393e-05, + "loss": 0.1474, + "step": 8329 + }, + { + "epoch": 3.387555917039447, + "grad_norm": 2.5082903871623503, + "learning_rate": 1.7339209052630446e-05, + "loss": 0.0551, + "step": 8330 + }, + { + "epoch": 3.3879625864172427, + "grad_norm": 13.420751905227725, + "learning_rate": 1.7338520822133826e-05, + "loss": 0.9413, + "step": 8331 + }, + { + "epoch": 3.3883692557950384, + "grad_norm": 1.7904137457379703, + "learning_rate": 1.7337832516303593e-05, + "loss": 0.0271, + "step": 8332 + }, + { + "epoch": 3.3887759251728347, + "grad_norm": 5.7899032638936205, + "learning_rate": 1.7337144135146818e-05, + "loss": 0.1382, + "step": 8333 + }, + { + "epoch": 3.3891825945506304, + "grad_norm": 10.259493042133228, + "learning_rate": 1.7336455678670566e-05, + "loss": 0.5712, + "step": 8334 + }, + { + "epoch": 3.389589263928426, + "grad_norm": 5.980178302710975, + "learning_rate": 1.7335767146881904e-05, + "loss": 0.0878, + "step": 8335 + }, + { + "epoch": 3.389995933306222, + "grad_norm": 0.9147657725898206, + "learning_rate": 1.73350785397879e-05, + "loss": 0.0121, + "step": 8336 + }, + { + "epoch": 3.3904026026840177, + "grad_norm": 8.730448598433071, + "learning_rate": 1.733438985739562e-05, + "loss": 0.2723, + "step": 8337 + }, + { + "epoch": 3.390809272061814, + "grad_norm": 9.922652312965544, + 
"learning_rate": 1.733370109971214e-05, + "loss": 0.5202, + "step": 8338 + }, + { + "epoch": 3.3912159414396097, + "grad_norm": 1.5172723178635101, + "learning_rate": 1.7333012266744527e-05, + "loss": 0.0216, + "step": 8339 + }, + { + "epoch": 3.3916226108174055, + "grad_norm": 13.405093684491606, + "learning_rate": 1.733232335849985e-05, + "loss": 0.6889, + "step": 8340 + }, + { + "epoch": 3.3920292801952012, + "grad_norm": 10.46092249101441, + "learning_rate": 1.7331634374985187e-05, + "loss": 0.3333, + "step": 8341 + }, + { + "epoch": 3.392435949572997, + "grad_norm": 5.982880142597084, + "learning_rate": 1.7330945316207604e-05, + "loss": 0.1317, + "step": 8342 + }, + { + "epoch": 3.392842618950793, + "grad_norm": 3.8859591619914737, + "learning_rate": 1.733025618217418e-05, + "loss": 0.0841, + "step": 8343 + }, + { + "epoch": 3.393249288328589, + "grad_norm": 7.0602720504250165, + "learning_rate": 1.732956697289199e-05, + "loss": 0.1482, + "step": 8344 + }, + { + "epoch": 3.3936559577063847, + "grad_norm": 2.0573547927601568, + "learning_rate": 1.73288776883681e-05, + "loss": 0.0323, + "step": 8345 + }, + { + "epoch": 3.3940626270841805, + "grad_norm": 10.330044428227136, + "learning_rate": 1.7328188328609595e-05, + "loss": 0.5051, + "step": 8346 + }, + { + "epoch": 3.3944692964619763, + "grad_norm": 10.050275791663013, + "learning_rate": 1.7327498893623548e-05, + "loss": 0.3326, + "step": 8347 + }, + { + "epoch": 3.3948759658397725, + "grad_norm": 7.636237318078678, + "learning_rate": 1.732680938341704e-05, + "loss": 0.4483, + "step": 8348 + }, + { + "epoch": 3.3952826352175682, + "grad_norm": 13.358891120312318, + "learning_rate": 1.732611979799714e-05, + "loss": 0.3556, + "step": 8349 + }, + { + "epoch": 3.395689304595364, + "grad_norm": 10.516863354840947, + "learning_rate": 1.732543013737094e-05, + "loss": 0.4825, + "step": 8350 + }, + { + "epoch": 3.3960959739731598, + "grad_norm": 9.1406324381472, + "learning_rate": 1.732474040154551e-05, + "loss": 0.1785, + "step": 8351 + }, + { + "epoch": 3.3965026433509555, + "grad_norm": 7.729818567782095, + "learning_rate": 1.732405059052793e-05, + "loss": 0.1591, + "step": 8352 + }, + { + "epoch": 3.3969093127287513, + "grad_norm": 29.869631418077102, + "learning_rate": 1.732336070432529e-05, + "loss": 0.7031, + "step": 8353 + }, + { + "epoch": 3.3973159821065475, + "grad_norm": 11.23779949732526, + "learning_rate": 1.732267074294466e-05, + "loss": 0.3443, + "step": 8354 + }, + { + "epoch": 3.3977226514843433, + "grad_norm": 9.636337336993114, + "learning_rate": 1.7321980706393135e-05, + "loss": 0.2115, + "step": 8355 + }, + { + "epoch": 3.398129320862139, + "grad_norm": 3.0429408089367773, + "learning_rate": 1.7321290594677792e-05, + "loss": 0.0576, + "step": 8356 + }, + { + "epoch": 3.398535990239935, + "grad_norm": 16.43677545671242, + "learning_rate": 1.7320600407805718e-05, + "loss": 0.3777, + "step": 8357 + }, + { + "epoch": 3.398942659617731, + "grad_norm": 6.674628246350126, + "learning_rate": 1.731991014578399e-05, + "loss": 0.1907, + "step": 8358 + }, + { + "epoch": 3.3993493289955268, + "grad_norm": 15.42179158740702, + "learning_rate": 1.731921980861971e-05, + "loss": 0.3795, + "step": 8359 + }, + { + "epoch": 3.3997559983733225, + "grad_norm": 4.433368619798972, + "learning_rate": 1.7318529396319946e-05, + "loss": 0.1578, + "step": 8360 + }, + { + "epoch": 3.4001626677511183, + "grad_norm": 12.06232748427862, + "learning_rate": 1.7317838908891797e-05, + "loss": 0.6701, + "step": 8361 + }, + { + "epoch": 3.400569337128914, + 
"grad_norm": 11.125970161943913, + "learning_rate": 1.7317148346342348e-05, + "loss": 0.3097, + "step": 8362 + }, + { + "epoch": 3.40097600650671, + "grad_norm": 9.724098590170652, + "learning_rate": 1.7316457708678692e-05, + "loss": 0.3013, + "step": 8363 + }, + { + "epoch": 3.401382675884506, + "grad_norm": 4.23667821719794, + "learning_rate": 1.7315766995907914e-05, + "loss": 0.0637, + "step": 8364 + }, + { + "epoch": 3.401789345262302, + "grad_norm": 2.7305140838591138, + "learning_rate": 1.7315076208037107e-05, + "loss": 0.0598, + "step": 8365 + }, + { + "epoch": 3.4021960146400976, + "grad_norm": 0.6506773975743531, + "learning_rate": 1.7314385345073356e-05, + "loss": 0.0125, + "step": 8366 + }, + { + "epoch": 3.4026026840178933, + "grad_norm": 5.745643042378075, + "learning_rate": 1.731369440702376e-05, + "loss": 0.5411, + "step": 8367 + }, + { + "epoch": 3.4030093533956896, + "grad_norm": 6.331186062717615, + "learning_rate": 1.731300339389541e-05, + "loss": 0.2756, + "step": 8368 + }, + { + "epoch": 3.4034160227734853, + "grad_norm": 3.391920564591494, + "learning_rate": 1.7312312305695403e-05, + "loss": 0.0729, + "step": 8369 + }, + { + "epoch": 3.403822692151281, + "grad_norm": 5.1463189720089035, + "learning_rate": 1.731162114243083e-05, + "loss": 0.1219, + "step": 8370 + }, + { + "epoch": 3.404229361529077, + "grad_norm": 3.2887836719701617, + "learning_rate": 1.731092990410878e-05, + "loss": 0.0664, + "step": 8371 + }, + { + "epoch": 3.4046360309068726, + "grad_norm": 11.484784113642453, + "learning_rate": 1.7310238590736362e-05, + "loss": 0.6563, + "step": 8372 + }, + { + "epoch": 3.4050427002846684, + "grad_norm": 5.337935491117338, + "learning_rate": 1.7309547202320663e-05, + "loss": 0.1405, + "step": 8373 + }, + { + "epoch": 3.4054493696624646, + "grad_norm": 4.869250434919441, + "learning_rate": 1.7308855738868784e-05, + "loss": 0.1779, + "step": 8374 + }, + { + "epoch": 3.4058560390402604, + "grad_norm": 10.167564197236388, + "learning_rate": 1.7308164200387822e-05, + "loss": 0.6046, + "step": 8375 + }, + { + "epoch": 3.406262708418056, + "grad_norm": 12.054612725644647, + "learning_rate": 1.7307472586884873e-05, + "loss": 0.599, + "step": 8376 + }, + { + "epoch": 3.406669377795852, + "grad_norm": 9.98838771931904, + "learning_rate": 1.7306780898367044e-05, + "loss": 0.4643, + "step": 8377 + }, + { + "epoch": 3.4070760471736476, + "grad_norm": 10.494950227255961, + "learning_rate": 1.7306089134841428e-05, + "loss": 0.2645, + "step": 8378 + }, + { + "epoch": 3.407482716551444, + "grad_norm": 14.387880027212905, + "learning_rate": 1.7305397296315134e-05, + "loss": 1.2091, + "step": 8379 + }, + { + "epoch": 3.4078893859292396, + "grad_norm": 8.497097913385739, + "learning_rate": 1.730470538279526e-05, + "loss": 0.2676, + "step": 8380 + }, + { + "epoch": 3.4082960553070354, + "grad_norm": 10.040826126214489, + "learning_rate": 1.7304013394288905e-05, + "loss": 0.4019, + "step": 8381 + }, + { + "epoch": 3.408702724684831, + "grad_norm": 4.8894046479254225, + "learning_rate": 1.730332133080318e-05, + "loss": 0.0966, + "step": 8382 + }, + { + "epoch": 3.409109394062627, + "grad_norm": 11.107122303926795, + "learning_rate": 1.7302629192345186e-05, + "loss": 0.3804, + "step": 8383 + }, + { + "epoch": 3.409516063440423, + "grad_norm": 8.86099119240948, + "learning_rate": 1.7301936978922028e-05, + "loss": 0.498, + "step": 8384 + }, + { + "epoch": 3.409922732818219, + "grad_norm": 5.781209953514146, + "learning_rate": 1.7301244690540808e-05, + "loss": 0.2583, + "step": 8385 + }, + 
{ + "epoch": 3.4103294021960147, + "grad_norm": 0.43255126097576485, + "learning_rate": 1.7300552327208638e-05, + "loss": 0.005, + "step": 8386 + }, + { + "epoch": 3.4107360715738104, + "grad_norm": 8.352577974841408, + "learning_rate": 1.7299859888932626e-05, + "loss": 0.2492, + "step": 8387 + }, + { + "epoch": 3.411142740951606, + "grad_norm": 5.087377653351276, + "learning_rate": 1.729916737571988e-05, + "loss": 0.1822, + "step": 8388 + }, + { + "epoch": 3.4115494103294024, + "grad_norm": 9.970408973502103, + "learning_rate": 1.7298474787577505e-05, + "loss": 0.3248, + "step": 8389 + }, + { + "epoch": 3.411956079707198, + "grad_norm": 4.193174729612291, + "learning_rate": 1.7297782124512613e-05, + "loss": 0.1369, + "step": 8390 + }, + { + "epoch": 3.412362749084994, + "grad_norm": 13.973167908498299, + "learning_rate": 1.7297089386532312e-05, + "loss": 0.7418, + "step": 8391 + }, + { + "epoch": 3.4127694184627897, + "grad_norm": 1.872594863578889, + "learning_rate": 1.7296396573643717e-05, + "loss": 0.0368, + "step": 8392 + }, + { + "epoch": 3.4131760878405855, + "grad_norm": 13.652018153110426, + "learning_rate": 1.729570368585394e-05, + "loss": 0.463, + "step": 8393 + }, + { + "epoch": 3.4135827572183812, + "grad_norm": 12.280479760380123, + "learning_rate": 1.7295010723170093e-05, + "loss": 0.5375, + "step": 8394 + }, + { + "epoch": 3.4139894265961774, + "grad_norm": 11.67673526438557, + "learning_rate": 1.729431768559929e-05, + "loss": 0.7616, + "step": 8395 + }, + { + "epoch": 3.414396095973973, + "grad_norm": 21.854928386679614, + "learning_rate": 1.7293624573148644e-05, + "loss": 1.1535, + "step": 8396 + }, + { + "epoch": 3.414802765351769, + "grad_norm": 1.8182177433582825, + "learning_rate": 1.7292931385825273e-05, + "loss": 0.0338, + "step": 8397 + }, + { + "epoch": 3.4152094347295647, + "grad_norm": 0.4538939195460024, + "learning_rate": 1.7292238123636287e-05, + "loss": 0.0066, + "step": 8398 + }, + { + "epoch": 3.415616104107361, + "grad_norm": 3.339963147611362, + "learning_rate": 1.729154478658881e-05, + "loss": 0.064, + "step": 8399 + }, + { + "epoch": 3.4160227734851567, + "grad_norm": 13.21325877396413, + "learning_rate": 1.7290851374689957e-05, + "loss": 0.2657, + "step": 8400 + }, + { + "epoch": 3.4164294428629525, + "grad_norm": 0.26574700684309427, + "learning_rate": 1.7290157887946843e-05, + "loss": 0.0056, + "step": 8401 + }, + { + "epoch": 3.4168361122407482, + "grad_norm": 5.151190243536334, + "learning_rate": 1.7289464326366588e-05, + "loss": 0.235, + "step": 8402 + }, + { + "epoch": 3.417242781618544, + "grad_norm": 0.08451740061602425, + "learning_rate": 1.7288770689956316e-05, + "loss": 0.0012, + "step": 8403 + }, + { + "epoch": 3.4176494509963398, + "grad_norm": 6.870399041895158, + "learning_rate": 1.728807697872314e-05, + "loss": 0.3388, + "step": 8404 + }, + { + "epoch": 3.418056120374136, + "grad_norm": 4.749686781571611, + "learning_rate": 1.728738319267419e-05, + "loss": 0.0724, + "step": 8405 + }, + { + "epoch": 3.4184627897519317, + "grad_norm": 7.442120742131773, + "learning_rate": 1.7286689331816583e-05, + "loss": 0.3483, + "step": 8406 + }, + { + "epoch": 3.4188694591297275, + "grad_norm": 5.978048348695656, + "learning_rate": 1.728599539615744e-05, + "loss": 0.372, + "step": 8407 + }, + { + "epoch": 3.4192761285075233, + "grad_norm": 5.022349426645621, + "learning_rate": 1.728530138570389e-05, + "loss": 0.0849, + "step": 8408 + }, + { + "epoch": 3.4196827978853195, + "grad_norm": 5.123495247464687, + "learning_rate": 1.7284607300463054e-05, + 
"loss": 0.0887, + "step": 8409 + }, + { + "epoch": 3.4200894672631152, + "grad_norm": 5.297376325184946, + "learning_rate": 1.728391314044206e-05, + "loss": 0.0956, + "step": 8410 + }, + { + "epoch": 3.420496136640911, + "grad_norm": 4.031100340318981, + "learning_rate": 1.7283218905648027e-05, + "loss": 0.0769, + "step": 8411 + }, + { + "epoch": 3.4209028060187068, + "grad_norm": 16.28349773042784, + "learning_rate": 1.728252459608809e-05, + "loss": 0.9179, + "step": 8412 + }, + { + "epoch": 3.4213094753965025, + "grad_norm": 2.688756863787497, + "learning_rate": 1.7281830211769373e-05, + "loss": 0.0473, + "step": 8413 + }, + { + "epoch": 3.4217161447742983, + "grad_norm": 1.5650998157364313, + "learning_rate": 1.7281135752699002e-05, + "loss": 0.0194, + "step": 8414 + }, + { + "epoch": 3.4221228141520945, + "grad_norm": 8.85737165780642, + "learning_rate": 1.728044121888411e-05, + "loss": 0.4647, + "step": 8415 + }, + { + "epoch": 3.4225294835298903, + "grad_norm": 1.548698896973878, + "learning_rate": 1.7279746610331822e-05, + "loss": 0.034, + "step": 8416 + }, + { + "epoch": 3.422936152907686, + "grad_norm": 14.784260112534374, + "learning_rate": 1.727905192704927e-05, + "loss": 1.0116, + "step": 8417 + }, + { + "epoch": 3.423342822285482, + "grad_norm": 30.970820634749742, + "learning_rate": 1.727835716904359e-05, + "loss": 0.3052, + "step": 8418 + }, + { + "epoch": 3.4237494916632776, + "grad_norm": 3.9456599750747556, + "learning_rate": 1.727766233632191e-05, + "loss": 0.0712, + "step": 8419 + }, + { + "epoch": 3.424156161041074, + "grad_norm": 33.37244996237045, + "learning_rate": 1.7276967428891366e-05, + "loss": 0.2183, + "step": 8420 + }, + { + "epoch": 3.4245628304188696, + "grad_norm": 7.354235766333171, + "learning_rate": 1.7276272446759084e-05, + "loss": 0.1353, + "step": 8421 + }, + { + "epoch": 3.4249694997966653, + "grad_norm": 10.234880258558748, + "learning_rate": 1.7275577389932205e-05, + "loss": 0.5168, + "step": 8422 + }, + { + "epoch": 3.425376169174461, + "grad_norm": 19.341047540382977, + "learning_rate": 1.7274882258417864e-05, + "loss": 0.5106, + "step": 8423 + }, + { + "epoch": 3.425782838552257, + "grad_norm": 11.408103255860597, + "learning_rate": 1.7274187052223192e-05, + "loss": 0.4057, + "step": 8424 + }, + { + "epoch": 3.426189507930053, + "grad_norm": 0.7787919952023644, + "learning_rate": 1.727349177135533e-05, + "loss": 0.0122, + "step": 8425 + }, + { + "epoch": 3.426596177307849, + "grad_norm": 8.419725185324953, + "learning_rate": 1.7272796415821417e-05, + "loss": 0.4593, + "step": 8426 + }, + { + "epoch": 3.4270028466856446, + "grad_norm": 29.62110375181963, + "learning_rate": 1.7272100985628585e-05, + "loss": 0.4914, + "step": 8427 + }, + { + "epoch": 3.4274095160634404, + "grad_norm": 7.280481175104315, + "learning_rate": 1.7271405480783977e-05, + "loss": 0.5409, + "step": 8428 + }, + { + "epoch": 3.427816185441236, + "grad_norm": 6.242041657666355, + "learning_rate": 1.7270709901294737e-05, + "loss": 0.1144, + "step": 8429 + }, + { + "epoch": 3.4282228548190323, + "grad_norm": 4.300330813627842, + "learning_rate": 1.7270014247167996e-05, + "loss": 0.1522, + "step": 8430 + }, + { + "epoch": 3.428629524196828, + "grad_norm": 1.9542056095343125, + "learning_rate": 1.72693185184109e-05, + "loss": 0.042, + "step": 8431 + }, + { + "epoch": 3.429036193574624, + "grad_norm": 9.10493502899875, + "learning_rate": 1.726862271503059e-05, + "loss": 0.0248, + "step": 8432 + }, + { + "epoch": 3.4294428629524196, + "grad_norm": 2.0966006240431647, + 
"learning_rate": 1.7267926837034213e-05, + "loss": 0.0209, + "step": 8433 + }, + { + "epoch": 3.4298495323302154, + "grad_norm": 17.562160372459573, + "learning_rate": 1.7267230884428905e-05, + "loss": 1.6024, + "step": 8434 + }, + { + "epoch": 3.430256201708011, + "grad_norm": 15.001234655430412, + "learning_rate": 1.7266534857221816e-05, + "loss": 0.7935, + "step": 8435 + }, + { + "epoch": 3.4306628710858074, + "grad_norm": 1.6044199462595352, + "learning_rate": 1.726583875542009e-05, + "loss": 0.0238, + "step": 8436 + }, + { + "epoch": 3.431069540463603, + "grad_norm": 15.226080722927946, + "learning_rate": 1.7265142579030872e-05, + "loss": 0.662, + "step": 8437 + }, + { + "epoch": 3.431476209841399, + "grad_norm": 10.918455423126641, + "learning_rate": 1.726444632806131e-05, + "loss": 0.3643, + "step": 8438 + }, + { + "epoch": 3.4318828792191947, + "grad_norm": 0.8619600239969191, + "learning_rate": 1.726375000251855e-05, + "loss": 0.0148, + "step": 8439 + }, + { + "epoch": 3.432289548596991, + "grad_norm": 3.738151782670385, + "learning_rate": 1.726305360240974e-05, + "loss": 0.155, + "step": 8440 + }, + { + "epoch": 3.4326962179747866, + "grad_norm": 9.300277255716898, + "learning_rate": 1.726235712774203e-05, + "loss": 0.2372, + "step": 8441 + }, + { + "epoch": 3.4331028873525824, + "grad_norm": 1.9614133722863836, + "learning_rate": 1.7261660578522566e-05, + "loss": 0.0273, + "step": 8442 + }, + { + "epoch": 3.433509556730378, + "grad_norm": 1.3806857199851568, + "learning_rate": 1.7260963954758504e-05, + "loss": 0.0212, + "step": 8443 + }, + { + "epoch": 3.433916226108174, + "grad_norm": 0.998462613137278, + "learning_rate": 1.7260267256456993e-05, + "loss": 0.0127, + "step": 8444 + }, + { + "epoch": 3.4343228954859697, + "grad_norm": 10.792317734767156, + "learning_rate": 1.7259570483625185e-05, + "loss": 0.7507, + "step": 8445 + }, + { + "epoch": 3.434729564863766, + "grad_norm": 4.818741775229415, + "learning_rate": 1.725887363627023e-05, + "loss": 0.1213, + "step": 8446 + }, + { + "epoch": 3.4351362342415617, + "grad_norm": 7.766818029058626, + "learning_rate": 1.7258176714399288e-05, + "loss": 0.3336, + "step": 8447 + }, + { + "epoch": 3.4355429036193574, + "grad_norm": 2.8158519543076315, + "learning_rate": 1.7257479718019504e-05, + "loss": 0.0538, + "step": 8448 + }, + { + "epoch": 3.435949572997153, + "grad_norm": 7.690755127333314, + "learning_rate": 1.7256782647138042e-05, + "loss": 0.1884, + "step": 8449 + }, + { + "epoch": 3.4363562423749494, + "grad_norm": 18.3065568080991, + "learning_rate": 1.725608550176205e-05, + "loss": 0.9454, + "step": 8450 + }, + { + "epoch": 3.436762911752745, + "grad_norm": 15.445157796801796, + "learning_rate": 1.7255388281898692e-05, + "loss": 0.8864, + "step": 8451 + }, + { + "epoch": 3.437169581130541, + "grad_norm": 16.160495615718332, + "learning_rate": 1.725469098755512e-05, + "loss": 0.6597, + "step": 8452 + }, + { + "epoch": 3.4375762505083367, + "grad_norm": 4.306918519433118, + "learning_rate": 1.7253993618738495e-05, + "loss": 0.0866, + "step": 8453 + }, + { + "epoch": 3.4379829198861325, + "grad_norm": 5.982461438731612, + "learning_rate": 1.7253296175455972e-05, + "loss": 0.1568, + "step": 8454 + }, + { + "epoch": 3.4383895892639282, + "grad_norm": 6.73378196962894, + "learning_rate": 1.725259865771472e-05, + "loss": 0.1164, + "step": 8455 + }, + { + "epoch": 3.4387962586417244, + "grad_norm": 24.868492443374095, + "learning_rate": 1.7251901065521884e-05, + "loss": 0.2006, + "step": 8456 + }, + { + "epoch": 3.43920292801952, + 
"grad_norm": 22.55635681158764, + "learning_rate": 1.7251203398884637e-05, + "loss": 0.9406, + "step": 8457 + }, + { + "epoch": 3.439609597397316, + "grad_norm": 14.255122189342227, + "learning_rate": 1.7250505657810138e-05, + "loss": 0.6739, + "step": 8458 + }, + { + "epoch": 3.4400162667751117, + "grad_norm": 17.968439393883415, + "learning_rate": 1.724980784230555e-05, + "loss": 0.9966, + "step": 8459 + }, + { + "epoch": 3.4404229361529075, + "grad_norm": 6.691251938145542, + "learning_rate": 1.7249109952378037e-05, + "loss": 0.1544, + "step": 8460 + }, + { + "epoch": 3.4408296055307037, + "grad_norm": 8.590688072171467, + "learning_rate": 1.724841198803476e-05, + "loss": 0.2732, + "step": 8461 + }, + { + "epoch": 3.4412362749084995, + "grad_norm": 8.178831582360994, + "learning_rate": 1.7247713949282885e-05, + "loss": 0.2418, + "step": 8462 + }, + { + "epoch": 3.4416429442862952, + "grad_norm": 2.1129254354864355, + "learning_rate": 1.724701583612958e-05, + "loss": 0.074, + "step": 8463 + }, + { + "epoch": 3.442049613664091, + "grad_norm": 2.888292599811289, + "learning_rate": 1.724631764858201e-05, + "loss": 0.0625, + "step": 8464 + }, + { + "epoch": 3.4424562830418868, + "grad_norm": 12.56798596423942, + "learning_rate": 1.724561938664734e-05, + "loss": 0.4264, + "step": 8465 + }, + { + "epoch": 3.442862952419683, + "grad_norm": 9.237123815481898, + "learning_rate": 1.7244921050332743e-05, + "loss": 0.2031, + "step": 8466 + }, + { + "epoch": 3.4432696217974788, + "grad_norm": 12.281790808448303, + "learning_rate": 1.7244222639645384e-05, + "loss": 0.7022, + "step": 8467 + }, + { + "epoch": 3.4436762911752745, + "grad_norm": 11.476693198081113, + "learning_rate": 1.7243524154592438e-05, + "loss": 0.4233, + "step": 8468 + }, + { + "epoch": 3.4440829605530703, + "grad_norm": 3.4910003242787497, + "learning_rate": 1.7242825595181065e-05, + "loss": 0.0751, + "step": 8469 + }, + { + "epoch": 3.444489629930866, + "grad_norm": 17.42842536530294, + "learning_rate": 1.7242126961418445e-05, + "loss": 1.1111, + "step": 8470 + }, + { + "epoch": 3.4448962993086623, + "grad_norm": 5.661690539767515, + "learning_rate": 1.7241428253311746e-05, + "loss": 0.1412, + "step": 8471 + }, + { + "epoch": 3.445302968686458, + "grad_norm": 5.586474342392906, + "learning_rate": 1.724072947086814e-05, + "loss": 0.1715, + "step": 8472 + }, + { + "epoch": 3.445709638064254, + "grad_norm": 7.161877801668576, + "learning_rate": 1.7240030614094804e-05, + "loss": 0.1627, + "step": 8473 + }, + { + "epoch": 3.4461163074420496, + "grad_norm": 7.7937693405743085, + "learning_rate": 1.723933168299891e-05, + "loss": 0.2069, + "step": 8474 + }, + { + "epoch": 3.4465229768198453, + "grad_norm": 7.374792368780141, + "learning_rate": 1.723863267758763e-05, + "loss": 0.3493, + "step": 8475 + }, + { + "epoch": 3.446929646197641, + "grad_norm": 22.209054784249698, + "learning_rate": 1.7237933597868143e-05, + "loss": 0.5671, + "step": 8476 + }, + { + "epoch": 3.4473363155754373, + "grad_norm": 9.684479705521182, + "learning_rate": 1.723723444384763e-05, + "loss": 0.8434, + "step": 8477 + }, + { + "epoch": 3.447742984953233, + "grad_norm": 12.068167745442743, + "learning_rate": 1.723653521553326e-05, + "loss": 0.5155, + "step": 8478 + }, + { + "epoch": 3.448149654331029, + "grad_norm": 2.269530162240619, + "learning_rate": 1.7235835912932213e-05, + "loss": 0.0306, + "step": 8479 + }, + { + "epoch": 3.4485563237088246, + "grad_norm": 15.097847251621538, + "learning_rate": 1.7235136536051667e-05, + "loss": 0.6906, + "step": 8480 + }, + 
{ + "epoch": 3.448962993086621, + "grad_norm": 17.073341483237773, + "learning_rate": 1.7234437084898804e-05, + "loss": 0.8782, + "step": 8481 + }, + { + "epoch": 3.4493696624644166, + "grad_norm": 6.149755378491331, + "learning_rate": 1.7233737559480805e-05, + "loss": 0.1806, + "step": 8482 + }, + { + "epoch": 3.4497763318422123, + "grad_norm": 1.4084751566108673, + "learning_rate": 1.7233037959804847e-05, + "loss": 0.0245, + "step": 8483 + }, + { + "epoch": 3.450183001220008, + "grad_norm": 9.230527397631127, + "learning_rate": 1.7232338285878117e-05, + "loss": 0.3064, + "step": 8484 + }, + { + "epoch": 3.450589670597804, + "grad_norm": 7.967015824239968, + "learning_rate": 1.7231638537707794e-05, + "loss": 0.4638, + "step": 8485 + }, + { + "epoch": 3.4509963399755996, + "grad_norm": 1.836763267083483, + "learning_rate": 1.723093871530106e-05, + "loss": 0.0262, + "step": 8486 + }, + { + "epoch": 3.451403009353396, + "grad_norm": 2.744591841470115, + "learning_rate": 1.7230238818665103e-05, + "loss": 0.074, + "step": 8487 + }, + { + "epoch": 3.4518096787311916, + "grad_norm": 10.86587194585161, + "learning_rate": 1.7229538847807104e-05, + "loss": 0.8103, + "step": 8488 + }, + { + "epoch": 3.4522163481089874, + "grad_norm": 8.377126938290079, + "learning_rate": 1.722883880273425e-05, + "loss": 0.0974, + "step": 8489 + }, + { + "epoch": 3.452623017486783, + "grad_norm": 3.23833369339431, + "learning_rate": 1.722813868345373e-05, + "loss": 0.0668, + "step": 8490 + }, + { + "epoch": 3.4530296868645793, + "grad_norm": 4.414858415850505, + "learning_rate": 1.7227438489972725e-05, + "loss": 0.1576, + "step": 8491 + }, + { + "epoch": 3.453436356242375, + "grad_norm": 2.0923464564357364, + "learning_rate": 1.722673822229843e-05, + "loss": 0.0419, + "step": 8492 + }, + { + "epoch": 3.453843025620171, + "grad_norm": 5.197858592072603, + "learning_rate": 1.7226037880438028e-05, + "loss": 0.1518, + "step": 8493 + }, + { + "epoch": 3.4542496949979666, + "grad_norm": 12.805688854880948, + "learning_rate": 1.722533746439871e-05, + "loss": 0.764, + "step": 8494 + }, + { + "epoch": 3.4546563643757624, + "grad_norm": 0.16979075991378195, + "learning_rate": 1.7224636974187666e-05, + "loss": 0.0031, + "step": 8495 + }, + { + "epoch": 3.455063033753558, + "grad_norm": 5.291643191479005, + "learning_rate": 1.722393640981209e-05, + "loss": 0.1089, + "step": 8496 + }, + { + "epoch": 3.4554697031313544, + "grad_norm": 9.777928238682936, + "learning_rate": 1.7223235771279166e-05, + "loss": 0.691, + "step": 8497 + }, + { + "epoch": 3.45587637250915, + "grad_norm": 12.595461360547196, + "learning_rate": 1.72225350585961e-05, + "loss": 0.6384, + "step": 8498 + }, + { + "epoch": 3.456283041886946, + "grad_norm": 5.361414321612986, + "learning_rate": 1.7221834271770065e-05, + "loss": 0.1372, + "step": 8499 + }, + { + "epoch": 3.4566897112647417, + "grad_norm": 9.550770919010215, + "learning_rate": 1.7221133410808276e-05, + "loss": 0.5219, + "step": 8500 + }, + { + "epoch": 3.4570963806425374, + "grad_norm": 7.518650430452968, + "learning_rate": 1.7220432475717914e-05, + "loss": 0.2812, + "step": 8501 + }, + { + "epoch": 3.4575030500203336, + "grad_norm": 6.985695308786033, + "learning_rate": 1.721973146650618e-05, + "loss": 0.2319, + "step": 8502 + }, + { + "epoch": 3.4579097193981294, + "grad_norm": 13.285006064373933, + "learning_rate": 1.7219030383180268e-05, + "loss": 0.6112, + "step": 8503 + }, + { + "epoch": 3.458316388775925, + "grad_norm": 15.47525809345113, + "learning_rate": 1.7218329225747376e-05, + "loss": 
0.5737, + "step": 8504 + }, + { + "epoch": 3.458723058153721, + "grad_norm": 8.024350811556383, + "learning_rate": 1.7217627994214703e-05, + "loss": 0.3576, + "step": 8505 + }, + { + "epoch": 3.4591297275315167, + "grad_norm": 10.235713501800019, + "learning_rate": 1.7216926688589443e-05, + "loss": 0.3173, + "step": 8506 + }, + { + "epoch": 3.459536396909313, + "grad_norm": 9.522243305468612, + "learning_rate": 1.7216225308878802e-05, + "loss": 0.5815, + "step": 8507 + }, + { + "epoch": 3.4599430662871087, + "grad_norm": 17.630308467409762, + "learning_rate": 1.7215523855089976e-05, + "loss": 0.817, + "step": 8508 + }, + { + "epoch": 3.4603497356649044, + "grad_norm": 11.880405006581354, + "learning_rate": 1.7214822327230166e-05, + "loss": 0.4076, + "step": 8509 + }, + { + "epoch": 3.4607564050427, + "grad_norm": 5.184802014209828, + "learning_rate": 1.721412072530657e-05, + "loss": 0.1568, + "step": 8510 + }, + { + "epoch": 3.461163074420496, + "grad_norm": 6.824756370770561, + "learning_rate": 1.72134190493264e-05, + "loss": 0.3315, + "step": 8511 + }, + { + "epoch": 3.461569743798292, + "grad_norm": 0.31868830669347775, + "learning_rate": 1.7212717299296847e-05, + "loss": 0.0079, + "step": 8512 + }, + { + "epoch": 3.461976413176088, + "grad_norm": 3.3667341997433042, + "learning_rate": 1.7212015475225123e-05, + "loss": 0.0739, + "step": 8513 + }, + { + "epoch": 3.4623830825538837, + "grad_norm": 7.30916834304426, + "learning_rate": 1.7211313577118432e-05, + "loss": 0.2443, + "step": 8514 + }, + { + "epoch": 3.4627897519316795, + "grad_norm": 7.689968256352759, + "learning_rate": 1.7210611604983976e-05, + "loss": 0.2796, + "step": 8515 + }, + { + "epoch": 3.4631964213094752, + "grad_norm": 2.689865501298482, + "learning_rate": 1.7209909558828964e-05, + "loss": 0.0469, + "step": 8516 + }, + { + "epoch": 3.463603090687271, + "grad_norm": 9.748032902519554, + "learning_rate": 1.72092074386606e-05, + "loss": 0.479, + "step": 8517 + }, + { + "epoch": 3.4640097600650672, + "grad_norm": 4.71907055008964, + "learning_rate": 1.7208505244486096e-05, + "loss": 0.0896, + "step": 8518 + }, + { + "epoch": 3.464416429442863, + "grad_norm": 9.982820391140606, + "learning_rate": 1.7207802976312656e-05, + "loss": 0.3592, + "step": 8519 + }, + { + "epoch": 3.4648230988206588, + "grad_norm": 26.360389916057112, + "learning_rate": 1.7207100634147493e-05, + "loss": 0.2872, + "step": 8520 + }, + { + "epoch": 3.4652297681984545, + "grad_norm": 4.722712336429822, + "learning_rate": 1.720639821799781e-05, + "loss": 0.095, + "step": 8521 + }, + { + "epoch": 3.4656364375762507, + "grad_norm": 7.910906245798357, + "learning_rate": 1.7205695727870828e-05, + "loss": 0.3442, + "step": 8522 + }, + { + "epoch": 3.4660431069540465, + "grad_norm": 11.5774243925145, + "learning_rate": 1.7204993163773745e-05, + "loss": 0.4222, + "step": 8523 + }, + { + "epoch": 3.4664497763318423, + "grad_norm": 11.2587610718314, + "learning_rate": 1.7204290525713787e-05, + "loss": 0.4164, + "step": 8524 + }, + { + "epoch": 3.466856445709638, + "grad_norm": 6.031405423941548, + "learning_rate": 1.7203587813698162e-05, + "loss": 0.2641, + "step": 8525 + }, + { + "epoch": 3.467263115087434, + "grad_norm": 7.144179443449143, + "learning_rate": 1.720288502773408e-05, + "loss": 0.0844, + "step": 8526 + }, + { + "epoch": 3.4676697844652296, + "grad_norm": 2.591763086144086, + "learning_rate": 1.7202182167828757e-05, + "loss": 0.0478, + "step": 8527 + }, + { + "epoch": 3.4680764538430258, + "grad_norm": 6.7183066893444225, + "learning_rate": 
1.7201479233989413e-05, + "loss": 0.3248, + "step": 8528 + }, + { + "epoch": 3.4684831232208215, + "grad_norm": 10.115690222893164, + "learning_rate": 1.7200776226223256e-05, + "loss": 0.3005, + "step": 8529 + }, + { + "epoch": 3.4688897925986173, + "grad_norm": 3.7593268595160985, + "learning_rate": 1.720007314453751e-05, + "loss": 0.0619, + "step": 8530 + }, + { + "epoch": 3.469296461976413, + "grad_norm": 9.442608250898973, + "learning_rate": 1.719936998893939e-05, + "loss": 0.5672, + "step": 8531 + }, + { + "epoch": 3.4697031313542093, + "grad_norm": 4.702443359819832, + "learning_rate": 1.7198666759436112e-05, + "loss": 0.0889, + "step": 8532 + }, + { + "epoch": 3.470109800732005, + "grad_norm": 0.5000900580875676, + "learning_rate": 1.7197963456034898e-05, + "loss": 0.0102, + "step": 8533 + }, + { + "epoch": 3.470516470109801, + "grad_norm": 13.443778293747824, + "learning_rate": 1.719726007874296e-05, + "loss": 0.4588, + "step": 8534 + }, + { + "epoch": 3.4709231394875966, + "grad_norm": 5.449650727517704, + "learning_rate": 1.719655662756753e-05, + "loss": 0.0937, + "step": 8535 + }, + { + "epoch": 3.4713298088653923, + "grad_norm": 3.494095309017227, + "learning_rate": 1.7195853102515826e-05, + "loss": 0.0717, + "step": 8536 + }, + { + "epoch": 3.471736478243188, + "grad_norm": 5.875601028865094, + "learning_rate": 1.7195149503595068e-05, + "loss": 0.1139, + "step": 8537 + }, + { + "epoch": 3.4721431476209843, + "grad_norm": 13.33228827267031, + "learning_rate": 1.7194445830812475e-05, + "loss": 0.6537, + "step": 8538 + }, + { + "epoch": 3.47254981699878, + "grad_norm": 9.982066617555258, + "learning_rate": 1.719374208417528e-05, + "loss": 0.3402, + "step": 8539 + }, + { + "epoch": 3.472956486376576, + "grad_norm": 1.5540674673227162, + "learning_rate": 1.71930382636907e-05, + "loss": 0.0329, + "step": 8540 + }, + { + "epoch": 3.4733631557543716, + "grad_norm": 6.657069375737792, + "learning_rate": 1.7192334369365963e-05, + "loss": 0.4096, + "step": 8541 + }, + { + "epoch": 3.4737698251321674, + "grad_norm": 16.191446984841487, + "learning_rate": 1.719163040120829e-05, + "loss": 1.5116, + "step": 8542 + }, + { + "epoch": 3.4741764945099636, + "grad_norm": 10.540893989959951, + "learning_rate": 1.7190926359224914e-05, + "loss": 0.3527, + "step": 8543 + }, + { + "epoch": 3.4745831638877593, + "grad_norm": 4.153562315499316, + "learning_rate": 1.719022224342306e-05, + "loss": 0.1074, + "step": 8544 + }, + { + "epoch": 3.474989833265555, + "grad_norm": 10.541973510860808, + "learning_rate": 1.7189518053809955e-05, + "loss": 0.4585, + "step": 8545 + }, + { + "epoch": 3.475396502643351, + "grad_norm": 14.18983267416299, + "learning_rate": 1.7188813790392833e-05, + "loss": 0.36, + "step": 8546 + }, + { + "epoch": 3.4758031720211466, + "grad_norm": 1.067275442338285, + "learning_rate": 1.7188109453178914e-05, + "loss": 0.0084, + "step": 8547 + }, + { + "epoch": 3.476209841398943, + "grad_norm": 5.916077869986091, + "learning_rate": 1.7187405042175432e-05, + "loss": 0.3698, + "step": 8548 + }, + { + "epoch": 3.4766165107767386, + "grad_norm": 9.668483672493759, + "learning_rate": 1.7186700557389627e-05, + "loss": 0.2927, + "step": 8549 + }, + { + "epoch": 3.4770231801545344, + "grad_norm": 4.304349079736031, + "learning_rate": 1.718599599882872e-05, + "loss": 0.101, + "step": 8550 + }, + { + "epoch": 3.47742984953233, + "grad_norm": 1.4781579837669576, + "learning_rate": 1.7185291366499947e-05, + "loss": 0.0238, + "step": 8551 + }, + { + "epoch": 3.477836518910126, + "grad_norm": 
28.452412939928923, + "learning_rate": 1.7184586660410542e-05, + "loss": 0.9106, + "step": 8552 + }, + { + "epoch": 3.478243188287922, + "grad_norm": 6.475413044652346, + "learning_rate": 1.7183881880567742e-05, + "loss": 0.1418, + "step": 8553 + }, + { + "epoch": 3.478649857665718, + "grad_norm": 0.5729608006723554, + "learning_rate": 1.7183177026978777e-05, + "loss": 0.0082, + "step": 8554 + }, + { + "epoch": 3.4790565270435136, + "grad_norm": 11.784712370019708, + "learning_rate": 1.7182472099650882e-05, + "loss": 0.5077, + "step": 8555 + }, + { + "epoch": 3.4794631964213094, + "grad_norm": 7.7821221769461975, + "learning_rate": 1.71817670985913e-05, + "loss": 0.3303, + "step": 8556 + }, + { + "epoch": 3.479869865799105, + "grad_norm": 1.4169415819859308, + "learning_rate": 1.718106202380726e-05, + "loss": 0.0275, + "step": 8557 + }, + { + "epoch": 3.480276535176901, + "grad_norm": 1.185594095403685, + "learning_rate": 1.7180356875306008e-05, + "loss": 0.0274, + "step": 8558 + }, + { + "epoch": 3.480683204554697, + "grad_norm": 5.948772177921442, + "learning_rate": 1.7179651653094776e-05, + "loss": 0.3318, + "step": 8559 + }, + { + "epoch": 3.481089873932493, + "grad_norm": 9.04952686726625, + "learning_rate": 1.717894635718081e-05, + "loss": 0.2265, + "step": 8560 + }, + { + "epoch": 3.4814965433102887, + "grad_norm": 8.318889335021439, + "learning_rate": 1.7178240987571343e-05, + "loss": 0.1994, + "step": 8561 + }, + { + "epoch": 3.4819032126880844, + "grad_norm": 12.025531854941752, + "learning_rate": 1.717753554427362e-05, + "loss": 0.6653, + "step": 8562 + }, + { + "epoch": 3.4823098820658807, + "grad_norm": 7.8837910070745325, + "learning_rate": 1.7176830027294885e-05, + "loss": 0.4397, + "step": 8563 + }, + { + "epoch": 3.4827165514436764, + "grad_norm": 0.6982761751091165, + "learning_rate": 1.7176124436642373e-05, + "loss": 0.0098, + "step": 8564 + }, + { + "epoch": 3.483123220821472, + "grad_norm": 6.7203213902160535, + "learning_rate": 1.7175418772323334e-05, + "loss": 0.2161, + "step": 8565 + }, + { + "epoch": 3.483529890199268, + "grad_norm": 2.021240522866068, + "learning_rate": 1.7174713034345013e-05, + "loss": 0.0426, + "step": 8566 + }, + { + "epoch": 3.4839365595770637, + "grad_norm": 15.712134152618997, + "learning_rate": 1.717400722271465e-05, + "loss": 0.5206, + "step": 8567 + }, + { + "epoch": 3.4843432289548595, + "grad_norm": 7.14061438752858, + "learning_rate": 1.717330133743949e-05, + "loss": 0.2354, + "step": 8568 + }, + { + "epoch": 3.4847498983326557, + "grad_norm": 10.154671146567043, + "learning_rate": 1.7172595378526785e-05, + "loss": 0.359, + "step": 8569 + }, + { + "epoch": 3.4851565677104515, + "grad_norm": 6.250545122841601, + "learning_rate": 1.7171889345983778e-05, + "loss": 0.2044, + "step": 8570 + }, + { + "epoch": 3.4855632370882472, + "grad_norm": 3.371030099719298, + "learning_rate": 1.7171183239817714e-05, + "loss": 0.1717, + "step": 8571 + }, + { + "epoch": 3.485969906466043, + "grad_norm": 3.562280603676735, + "learning_rate": 1.7170477060035847e-05, + "loss": 0.1017, + "step": 8572 + }, + { + "epoch": 3.486376575843839, + "grad_norm": 0.22636339570119932, + "learning_rate": 1.7169770806645424e-05, + "loss": 0.0041, + "step": 8573 + }, + { + "epoch": 3.486783245221635, + "grad_norm": 6.015699782241491, + "learning_rate": 1.7169064479653697e-05, + "loss": 0.1575, + "step": 8574 + }, + { + "epoch": 3.4871899145994307, + "grad_norm": 22.365479218859466, + "learning_rate": 1.716835807906791e-05, + "loss": 1.5756, + "step": 8575 + }, + { + 
"epoch": 3.4875965839772265, + "grad_norm": 1.2179921676914047, + "learning_rate": 1.7167651604895326e-05, + "loss": 0.0183, + "step": 8576 + }, + { + "epoch": 3.4880032533550223, + "grad_norm": 8.449657330866955, + "learning_rate": 1.7166945057143187e-05, + "loss": 0.3171, + "step": 8577 + }, + { + "epoch": 3.488409922732818, + "grad_norm": 8.67979522189132, + "learning_rate": 1.716623843581875e-05, + "loss": 0.2296, + "step": 8578 + }, + { + "epoch": 3.4888165921106142, + "grad_norm": 0.2532951648145703, + "learning_rate": 1.7165531740929272e-05, + "loss": 0.0043, + "step": 8579 + }, + { + "epoch": 3.48922326148841, + "grad_norm": 2.624183700733605, + "learning_rate": 1.7164824972482e-05, + "loss": 0.034, + "step": 8580 + }, + { + "epoch": 3.4896299308662058, + "grad_norm": 6.459741821577236, + "learning_rate": 1.7164118130484195e-05, + "loss": 0.2036, + "step": 8581 + }, + { + "epoch": 3.4900366002440015, + "grad_norm": 6.830521774909719, + "learning_rate": 1.7163411214943114e-05, + "loss": 0.2501, + "step": 8582 + }, + { + "epoch": 3.4904432696217973, + "grad_norm": 6.777461569082155, + "learning_rate": 1.716270422586601e-05, + "loss": 0.2332, + "step": 8583 + }, + { + "epoch": 3.4908499389995935, + "grad_norm": 4.176062956986427, + "learning_rate": 1.716199716326014e-05, + "loss": 0.0846, + "step": 8584 + }, + { + "epoch": 3.4912566083773893, + "grad_norm": 2.023846519943457, + "learning_rate": 1.716129002713277e-05, + "loss": 0.032, + "step": 8585 + }, + { + "epoch": 3.491663277755185, + "grad_norm": 11.79995720116079, + "learning_rate": 1.7160582817491146e-05, + "loss": 0.4335, + "step": 8586 + }, + { + "epoch": 3.492069947132981, + "grad_norm": 12.444661902637126, + "learning_rate": 1.7159875534342542e-05, + "loss": 0.5926, + "step": 8587 + }, + { + "epoch": 3.4924766165107766, + "grad_norm": 11.327625103970075, + "learning_rate": 1.715916817769421e-05, + "loss": 0.0732, + "step": 8588 + }, + { + "epoch": 3.4928832858885728, + "grad_norm": 17.68807530877981, + "learning_rate": 1.7158460747553413e-05, + "loss": 0.8591, + "step": 8589 + }, + { + "epoch": 3.4932899552663685, + "grad_norm": 13.48756635037949, + "learning_rate": 1.7157753243927415e-05, + "loss": 0.4678, + "step": 8590 + }, + { + "epoch": 3.4936966246441643, + "grad_norm": 5.170730710393698, + "learning_rate": 1.7157045666823474e-05, + "loss": 0.1038, + "step": 8591 + }, + { + "epoch": 3.49410329402196, + "grad_norm": 0.3021458754740739, + "learning_rate": 1.715633801624886e-05, + "loss": 0.0044, + "step": 8592 + }, + { + "epoch": 3.494509963399756, + "grad_norm": 8.076556748516422, + "learning_rate": 1.7155630292210837e-05, + "loss": 0.7432, + "step": 8593 + }, + { + "epoch": 3.494916632777552, + "grad_norm": 4.269293639533135, + "learning_rate": 1.7154922494716665e-05, + "loss": 0.3129, + "step": 8594 + }, + { + "epoch": 3.495323302155348, + "grad_norm": 0.4725862724054618, + "learning_rate": 1.7154214623773613e-05, + "loss": 0.0071, + "step": 8595 + }, + { + "epoch": 3.4957299715331436, + "grad_norm": 2.271804547972023, + "learning_rate": 1.7153506679388946e-05, + "loss": 0.0217, + "step": 8596 + }, + { + "epoch": 3.4961366409109393, + "grad_norm": 9.136931715512652, + "learning_rate": 1.7152798661569934e-05, + "loss": 0.4237, + "step": 8597 + }, + { + "epoch": 3.496543310288735, + "grad_norm": 8.373823631366339, + "learning_rate": 1.7152090570323845e-05, + "loss": 0.2363, + "step": 8598 + }, + { + "epoch": 3.496949979666531, + "grad_norm": 0.3688583795214611, + "learning_rate": 1.7151382405657943e-05, + "loss": 
0.006, + "step": 8599 + }, + { + "epoch": 3.497356649044327, + "grad_norm": 13.327103015075638, + "learning_rate": 1.7150674167579503e-05, + "loss": 0.6056, + "step": 8600 + }, + { + "epoch": 3.497763318422123, + "grad_norm": 11.10838185889715, + "learning_rate": 1.7149965856095798e-05, + "loss": 0.3703, + "step": 8601 + }, + { + "epoch": 3.4981699877999186, + "grad_norm": 7.365343008856243, + "learning_rate": 1.714925747121409e-05, + "loss": 0.1432, + "step": 8602 + }, + { + "epoch": 3.4985766571777144, + "grad_norm": 7.324156333689176, + "learning_rate": 1.7148549012941655e-05, + "loss": 0.3629, + "step": 8603 + }, + { + "epoch": 3.4989833265555106, + "grad_norm": 5.3388389423028615, + "learning_rate": 1.7147840481285772e-05, + "loss": 0.2305, + "step": 8604 + }, + { + "epoch": 3.4993899959333064, + "grad_norm": 8.998371520001726, + "learning_rate": 1.7147131876253706e-05, + "loss": 0.2757, + "step": 8605 + }, + { + "epoch": 3.499796665311102, + "grad_norm": 0.1334479394889888, + "learning_rate": 1.7146423197852733e-05, + "loss": 0.0018, + "step": 8606 + }, + { + "epoch": 3.500203334688898, + "grad_norm": 10.734414490824246, + "learning_rate": 1.7145714446090128e-05, + "loss": 0.5183, + "step": 8607 + }, + { + "epoch": 3.5006100040666936, + "grad_norm": 5.519084937516475, + "learning_rate": 1.714500562097317e-05, + "loss": 0.0847, + "step": 8608 + }, + { + "epoch": 3.5010166734444894, + "grad_norm": 4.042442580922115, + "learning_rate": 1.7144296722509136e-05, + "loss": 0.0745, + "step": 8609 + }, + { + "epoch": 3.5014233428222856, + "grad_norm": 10.530529631121654, + "learning_rate": 1.7143587750705295e-05, + "loss": 0.8535, + "step": 8610 + }, + { + "epoch": 3.5018300122000814, + "grad_norm": 4.417056539068016, + "learning_rate": 1.7142878705568936e-05, + "loss": 0.0598, + "step": 8611 + }, + { + "epoch": 3.502236681577877, + "grad_norm": 5.547369646902791, + "learning_rate": 1.714216958710733e-05, + "loss": 0.1195, + "step": 8612 + }, + { + "epoch": 3.502643350955673, + "grad_norm": 10.6547868798323, + "learning_rate": 1.7141460395327755e-05, + "loss": 0.2714, + "step": 8613 + }, + { + "epoch": 3.503050020333469, + "grad_norm": 4.976095495650154, + "learning_rate": 1.71407511302375e-05, + "loss": 0.1132, + "step": 8614 + }, + { + "epoch": 3.503456689711265, + "grad_norm": 3.600246431729066, + "learning_rate": 1.714004179184384e-05, + "loss": 0.086, + "step": 8615 + }, + { + "epoch": 3.5038633590890607, + "grad_norm": 7.206974300783671, + "learning_rate": 1.7139332380154053e-05, + "loss": 0.188, + "step": 8616 + }, + { + "epoch": 3.5042700284668564, + "grad_norm": 0.9612866195302231, + "learning_rate": 1.7138622895175433e-05, + "loss": 0.0149, + "step": 8617 + }, + { + "epoch": 3.504676697844652, + "grad_norm": 2.5235472697267896, + "learning_rate": 1.713791333691525e-05, + "loss": 0.0421, + "step": 8618 + }, + { + "epoch": 3.505083367222448, + "grad_norm": 9.545646230696468, + "learning_rate": 1.71372037053808e-05, + "loss": 0.4598, + "step": 8619 + }, + { + "epoch": 3.505490036600244, + "grad_norm": 8.336741807500172, + "learning_rate": 1.7136494000579355e-05, + "loss": 0.278, + "step": 8620 + }, + { + "epoch": 3.50589670597804, + "grad_norm": 3.6669094161433247, + "learning_rate": 1.7135784222518214e-05, + "loss": 0.0443, + "step": 8621 + }, + { + "epoch": 3.5063033753558357, + "grad_norm": 3.2385753482557544, + "learning_rate": 1.713507437120465e-05, + "loss": 0.056, + "step": 8622 + }, + { + "epoch": 3.5067100447336315, + "grad_norm": 9.531625397200957, + "learning_rate": 
1.713436444664596e-05, + "loss": 0.3427, + "step": 8623 + }, + { + "epoch": 3.5071167141114277, + "grad_norm": 8.169790252549323, + "learning_rate": 1.713365444884943e-05, + "loss": 0.1611, + "step": 8624 + }, + { + "epoch": 3.5075233834892234, + "grad_norm": 13.689671756730892, + "learning_rate": 1.7132944377822346e-05, + "loss": 0.7511, + "step": 8625 + }, + { + "epoch": 3.507930052867019, + "grad_norm": 7.5292629147170675, + "learning_rate": 1.7132234233571993e-05, + "loss": 0.0876, + "step": 8626 + }, + { + "epoch": 3.508336722244815, + "grad_norm": 1.966044705051467, + "learning_rate": 1.7131524016105673e-05, + "loss": 0.0392, + "step": 8627 + }, + { + "epoch": 3.5087433916226107, + "grad_norm": 7.531429671359364, + "learning_rate": 1.7130813725430666e-05, + "loss": 0.2277, + "step": 8628 + }, + { + "epoch": 3.5091500610004065, + "grad_norm": 1.1476705066187236, + "learning_rate": 1.7130103361554267e-05, + "loss": 0.0287, + "step": 8629 + }, + { + "epoch": 3.5095567303782023, + "grad_norm": 0.34872925660305687, + "learning_rate": 1.712939292448377e-05, + "loss": 0.0053, + "step": 8630 + }, + { + "epoch": 3.5099633997559985, + "grad_norm": 6.720882287695717, + "learning_rate": 1.712868241422647e-05, + "loss": 0.4356, + "step": 8631 + }, + { + "epoch": 3.5103700691337942, + "grad_norm": 5.328552579972056, + "learning_rate": 1.712797183078965e-05, + "loss": 0.2197, + "step": 8632 + }, + { + "epoch": 3.51077673851159, + "grad_norm": 6.182880832758093, + "learning_rate": 1.7127261174180614e-05, + "loss": 0.6904, + "step": 8633 + }, + { + "epoch": 3.5111834078893858, + "grad_norm": 10.64548773466659, + "learning_rate": 1.7126550444406657e-05, + "loss": 0.4525, + "step": 8634 + }, + { + "epoch": 3.511590077267182, + "grad_norm": 1.756470451776359, + "learning_rate": 1.7125839641475074e-05, + "loss": 0.0236, + "step": 8635 + }, + { + "epoch": 3.5119967466449777, + "grad_norm": 16.845232287806766, + "learning_rate": 1.7125128765393157e-05, + "loss": 1.1278, + "step": 8636 + }, + { + "epoch": 3.5124034160227735, + "grad_norm": 8.436129416358499, + "learning_rate": 1.712441781616821e-05, + "loss": 0.3528, + "step": 8637 + }, + { + "epoch": 3.5128100854005693, + "grad_norm": 7.796332810811867, + "learning_rate": 1.7123706793807528e-05, + "loss": 0.25, + "step": 8638 + }, + { + "epoch": 3.513216754778365, + "grad_norm": 5.757829371931299, + "learning_rate": 1.712299569831841e-05, + "loss": 0.1782, + "step": 8639 + }, + { + "epoch": 3.513623424156161, + "grad_norm": 26.90208840515561, + "learning_rate": 1.7122284529708154e-05, + "loss": 1.3523, + "step": 8640 + }, + { + "epoch": 3.514030093533957, + "grad_norm": 1.3005627948965626, + "learning_rate": 1.7121573287984065e-05, + "loss": 0.0166, + "step": 8641 + }, + { + "epoch": 3.5144367629117528, + "grad_norm": 27.793603038720985, + "learning_rate": 1.712086197315344e-05, + "loss": 0.7996, + "step": 8642 + }, + { + "epoch": 3.5148434322895485, + "grad_norm": 2.61786822951301, + "learning_rate": 1.7120150585223583e-05, + "loss": 0.0555, + "step": 8643 + }, + { + "epoch": 3.5152501016673443, + "grad_norm": 4.605618866268596, + "learning_rate": 1.71194391242018e-05, + "loss": 0.1387, + "step": 8644 + }, + { + "epoch": 3.5156567710451405, + "grad_norm": 1.4954687521943364, + "learning_rate": 1.711872759009539e-05, + "loss": 0.0233, + "step": 8645 + }, + { + "epoch": 3.5160634404229363, + "grad_norm": 2.6044354517723693, + "learning_rate": 1.7118015982911657e-05, + "loss": 0.0356, + "step": 8646 + }, + { + "epoch": 3.516470109800732, + "grad_norm": 
6.264655161011819, + "learning_rate": 1.711730430265791e-05, + "loss": 0.1313, + "step": 8647 + }, + { + "epoch": 3.516876779178528, + "grad_norm": 4.111706593025248, + "learning_rate": 1.7116592549341452e-05, + "loss": 0.1373, + "step": 8648 + }, + { + "epoch": 3.5172834485563236, + "grad_norm": 6.362512783298129, + "learning_rate": 1.7115880722969585e-05, + "loss": 0.1867, + "step": 8649 + }, + { + "epoch": 3.5176901179341193, + "grad_norm": 8.048456993206823, + "learning_rate": 1.7115168823549627e-05, + "loss": 0.4809, + "step": 8650 + }, + { + "epoch": 3.5180967873119156, + "grad_norm": 20.506725722808387, + "learning_rate": 1.711445685108888e-05, + "loss": 0.5818, + "step": 8651 + }, + { + "epoch": 3.5185034566897113, + "grad_norm": 7.773547934222158, + "learning_rate": 1.711374480559465e-05, + "loss": 0.1755, + "step": 8652 + }, + { + "epoch": 3.518910126067507, + "grad_norm": 4.086153997943547, + "learning_rate": 1.711303268707425e-05, + "loss": 0.1811, + "step": 8653 + }, + { + "epoch": 3.519316795445303, + "grad_norm": 8.585627236381342, + "learning_rate": 1.711232049553499e-05, + "loss": 0.1767, + "step": 8654 + }, + { + "epoch": 3.519723464823099, + "grad_norm": 9.509674597535835, + "learning_rate": 1.711160823098418e-05, + "loss": 0.2465, + "step": 8655 + }, + { + "epoch": 3.520130134200895, + "grad_norm": 12.314697932219525, + "learning_rate": 1.711089589342913e-05, + "loss": 0.2203, + "step": 8656 + }, + { + "epoch": 3.5205368035786906, + "grad_norm": 4.028167782410008, + "learning_rate": 1.7110183482877157e-05, + "loss": 0.1541, + "step": 8657 + }, + { + "epoch": 3.5209434729564864, + "grad_norm": 10.434966423939937, + "learning_rate": 1.7109470999335574e-05, + "loss": 0.4104, + "step": 8658 + }, + { + "epoch": 3.521350142334282, + "grad_norm": 2.482003323497433, + "learning_rate": 1.7108758442811694e-05, + "loss": 0.021, + "step": 8659 + }, + { + "epoch": 3.521756811712078, + "grad_norm": 4.973993851916002, + "learning_rate": 1.7108045813312822e-05, + "loss": 0.1702, + "step": 8660 + }, + { + "epoch": 3.522163481089874, + "grad_norm": 1.5437350571348585, + "learning_rate": 1.710733311084629e-05, + "loss": 0.0321, + "step": 8661 + }, + { + "epoch": 3.52257015046767, + "grad_norm": 4.646714876641743, + "learning_rate": 1.7106620335419407e-05, + "loss": 0.0959, + "step": 8662 + }, + { + "epoch": 3.5229768198454656, + "grad_norm": 10.761362262193828, + "learning_rate": 1.7105907487039487e-05, + "loss": 0.2869, + "step": 8663 + }, + { + "epoch": 3.5233834892232614, + "grad_norm": 0.9960058681663352, + "learning_rate": 1.710519456571385e-05, + "loss": 0.0172, + "step": 8664 + }, + { + "epoch": 3.5237901586010576, + "grad_norm": 3.4919565817046467, + "learning_rate": 1.7104481571449816e-05, + "loss": 0.1348, + "step": 8665 + }, + { + "epoch": 3.5241968279788534, + "grad_norm": 4.1230930763031886, + "learning_rate": 1.71037685042547e-05, + "loss": 0.0559, + "step": 8666 + }, + { + "epoch": 3.524603497356649, + "grad_norm": 13.46882942122633, + "learning_rate": 1.710305536413583e-05, + "loss": 0.3644, + "step": 8667 + }, + { + "epoch": 3.525010166734445, + "grad_norm": 11.771360093309292, + "learning_rate": 1.710234215110052e-05, + "loss": 0.614, + "step": 8668 + }, + { + "epoch": 3.5254168361122407, + "grad_norm": 2.0704494234513455, + "learning_rate": 1.7101628865156094e-05, + "loss": 0.0229, + "step": 8669 + }, + { + "epoch": 3.5258235054900364, + "grad_norm": 8.422605867755316, + "learning_rate": 1.7100915506309874e-05, + "loss": 0.3301, + "step": 8670 + }, + { + "epoch": 
3.526230174867832, + "grad_norm": 15.129848693167041, + "learning_rate": 1.7100202074569177e-05, + "loss": 0.4363, + "step": 8671 + }, + { + "epoch": 3.5266368442456284, + "grad_norm": 5.652664579289671, + "learning_rate": 1.709948856994134e-05, + "loss": 0.306, + "step": 8672 + }, + { + "epoch": 3.527043513623424, + "grad_norm": 10.074642314304603, + "learning_rate": 1.7098774992433673e-05, + "loss": 0.2633, + "step": 8673 + }, + { + "epoch": 3.52745018300122, + "grad_norm": 10.016450847909969, + "learning_rate": 1.7098061342053515e-05, + "loss": 0.3597, + "step": 8674 + }, + { + "epoch": 3.5278568523790157, + "grad_norm": 3.87109319062751, + "learning_rate": 1.7097347618808178e-05, + "loss": 0.1723, + "step": 8675 + }, + { + "epoch": 3.528263521756812, + "grad_norm": 29.55152220871766, + "learning_rate": 1.7096633822704997e-05, + "loss": 0.7694, + "step": 8676 + }, + { + "epoch": 3.5286701911346077, + "grad_norm": 8.993472459475882, + "learning_rate": 1.70959199537513e-05, + "loss": 0.4634, + "step": 8677 + }, + { + "epoch": 3.5290768605124034, + "grad_norm": 6.353795187555492, + "learning_rate": 1.7095206011954415e-05, + "loss": 0.4603, + "step": 8678 + }, + { + "epoch": 3.529483529890199, + "grad_norm": 0.30521702515796173, + "learning_rate": 1.7094491997321667e-05, + "loss": 0.0057, + "step": 8679 + }, + { + "epoch": 3.529890199267995, + "grad_norm": 13.356129612796254, + "learning_rate": 1.709377790986039e-05, + "loss": 0.7971, + "step": 8680 + }, + { + "epoch": 3.5302968686457907, + "grad_norm": 13.834126623484142, + "learning_rate": 1.709306374957791e-05, + "loss": 0.9118, + "step": 8681 + }, + { + "epoch": 3.530703538023587, + "grad_norm": 2.903151516060567, + "learning_rate": 1.709234951648156e-05, + "loss": 0.0681, + "step": 8682 + }, + { + "epoch": 3.5311102074013827, + "grad_norm": 0.7903019606977001, + "learning_rate": 1.7091635210578676e-05, + "loss": 0.0146, + "step": 8683 + }, + { + "epoch": 3.5315168767791785, + "grad_norm": 11.822862396495537, + "learning_rate": 1.7090920831876584e-05, + "loss": 0.5718, + "step": 8684 + }, + { + "epoch": 3.5319235461569742, + "grad_norm": 21.67046450485222, + "learning_rate": 1.7090206380382626e-05, + "loss": 0.9003, + "step": 8685 + }, + { + "epoch": 3.5323302155347704, + "grad_norm": 6.360078353613333, + "learning_rate": 1.7089491856104126e-05, + "loss": 0.2217, + "step": 8686 + }, + { + "epoch": 3.532736884912566, + "grad_norm": 4.800015351401036, + "learning_rate": 1.7088777259048427e-05, + "loss": 0.1328, + "step": 8687 + }, + { + "epoch": 3.533143554290362, + "grad_norm": 2.8998100154783057, + "learning_rate": 1.708806258922286e-05, + "loss": 0.0398, + "step": 8688 + }, + { + "epoch": 3.5335502236681577, + "grad_norm": 9.672930938013748, + "learning_rate": 1.7087347846634764e-05, + "loss": 0.2166, + "step": 8689 + }, + { + "epoch": 3.5339568930459535, + "grad_norm": 5.727642830431466, + "learning_rate": 1.7086633031291474e-05, + "loss": 0.115, + "step": 8690 + }, + { + "epoch": 3.5343635624237493, + "grad_norm": 4.769853476665077, + "learning_rate": 1.708591814320033e-05, + "loss": 0.0795, + "step": 8691 + }, + { + "epoch": 3.5347702318015455, + "grad_norm": 8.172945475961754, + "learning_rate": 1.708520318236867e-05, + "loss": 0.7982, + "step": 8692 + }, + { + "epoch": 3.5351769011793412, + "grad_norm": 5.694776594355364, + "learning_rate": 1.7084488148803834e-05, + "loss": 0.1381, + "step": 8693 + }, + { + "epoch": 3.535583570557137, + "grad_norm": 1.8551032055571883, + "learning_rate": 1.7083773042513163e-05, + "loss": 0.032, + 
"step": 8694 + }, + { + "epoch": 3.5359902399349328, + "grad_norm": 4.484237895487096, + "learning_rate": 1.7083057863503992e-05, + "loss": 0.094, + "step": 8695 + }, + { + "epoch": 3.536396909312729, + "grad_norm": 15.957605314725996, + "learning_rate": 1.708234261178367e-05, + "loss": 0.7352, + "step": 8696 + }, + { + "epoch": 3.5368035786905248, + "grad_norm": 1.5519604091535755, + "learning_rate": 1.708162728735954e-05, + "loss": 0.0231, + "step": 8697 + }, + { + "epoch": 3.5372102480683205, + "grad_norm": 2.2502595030764208, + "learning_rate": 1.7080911890238937e-05, + "loss": 0.0437, + "step": 8698 + }, + { + "epoch": 3.5376169174461163, + "grad_norm": 3.862477435223012, + "learning_rate": 1.708019642042921e-05, + "loss": 0.0731, + "step": 8699 + }, + { + "epoch": 3.538023586823912, + "grad_norm": 11.5325559484172, + "learning_rate": 1.7079480877937707e-05, + "loss": 0.6622, + "step": 8700 + }, + { + "epoch": 3.538430256201708, + "grad_norm": 7.857384081386217, + "learning_rate": 1.707876526277177e-05, + "loss": 0.1463, + "step": 8701 + }, + { + "epoch": 3.538836925579504, + "grad_norm": 0.6011209374251074, + "learning_rate": 1.707804957493874e-05, + "loss": 0.0116, + "step": 8702 + }, + { + "epoch": 3.5392435949573, + "grad_norm": 14.356826308540136, + "learning_rate": 1.707733381444597e-05, + "loss": 0.7103, + "step": 8703 + }, + { + "epoch": 3.5396502643350956, + "grad_norm": 6.410909889976649, + "learning_rate": 1.7076617981300807e-05, + "loss": 0.1665, + "step": 8704 + }, + { + "epoch": 3.5400569337128913, + "grad_norm": 28.624532809645945, + "learning_rate": 1.70759020755106e-05, + "loss": 0.4718, + "step": 8705 + }, + { + "epoch": 3.5404636030906875, + "grad_norm": 2.548464977951243, + "learning_rate": 1.7075186097082695e-05, + "loss": 0.0405, + "step": 8706 + }, + { + "epoch": 3.5408702724684833, + "grad_norm": 9.432956588988402, + "learning_rate": 1.707447004602445e-05, + "loss": 0.2915, + "step": 8707 + }, + { + "epoch": 3.541276941846279, + "grad_norm": 12.02817916942162, + "learning_rate": 1.7073753922343203e-05, + "loss": 0.1219, + "step": 8708 + }, + { + "epoch": 3.541683611224075, + "grad_norm": 8.164615743488266, + "learning_rate": 1.7073037726046314e-05, + "loss": 0.1501, + "step": 8709 + }, + { + "epoch": 3.5420902806018706, + "grad_norm": 12.357298093724191, + "learning_rate": 1.7072321457141134e-05, + "loss": 0.6684, + "step": 8710 + }, + { + "epoch": 3.5424969499796664, + "grad_norm": 2.2504893820410876, + "learning_rate": 1.707160511563501e-05, + "loss": 0.0275, + "step": 8711 + }, + { + "epoch": 3.542903619357462, + "grad_norm": 7.42661731135312, + "learning_rate": 1.7070888701535303e-05, + "loss": 0.3965, + "step": 8712 + }, + { + "epoch": 3.5433102887352583, + "grad_norm": 5.392920872290505, + "learning_rate": 1.7070172214849366e-05, + "loss": 0.1163, + "step": 8713 + }, + { + "epoch": 3.543716958113054, + "grad_norm": 7.892908712900886, + "learning_rate": 1.706945565558455e-05, + "loss": 0.375, + "step": 8714 + }, + { + "epoch": 3.54412362749085, + "grad_norm": 14.977906628707515, + "learning_rate": 1.706873902374822e-05, + "loss": 1.0643, + "step": 8715 + }, + { + "epoch": 3.5445302968686456, + "grad_norm": 4.84097560419354, + "learning_rate": 1.706802231934772e-05, + "loss": 0.1035, + "step": 8716 + }, + { + "epoch": 3.544936966246442, + "grad_norm": 2.0215364030089296, + "learning_rate": 1.7067305542390418e-05, + "loss": 0.0532, + "step": 8717 + }, + { + "epoch": 3.5453436356242376, + "grad_norm": 47.11276851090704, + "learning_rate": 
1.7066588692883665e-05, + "loss": 0.6757, + "step": 8718 + }, + { + "epoch": 3.5457503050020334, + "grad_norm": 2.6199434039172793, + "learning_rate": 1.706587177083482e-05, + "loss": 0.0549, + "step": 8719 + }, + { + "epoch": 3.546156974379829, + "grad_norm": 4.489414908716453, + "learning_rate": 1.706515477625125e-05, + "loss": 0.0892, + "step": 8720 + }, + { + "epoch": 3.546563643757625, + "grad_norm": 3.4878593818525627, + "learning_rate": 1.706443770914031e-05, + "loss": 0.0725, + "step": 8721 + }, + { + "epoch": 3.5469703131354207, + "grad_norm": 4.176409921557714, + "learning_rate": 1.706372056950936e-05, + "loss": 0.1648, + "step": 8722 + }, + { + "epoch": 3.547376982513217, + "grad_norm": 8.616922185320332, + "learning_rate": 1.7063003357365763e-05, + "loss": 0.4523, + "step": 8723 + }, + { + "epoch": 3.5477836518910126, + "grad_norm": 15.536870638333706, + "learning_rate": 1.7062286072716883e-05, + "loss": 0.6684, + "step": 8724 + }, + { + "epoch": 3.5481903212688084, + "grad_norm": 1.6239552527635281, + "learning_rate": 1.706156871557008e-05, + "loss": 0.0291, + "step": 8725 + }, + { + "epoch": 3.548596990646604, + "grad_norm": 7.634738619184025, + "learning_rate": 1.7060851285932726e-05, + "loss": 0.1324, + "step": 8726 + }, + { + "epoch": 3.5490036600244004, + "grad_norm": 0.9195502601137109, + "learning_rate": 1.7060133783812173e-05, + "loss": 0.0197, + "step": 8727 + }, + { + "epoch": 3.549410329402196, + "grad_norm": 6.811670197500428, + "learning_rate": 1.7059416209215795e-05, + "loss": 0.1813, + "step": 8728 + }, + { + "epoch": 3.549816998779992, + "grad_norm": 0.5059332858864279, + "learning_rate": 1.7058698562150957e-05, + "loss": 0.0099, + "step": 8729 + }, + { + "epoch": 3.5502236681577877, + "grad_norm": 0.43298008483476413, + "learning_rate": 1.7057980842625025e-05, + "loss": 0.0074, + "step": 8730 + }, + { + "epoch": 3.5506303375355834, + "grad_norm": 12.417100110824443, + "learning_rate": 1.7057263050645373e-05, + "loss": 0.9225, + "step": 8731 + }, + { + "epoch": 3.551037006913379, + "grad_norm": 5.492804652345335, + "learning_rate": 1.7056545186219358e-05, + "loss": 0.1259, + "step": 8732 + }, + { + "epoch": 3.5514436762911754, + "grad_norm": 7.766016356784667, + "learning_rate": 1.7055827249354354e-05, + "loss": 0.1621, + "step": 8733 + }, + { + "epoch": 3.551850345668971, + "grad_norm": 4.2372070150768355, + "learning_rate": 1.7055109240057736e-05, + "loss": 0.1815, + "step": 8734 + }, + { + "epoch": 3.552257015046767, + "grad_norm": 4.879699515268143, + "learning_rate": 1.705439115833687e-05, + "loss": 0.265, + "step": 8735 + }, + { + "epoch": 3.5526636844245627, + "grad_norm": 2.5494289314279923, + "learning_rate": 1.705367300419913e-05, + "loss": 0.0454, + "step": 8736 + }, + { + "epoch": 3.553070353802359, + "grad_norm": 12.403958729102376, + "learning_rate": 1.705295477765188e-05, + "loss": 0.4298, + "step": 8737 + }, + { + "epoch": 3.5534770231801547, + "grad_norm": 22.29224348183534, + "learning_rate": 1.7052236478702507e-05, + "loss": 0.9967, + "step": 8738 + }, + { + "epoch": 3.5538836925579504, + "grad_norm": 5.657977311434893, + "learning_rate": 1.7051518107358377e-05, + "loss": 0.2798, + "step": 8739 + }, + { + "epoch": 3.554290361935746, + "grad_norm": 2.308630174706108, + "learning_rate": 1.705079966362686e-05, + "loss": 0.0358, + "step": 8740 + }, + { + "epoch": 3.554697031313542, + "grad_norm": 10.03572427300996, + "learning_rate": 1.7050081147515337e-05, + "loss": 0.5688, + "step": 8741 + }, + { + "epoch": 3.5551037006913377, + "grad_norm": 
6.405031713099395, + "learning_rate": 1.7049362559031184e-05, + "loss": 0.2704, + "step": 8742 + }, + { + "epoch": 3.555510370069134, + "grad_norm": 11.615308640766845, + "learning_rate": 1.7048643898181776e-05, + "loss": 0.559, + "step": 8743 + }, + { + "epoch": 3.5559170394469297, + "grad_norm": 10.269213608058667, + "learning_rate": 1.704792516497449e-05, + "loss": 0.3823, + "step": 8744 + }, + { + "epoch": 3.5563237088247255, + "grad_norm": 9.749954740101352, + "learning_rate": 1.7047206359416705e-05, + "loss": 0.4844, + "step": 8745 + }, + { + "epoch": 3.5567303782025212, + "grad_norm": 11.344634979452541, + "learning_rate": 1.7046487481515806e-05, + "loss": 0.993, + "step": 8746 + }, + { + "epoch": 3.5571370475803175, + "grad_norm": 0.728533413253741, + "learning_rate": 1.7045768531279157e-05, + "loss": 0.0135, + "step": 8747 + }, + { + "epoch": 3.5575437169581132, + "grad_norm": 11.515242757844009, + "learning_rate": 1.7045049508714155e-05, + "loss": 0.4509, + "step": 8748 + }, + { + "epoch": 3.557950386335909, + "grad_norm": 15.664239323300551, + "learning_rate": 1.7044330413828173e-05, + "loss": 0.6928, + "step": 8749 + }, + { + "epoch": 3.5583570557137048, + "grad_norm": 10.590247372342477, + "learning_rate": 1.7043611246628592e-05, + "loss": 0.3701, + "step": 8750 + }, + { + "epoch": 3.5587637250915005, + "grad_norm": 7.525640284403985, + "learning_rate": 1.7042892007122795e-05, + "loss": 0.1397, + "step": 8751 + }, + { + "epoch": 3.5591703944692963, + "grad_norm": 13.89329302554722, + "learning_rate": 1.704217269531817e-05, + "loss": 0.6938, + "step": 8752 + }, + { + "epoch": 3.559577063847092, + "grad_norm": 17.88395412545745, + "learning_rate": 1.7041453311222098e-05, + "loss": 0.5085, + "step": 8753 + }, + { + "epoch": 3.5599837332248883, + "grad_norm": 7.500751994413865, + "learning_rate": 1.7040733854841966e-05, + "loss": 0.4616, + "step": 8754 + }, + { + "epoch": 3.560390402602684, + "grad_norm": 8.610723140978624, + "learning_rate": 1.7040014326185152e-05, + "loss": 0.2445, + "step": 8755 + }, + { + "epoch": 3.56079707198048, + "grad_norm": 8.057716774593805, + "learning_rate": 1.703929472525905e-05, + "loss": 0.3464, + "step": 8756 + }, + { + "epoch": 3.561203741358276, + "grad_norm": 5.198537111588553, + "learning_rate": 1.7038575052071044e-05, + "loss": 0.1957, + "step": 8757 + }, + { + "epoch": 3.5616104107360718, + "grad_norm": 5.094876320893456, + "learning_rate": 1.7037855306628524e-05, + "loss": 0.1174, + "step": 8758 + }, + { + "epoch": 3.5620170801138675, + "grad_norm": 14.583131923107812, + "learning_rate": 1.703713548893888e-05, + "loss": 0.1886, + "step": 8759 + }, + { + "epoch": 3.5624237494916633, + "grad_norm": 9.42417510310495, + "learning_rate": 1.7036415599009492e-05, + "loss": 0.4069, + "step": 8760 + }, + { + "epoch": 3.562830418869459, + "grad_norm": 6.968159953786915, + "learning_rate": 1.7035695636847762e-05, + "loss": 0.45, + "step": 8761 + }, + { + "epoch": 3.563237088247255, + "grad_norm": 6.978385026915108, + "learning_rate": 1.7034975602461075e-05, + "loss": 0.1914, + "step": 8762 + }, + { + "epoch": 3.5636437576250506, + "grad_norm": 9.864755486703514, + "learning_rate": 1.703425549585682e-05, + "loss": 0.3556, + "step": 8763 + }, + { + "epoch": 3.564050427002847, + "grad_norm": 9.272459774509416, + "learning_rate": 1.7033535317042394e-05, + "loss": 0.5015, + "step": 8764 + }, + { + "epoch": 3.5644570963806426, + "grad_norm": 4.313006095990875, + "learning_rate": 1.7032815066025187e-05, + "loss": 0.1041, + "step": 8765 + }, + { + "epoch": 
3.5648637657584383, + "grad_norm": 12.081142590574311, + "learning_rate": 1.7032094742812593e-05, + "loss": 0.3034, + "step": 8766 + }, + { + "epoch": 3.565270435136234, + "grad_norm": 10.535783914887876, + "learning_rate": 1.703137434741201e-05, + "loss": 0.6282, + "step": 8767 + }, + { + "epoch": 3.5656771045140303, + "grad_norm": 3.9909101176657815, + "learning_rate": 1.7030653879830826e-05, + "loss": 0.1065, + "step": 8768 + }, + { + "epoch": 3.566083773891826, + "grad_norm": 3.37882555035943, + "learning_rate": 1.7029933340076445e-05, + "loss": 0.1924, + "step": 8769 + }, + { + "epoch": 3.566490443269622, + "grad_norm": 0.4497871556079956, + "learning_rate": 1.702921272815626e-05, + "loss": 0.0097, + "step": 8770 + }, + { + "epoch": 3.5668971126474176, + "grad_norm": 5.811293184292409, + "learning_rate": 1.702849204407767e-05, + "loss": 0.0246, + "step": 8771 + }, + { + "epoch": 3.5673037820252134, + "grad_norm": 3.2851441430411374, + "learning_rate": 1.7027771287848068e-05, + "loss": 0.0446, + "step": 8772 + }, + { + "epoch": 3.567710451403009, + "grad_norm": 1.2197637188862076, + "learning_rate": 1.702705045947486e-05, + "loss": 0.02, + "step": 8773 + }, + { + "epoch": 3.5681171207808053, + "grad_norm": 18.49867929918755, + "learning_rate": 1.702632955896544e-05, + "loss": 0.3949, + "step": 8774 + }, + { + "epoch": 3.568523790158601, + "grad_norm": 5.142797737427537, + "learning_rate": 1.702560858632721e-05, + "loss": 0.1457, + "step": 8775 + }, + { + "epoch": 3.568930459536397, + "grad_norm": 10.71197371288443, + "learning_rate": 1.7024887541567575e-05, + "loss": 0.698, + "step": 8776 + }, + { + "epoch": 3.5693371289141926, + "grad_norm": 0.07546777801820127, + "learning_rate": 1.7024166424693933e-05, + "loss": 0.0015, + "step": 8777 + }, + { + "epoch": 3.569743798291989, + "grad_norm": 12.239854628992168, + "learning_rate": 1.7023445235713687e-05, + "loss": 0.1358, + "step": 8778 + }, + { + "epoch": 3.5701504676697846, + "grad_norm": 18.01463454312107, + "learning_rate": 1.702272397463424e-05, + "loss": 0.6801, + "step": 8779 + }, + { + "epoch": 3.5705571370475804, + "grad_norm": 12.301239164296234, + "learning_rate": 1.7022002641463e-05, + "loss": 0.8199, + "step": 8780 + }, + { + "epoch": 3.570963806425376, + "grad_norm": 1.4041594588248596, + "learning_rate": 1.7021281236207366e-05, + "loss": 0.0253, + "step": 8781 + }, + { + "epoch": 3.571370475803172, + "grad_norm": 8.286799895387453, + "learning_rate": 1.7020559758874746e-05, + "loss": 0.4009, + "step": 8782 + }, + { + "epoch": 3.5717771451809677, + "grad_norm": 6.689037472696584, + "learning_rate": 1.701983820947255e-05, + "loss": 0.246, + "step": 8783 + }, + { + "epoch": 3.572183814558764, + "grad_norm": 4.0332582257838885, + "learning_rate": 1.7019116588008176e-05, + "loss": 0.4331, + "step": 8784 + }, + { + "epoch": 3.5725904839365596, + "grad_norm": 2.7521156072927817, + "learning_rate": 1.7018394894489042e-05, + "loss": 0.0364, + "step": 8785 + }, + { + "epoch": 3.5729971533143554, + "grad_norm": 12.868049010953529, + "learning_rate": 1.7017673128922552e-05, + "loss": 1.0935, + "step": 8786 + }, + { + "epoch": 3.573403822692151, + "grad_norm": 16.113384393535295, + "learning_rate": 1.7016951291316114e-05, + "loss": 0.3152, + "step": 8787 + }, + { + "epoch": 3.5738104920699474, + "grad_norm": 12.08780737005493, + "learning_rate": 1.701622938167714e-05, + "loss": 1.182, + "step": 8788 + }, + { + "epoch": 3.574217161447743, + "grad_norm": 4.3382510255482565, + "learning_rate": 1.701550740001304e-05, + "loss": 0.0927, + 
"step": 8789 + }, + { + "epoch": 3.574623830825539, + "grad_norm": 16.85357133743639, + "learning_rate": 1.7014785346331227e-05, + "loss": 0.3496, + "step": 8790 + }, + { + "epoch": 3.5750305002033347, + "grad_norm": 0.8155063694904727, + "learning_rate": 1.701406322063911e-05, + "loss": 0.0189, + "step": 8791 + }, + { + "epoch": 3.5754371695811304, + "grad_norm": 3.2954660628306454, + "learning_rate": 1.7013341022944104e-05, + "loss": 0.0678, + "step": 8792 + }, + { + "epoch": 3.575843838958926, + "grad_norm": 7.83595587104515, + "learning_rate": 1.7012618753253626e-05, + "loss": 0.167, + "step": 8793 + }, + { + "epoch": 3.576250508336722, + "grad_norm": 7.342760766266134, + "learning_rate": 1.7011896411575082e-05, + "loss": 0.2855, + "step": 8794 + }, + { + "epoch": 3.576657177714518, + "grad_norm": 7.6717001459913865, + "learning_rate": 1.7011173997915895e-05, + "loss": 0.2145, + "step": 8795 + }, + { + "epoch": 3.577063847092314, + "grad_norm": 3.3500693807896313, + "learning_rate": 1.701045151228348e-05, + "loss": 0.0717, + "step": 8796 + }, + { + "epoch": 3.5774705164701097, + "grad_norm": 23.99657951787318, + "learning_rate": 1.700972895468525e-05, + "loss": 1.0786, + "step": 8797 + }, + { + "epoch": 3.577877185847906, + "grad_norm": 7.086342513338989, + "learning_rate": 1.7009006325128622e-05, + "loss": 0.2543, + "step": 8798 + }, + { + "epoch": 3.5782838552257017, + "grad_norm": 3.2406163823625076, + "learning_rate": 1.700828362362102e-05, + "loss": 0.0724, + "step": 8799 + }, + { + "epoch": 3.5786905246034975, + "grad_norm": 0.06998757901511385, + "learning_rate": 1.7007560850169856e-05, + "loss": 0.0014, + "step": 8800 + }, + { + "epoch": 3.5790971939812932, + "grad_norm": 2.1416840048727903, + "learning_rate": 1.700683800478256e-05, + "loss": 0.0132, + "step": 8801 + }, + { + "epoch": 3.579503863359089, + "grad_norm": 11.250292305886603, + "learning_rate": 1.700611508746654e-05, + "loss": 0.7807, + "step": 8802 + }, + { + "epoch": 3.5799105327368848, + "grad_norm": 1.4091572768118097, + "learning_rate": 1.7005392098229225e-05, + "loss": 0.0338, + "step": 8803 + }, + { + "epoch": 3.5803172021146805, + "grad_norm": 20.367038555244477, + "learning_rate": 1.700466903707803e-05, + "loss": 1.525, + "step": 8804 + }, + { + "epoch": 3.5807238714924767, + "grad_norm": 10.706327120418422, + "learning_rate": 1.7003945904020383e-05, + "loss": 0.322, + "step": 8805 + }, + { + "epoch": 3.5811305408702725, + "grad_norm": 10.553848004856356, + "learning_rate": 1.700322269906371e-05, + "loss": 0.3644, + "step": 8806 + }, + { + "epoch": 3.5815372102480683, + "grad_norm": 15.258666180169264, + "learning_rate": 1.700249942221543e-05, + "loss": 0.2872, + "step": 8807 + }, + { + "epoch": 3.581943879625864, + "grad_norm": 12.649254585326322, + "learning_rate": 1.7001776073482966e-05, + "loss": 0.5509, + "step": 8808 + }, + { + "epoch": 3.5823505490036602, + "grad_norm": 5.372130525487662, + "learning_rate": 1.7001052652873752e-05, + "loss": 0.0984, + "step": 8809 + }, + { + "epoch": 3.582757218381456, + "grad_norm": 9.774718833191924, + "learning_rate": 1.7000329160395206e-05, + "loss": 0.6375, + "step": 8810 + }, + { + "epoch": 3.5831638877592518, + "grad_norm": 10.015165212501978, + "learning_rate": 1.6999605596054757e-05, + "loss": 0.2416, + "step": 8811 + }, + { + "epoch": 3.5835705571370475, + "grad_norm": 2.0792039401676603, + "learning_rate": 1.6998881959859835e-05, + "loss": 0.033, + "step": 8812 + }, + { + "epoch": 3.5839772265148433, + "grad_norm": 4.954068028012043, + "learning_rate": 
1.699815825181787e-05, + "loss": 0.1945, + "step": 8813 + }, + { + "epoch": 3.584383895892639, + "grad_norm": 3.4133035056476384, + "learning_rate": 1.6997434471936283e-05, + "loss": 0.0672, + "step": 8814 + }, + { + "epoch": 3.5847905652704353, + "grad_norm": 13.263167556086202, + "learning_rate": 1.6996710620222515e-05, + "loss": 0.3105, + "step": 8815 + }, + { + "epoch": 3.585197234648231, + "grad_norm": 10.060462836531743, + "learning_rate": 1.6995986696683985e-05, + "loss": 0.3842, + "step": 8816 + }, + { + "epoch": 3.585603904026027, + "grad_norm": 11.581782836565893, + "learning_rate": 1.699526270132813e-05, + "loss": 0.7888, + "step": 8817 + }, + { + "epoch": 3.5860105734038226, + "grad_norm": 0.9660329331865957, + "learning_rate": 1.6994538634162392e-05, + "loss": 0.0185, + "step": 8818 + }, + { + "epoch": 3.5864172427816188, + "grad_norm": 2.339860022508974, + "learning_rate": 1.699381449519419e-05, + "loss": 0.0631, + "step": 8819 + }, + { + "epoch": 3.5868239121594145, + "grad_norm": 6.64628281387183, + "learning_rate": 1.699309028443096e-05, + "loss": 0.3199, + "step": 8820 + }, + { + "epoch": 3.5872305815372103, + "grad_norm": 8.720920993608056, + "learning_rate": 1.699236600188014e-05, + "loss": 0.2228, + "step": 8821 + }, + { + "epoch": 3.587637250915006, + "grad_norm": 6.681262489089219, + "learning_rate": 1.699164164754916e-05, + "loss": 0.4402, + "step": 8822 + }, + { + "epoch": 3.588043920292802, + "grad_norm": 8.360351132505496, + "learning_rate": 1.6990917221445468e-05, + "loss": 0.4372, + "step": 8823 + }, + { + "epoch": 3.5884505896705976, + "grad_norm": 7.765782855398287, + "learning_rate": 1.6990192723576483e-05, + "loss": 0.1538, + "step": 8824 + }, + { + "epoch": 3.588857259048394, + "grad_norm": 5.025151157692043, + "learning_rate": 1.6989468153949657e-05, + "loss": 0.3241, + "step": 8825 + }, + { + "epoch": 3.5892639284261896, + "grad_norm": 5.588466892241398, + "learning_rate": 1.6988743512572423e-05, + "loss": 0.1417, + "step": 8826 + }, + { + "epoch": 3.5896705978039853, + "grad_norm": 7.159603038001847, + "learning_rate": 1.6988018799452216e-05, + "loss": 0.2147, + "step": 8827 + }, + { + "epoch": 3.590077267181781, + "grad_norm": 0.5947692803316686, + "learning_rate": 1.6987294014596484e-05, + "loss": 0.0074, + "step": 8828 + }, + { + "epoch": 3.5904839365595773, + "grad_norm": 8.274679312972966, + "learning_rate": 1.698656915801266e-05, + "loss": 0.1943, + "step": 8829 + }, + { + "epoch": 3.590890605937373, + "grad_norm": 5.89582066261137, + "learning_rate": 1.6985844229708186e-05, + "loss": 0.1941, + "step": 8830 + }, + { + "epoch": 3.591297275315169, + "grad_norm": 4.004099836467039, + "learning_rate": 1.6985119229690507e-05, + "loss": 0.0836, + "step": 8831 + }, + { + "epoch": 3.5917039446929646, + "grad_norm": 4.294759180200321, + "learning_rate": 1.698439415796706e-05, + "loss": 0.1196, + "step": 8832 + }, + { + "epoch": 3.5921106140707604, + "grad_norm": 11.00715593555342, + "learning_rate": 1.6983669014545297e-05, + "loss": 0.2392, + "step": 8833 + }, + { + "epoch": 3.592517283448556, + "grad_norm": 4.1338874067164655, + "learning_rate": 1.6982943799432654e-05, + "loss": 0.2169, + "step": 8834 + }, + { + "epoch": 3.592923952826352, + "grad_norm": 13.988680473916835, + "learning_rate": 1.698221851263658e-05, + "loss": 0.6576, + "step": 8835 + }, + { + "epoch": 3.593330622204148, + "grad_norm": 6.394920139093037, + "learning_rate": 1.6981493154164517e-05, + "loss": 0.389, + "step": 8836 + }, + { + "epoch": 3.593737291581944, + "grad_norm": 
2.478435371307128, + "learning_rate": 1.6980767724023914e-05, + "loss": 0.0437, + "step": 8837 + }, + { + "epoch": 3.5941439609597396, + "grad_norm": 2.476573707058065, + "learning_rate": 1.6980042222222216e-05, + "loss": 0.0593, + "step": 8838 + }, + { + "epoch": 3.594550630337536, + "grad_norm": 10.85673906114699, + "learning_rate": 1.6979316648766874e-05, + "loss": 0.7602, + "step": 8839 + }, + { + "epoch": 3.5949572997153316, + "grad_norm": 7.262377870603787, + "learning_rate": 1.697859100366533e-05, + "loss": 0.2622, + "step": 8840 + }, + { + "epoch": 3.5953639690931274, + "grad_norm": 2.4928993685957264, + "learning_rate": 1.6977865286925038e-05, + "loss": 0.036, + "step": 8841 + }, + { + "epoch": 3.595770638470923, + "grad_norm": 0.11912207796866772, + "learning_rate": 1.697713949855345e-05, + "loss": 0.0019, + "step": 8842 + }, + { + "epoch": 3.596177307848719, + "grad_norm": 6.8144295979007845, + "learning_rate": 1.697641363855801e-05, + "loss": 0.3032, + "step": 8843 + }, + { + "epoch": 3.5965839772265147, + "grad_norm": 4.3245502571963765, + "learning_rate": 1.6975687706946175e-05, + "loss": 0.1294, + "step": 8844 + }, + { + "epoch": 3.5969906466043104, + "grad_norm": 6.867265854232268, + "learning_rate": 1.6974961703725394e-05, + "loss": 0.2196, + "step": 8845 + }, + { + "epoch": 3.5973973159821067, + "grad_norm": 26.977544565329705, + "learning_rate": 1.6974235628903125e-05, + "loss": 0.4332, + "step": 8846 + }, + { + "epoch": 3.5978039853599024, + "grad_norm": 5.667835211183052, + "learning_rate": 1.6973509482486813e-05, + "loss": 0.1116, + "step": 8847 + }, + { + "epoch": 3.598210654737698, + "grad_norm": 3.948115782984069, + "learning_rate": 1.6972783264483915e-05, + "loss": 0.1742, + "step": 8848 + }, + { + "epoch": 3.598617324115494, + "grad_norm": 7.451895798067584, + "learning_rate": 1.697205697490189e-05, + "loss": 0.5797, + "step": 8849 + }, + { + "epoch": 3.59902399349329, + "grad_norm": 17.290052151829897, + "learning_rate": 1.697133061374819e-05, + "loss": 1.31, + "step": 8850 + }, + { + "epoch": 3.599430662871086, + "grad_norm": 7.803677744332964, + "learning_rate": 1.6970604181030274e-05, + "loss": 0.3471, + "step": 8851 + }, + { + "epoch": 3.5998373322488817, + "grad_norm": 0.2952764155227644, + "learning_rate": 1.6969877676755596e-05, + "loss": 0.0053, + "step": 8852 + }, + { + "epoch": 3.6002440016266775, + "grad_norm": 14.772007504079339, + "learning_rate": 1.696915110093162e-05, + "loss": 0.7869, + "step": 8853 + }, + { + "epoch": 3.6006506710044732, + "grad_norm": 11.047885884269006, + "learning_rate": 1.6968424453565795e-05, + "loss": 0.5212, + "step": 8854 + }, + { + "epoch": 3.601057340382269, + "grad_norm": 8.987218053068686, + "learning_rate": 1.6967697734665586e-05, + "loss": 0.455, + "step": 8855 + }, + { + "epoch": 3.601464009760065, + "grad_norm": 6.305448529602515, + "learning_rate": 1.6966970944238455e-05, + "loss": 0.1299, + "step": 8856 + }, + { + "epoch": 3.601870679137861, + "grad_norm": 10.661593209735317, + "learning_rate": 1.6966244082291864e-05, + "loss": 0.4142, + "step": 8857 + }, + { + "epoch": 3.6022773485156567, + "grad_norm": 8.38256278837082, + "learning_rate": 1.696551714883327e-05, + "loss": 0.3091, + "step": 8858 + }, + { + "epoch": 3.6026840178934525, + "grad_norm": 15.561158303368996, + "learning_rate": 1.6964790143870135e-05, + "loss": 0.5079, + "step": 8859 + }, + { + "epoch": 3.6030906872712487, + "grad_norm": 11.048395150989842, + "learning_rate": 1.6964063067409922e-05, + "loss": 0.5155, + "step": 8860 + }, + { + "epoch": 
3.6034973566490445, + "grad_norm": 3.881380145496879, + "learning_rate": 1.69633359194601e-05, + "loss": 0.0806, + "step": 8861 + }, + { + "epoch": 3.6039040260268402, + "grad_norm": 6.581495999865843, + "learning_rate": 1.6962608700028127e-05, + "loss": 0.2343, + "step": 8862 + }, + { + "epoch": 3.604310695404636, + "grad_norm": 6.499086494519644, + "learning_rate": 1.6961881409121476e-05, + "loss": 0.1867, + "step": 8863 + }, + { + "epoch": 3.6047173647824318, + "grad_norm": 10.466642656405163, + "learning_rate": 1.6961154046747606e-05, + "loss": 0.3165, + "step": 8864 + }, + { + "epoch": 3.6051240341602275, + "grad_norm": 6.952168230333217, + "learning_rate": 1.6960426612913985e-05, + "loss": 0.3312, + "step": 8865 + }, + { + "epoch": 3.6055307035380237, + "grad_norm": 11.767064694881455, + "learning_rate": 1.6959699107628082e-05, + "loss": 0.5269, + "step": 8866 + }, + { + "epoch": 3.6059373729158195, + "grad_norm": 0.8547370145060037, + "learning_rate": 1.6958971530897367e-05, + "loss": 0.0198, + "step": 8867 + }, + { + "epoch": 3.6063440422936153, + "grad_norm": 5.705033195694805, + "learning_rate": 1.6958243882729305e-05, + "loss": 0.0658, + "step": 8868 + }, + { + "epoch": 3.606750711671411, + "grad_norm": 6.54995102126693, + "learning_rate": 1.6957516163131368e-05, + "loss": 0.2823, + "step": 8869 + }, + { + "epoch": 3.6071573810492072, + "grad_norm": 2.1977451315209966, + "learning_rate": 1.6956788372111027e-05, + "loss": 0.0336, + "step": 8870 + }, + { + "epoch": 3.607564050427003, + "grad_norm": 2.1909556674136885, + "learning_rate": 1.6956060509675753e-05, + "loss": 0.0225, + "step": 8871 + }, + { + "epoch": 3.6079707198047988, + "grad_norm": 5.682740256125233, + "learning_rate": 1.6955332575833012e-05, + "loss": 0.1092, + "step": 8872 + }, + { + "epoch": 3.6083773891825945, + "grad_norm": 0.43262810725812606, + "learning_rate": 1.6954604570590287e-05, + "loss": 0.0107, + "step": 8873 + }, + { + "epoch": 3.6087840585603903, + "grad_norm": 3.8361393903465846, + "learning_rate": 1.6953876493955044e-05, + "loss": 0.0729, + "step": 8874 + }, + { + "epoch": 3.609190727938186, + "grad_norm": 0.8564446180835186, + "learning_rate": 1.6953148345934754e-05, + "loss": 0.0155, + "step": 8875 + }, + { + "epoch": 3.609597397315982, + "grad_norm": 6.891203205822987, + "learning_rate": 1.6952420126536905e-05, + "loss": 0.2779, + "step": 8876 + }, + { + "epoch": 3.610004066693778, + "grad_norm": 7.203603609623269, + "learning_rate": 1.695169183576896e-05, + "loss": 0.236, + "step": 8877 + }, + { + "epoch": 3.610410736071574, + "grad_norm": 4.477217622745879, + "learning_rate": 1.69509634736384e-05, + "loss": 0.2713, + "step": 8878 + }, + { + "epoch": 3.6108174054493696, + "grad_norm": 14.966388709272131, + "learning_rate": 1.6950235040152704e-05, + "loss": 0.7217, + "step": 8879 + }, + { + "epoch": 3.611224074827166, + "grad_norm": 11.278617475450947, + "learning_rate": 1.6949506535319345e-05, + "loss": 0.517, + "step": 8880 + }, + { + "epoch": 3.6116307442049616, + "grad_norm": 6.334962623373095, + "learning_rate": 1.6948777959145804e-05, + "loss": 0.1102, + "step": 8881 + }, + { + "epoch": 3.6120374135827573, + "grad_norm": 21.467592849704186, + "learning_rate": 1.6948049311639557e-05, + "loss": 0.2926, + "step": 8882 + }, + { + "epoch": 3.612444082960553, + "grad_norm": 19.110808489162256, + "learning_rate": 1.694732059280809e-05, + "loss": 1.2916, + "step": 8883 + }, + { + "epoch": 3.612850752338349, + "grad_norm": 5.285095296410101, + "learning_rate": 1.6946591802658883e-05, + "loss": 
0.2885, + "step": 8884 + }, + { + "epoch": 3.6132574217161446, + "grad_norm": 4.05324277631839, + "learning_rate": 1.6945862941199412e-05, + "loss": 0.1726, + "step": 8885 + }, + { + "epoch": 3.6136640910939404, + "grad_norm": 5.277102089368849, + "learning_rate": 1.6945134008437164e-05, + "loss": 0.3847, + "step": 8886 + }, + { + "epoch": 3.6140707604717366, + "grad_norm": 6.554496600379959, + "learning_rate": 1.6944405004379618e-05, + "loss": 0.23, + "step": 8887 + }, + { + "epoch": 3.6144774298495324, + "grad_norm": 2.076044944907329, + "learning_rate": 1.694367592903426e-05, + "loss": 0.0367, + "step": 8888 + }, + { + "epoch": 3.614884099227328, + "grad_norm": 8.801170378191294, + "learning_rate": 1.6942946782408574e-05, + "loss": 0.4272, + "step": 8889 + }, + { + "epoch": 3.615290768605124, + "grad_norm": 11.417658223849017, + "learning_rate": 1.6942217564510043e-05, + "loss": 0.5895, + "step": 8890 + }, + { + "epoch": 3.61569743798292, + "grad_norm": 0.44729843214274845, + "learning_rate": 1.694148827534616e-05, + "loss": 0.0066, + "step": 8891 + }, + { + "epoch": 3.616104107360716, + "grad_norm": 15.742068347527304, + "learning_rate": 1.6940758914924404e-05, + "loss": 0.9743, + "step": 8892 + }, + { + "epoch": 3.6165107767385116, + "grad_norm": 9.894064603856345, + "learning_rate": 1.6940029483252262e-05, + "loss": 0.423, + "step": 8893 + }, + { + "epoch": 3.6169174461163074, + "grad_norm": 5.799027073690543, + "learning_rate": 1.6939299980337227e-05, + "loss": 0.2962, + "step": 8894 + }, + { + "epoch": 3.617324115494103, + "grad_norm": 0.3971965085993542, + "learning_rate": 1.693857040618678e-05, + "loss": 0.0058, + "step": 8895 + }, + { + "epoch": 3.617730784871899, + "grad_norm": 3.127970462207875, + "learning_rate": 1.6937840760808427e-05, + "loss": 0.1287, + "step": 8896 + }, + { + "epoch": 3.618137454249695, + "grad_norm": 4.722548319693144, + "learning_rate": 1.693711104420964e-05, + "loss": 0.0945, + "step": 8897 + }, + { + "epoch": 3.618544123627491, + "grad_norm": 0.9233715566982362, + "learning_rate": 1.6936381256397914e-05, + "loss": 0.0145, + "step": 8898 + }, + { + "epoch": 3.6189507930052867, + "grad_norm": 7.854997777353627, + "learning_rate": 1.6935651397380747e-05, + "loss": 0.3164, + "step": 8899 + }, + { + "epoch": 3.6193574623830824, + "grad_norm": 9.057745019262578, + "learning_rate": 1.6934921467165627e-05, + "loss": 0.2697, + "step": 8900 + }, + { + "epoch": 3.6197641317608786, + "grad_norm": 5.400465734823487, + "learning_rate": 1.6934191465760046e-05, + "loss": 0.1508, + "step": 8901 + }, + { + "epoch": 3.6201708011386744, + "grad_norm": 7.532358438225634, + "learning_rate": 1.69334613931715e-05, + "loss": 0.2677, + "step": 8902 + }, + { + "epoch": 3.62057747051647, + "grad_norm": 7.412851256479246, + "learning_rate": 1.6932731249407485e-05, + "loss": 0.2887, + "step": 8903 + }, + { + "epoch": 3.620984139894266, + "grad_norm": 0.14030757072146413, + "learning_rate": 1.6932001034475497e-05, + "loss": 0.0046, + "step": 8904 + }, + { + "epoch": 3.6213908092720617, + "grad_norm": 0.3788493243082047, + "learning_rate": 1.6931270748383028e-05, + "loss": 0.0085, + "step": 8905 + }, + { + "epoch": 3.6217974786498575, + "grad_norm": 7.873259559716844, + "learning_rate": 1.6930540391137575e-05, + "loss": 0.194, + "step": 8906 + }, + { + "epoch": 3.6222041480276537, + "grad_norm": 4.0954748539396855, + "learning_rate": 1.6929809962746636e-05, + "loss": 0.092, + "step": 8907 + }, + { + "epoch": 3.6226108174054494, + "grad_norm": 32.43445519993224, + "learning_rate": 
1.692907946321771e-05, + "loss": 1.0328, + "step": 8908 + }, + { + "epoch": 3.623017486783245, + "grad_norm": 15.473797549607378, + "learning_rate": 1.6928348892558298e-05, + "loss": 0.8072, + "step": 8909 + }, + { + "epoch": 3.623424156161041, + "grad_norm": 4.238624410811313, + "learning_rate": 1.6927618250775894e-05, + "loss": 0.0781, + "step": 8910 + }, + { + "epoch": 3.623830825538837, + "grad_norm": 0.6007494938077752, + "learning_rate": 1.6926887537878007e-05, + "loss": 0.0269, + "step": 8911 + }, + { + "epoch": 3.624237494916633, + "grad_norm": 12.302950991455427, + "learning_rate": 1.692615675387213e-05, + "loss": 0.5724, + "step": 8912 + }, + { + "epoch": 3.6246441642944287, + "grad_norm": 1.775152581138461, + "learning_rate": 1.692542589876577e-05, + "loss": 0.0302, + "step": 8913 + }, + { + "epoch": 3.6250508336722245, + "grad_norm": 7.048286171666842, + "learning_rate": 1.6924694972566426e-05, + "loss": 0.2388, + "step": 8914 + }, + { + "epoch": 3.6254575030500202, + "grad_norm": 1.7771687504455804, + "learning_rate": 1.6923963975281604e-05, + "loss": 0.1107, + "step": 8915 + }, + { + "epoch": 3.625864172427816, + "grad_norm": 6.538143308913436, + "learning_rate": 1.692323290691881e-05, + "loss": 0.0878, + "step": 8916 + }, + { + "epoch": 3.6262708418056118, + "grad_norm": 6.903547807125633, + "learning_rate": 1.692250176748554e-05, + "loss": 0.1238, + "step": 8917 + }, + { + "epoch": 3.626677511183408, + "grad_norm": 1.3659902233129686, + "learning_rate": 1.692177055698931e-05, + "loss": 0.0275, + "step": 8918 + }, + { + "epoch": 3.6270841805612037, + "grad_norm": 7.72072174146981, + "learning_rate": 1.6921039275437618e-05, + "loss": 0.2974, + "step": 8919 + }, + { + "epoch": 3.6274908499389995, + "grad_norm": 1.9248686172130223, + "learning_rate": 1.6920307922837975e-05, + "loss": 0.0224, + "step": 8920 + }, + { + "epoch": 3.6278975193167957, + "grad_norm": 8.81981783030978, + "learning_rate": 1.691957649919789e-05, + "loss": 0.3585, + "step": 8921 + }, + { + "epoch": 3.6283041886945915, + "grad_norm": 12.136337474143302, + "learning_rate": 1.6918845004524868e-05, + "loss": 0.7665, + "step": 8922 + }, + { + "epoch": 3.6287108580723872, + "grad_norm": 1.3622354763747402, + "learning_rate": 1.691811343882642e-05, + "loss": 0.0216, + "step": 8923 + }, + { + "epoch": 3.629117527450183, + "grad_norm": 4.622892035202749, + "learning_rate": 1.6917381802110057e-05, + "loss": 0.2816, + "step": 8924 + }, + { + "epoch": 3.6295241968279788, + "grad_norm": 4.253133653716977, + "learning_rate": 1.691665009438329e-05, + "loss": 0.0695, + "step": 8925 + }, + { + "epoch": 3.6299308662057745, + "grad_norm": 3.816316787609139, + "learning_rate": 1.6915918315653625e-05, + "loss": 0.1112, + "step": 8926 + }, + { + "epoch": 3.6303375355835703, + "grad_norm": 0.5556943903736836, + "learning_rate": 1.691518646592858e-05, + "loss": 0.011, + "step": 8927 + }, + { + "epoch": 3.6307442049613665, + "grad_norm": 18.77461162091151, + "learning_rate": 1.6914454545215663e-05, + "loss": 0.3134, + "step": 8928 + }, + { + "epoch": 3.6311508743391623, + "grad_norm": 13.412063353573297, + "learning_rate": 1.6913722553522394e-05, + "loss": 0.7746, + "step": 8929 + }, + { + "epoch": 3.631557543716958, + "grad_norm": 2.241126695114001, + "learning_rate": 1.691299049085628e-05, + "loss": 0.0299, + "step": 8930 + }, + { + "epoch": 3.631964213094754, + "grad_norm": 10.800706100892253, + "learning_rate": 1.691225835722484e-05, + "loss": 0.32, + "step": 8931 + }, + { + "epoch": 3.63237088247255, + "grad_norm": 
2.2762123124111397, + "learning_rate": 1.691152615263559e-05, + "loss": 0.0588, + "step": 8932 + }, + { + "epoch": 3.632777551850346, + "grad_norm": 9.787680259749955, + "learning_rate": 1.691079387709605e-05, + "loss": 0.4585, + "step": 8933 + }, + { + "epoch": 3.6331842212281416, + "grad_norm": 5.859395513975473, + "learning_rate": 1.6910061530613723e-05, + "loss": 0.2172, + "step": 8934 + }, + { + "epoch": 3.6335908906059373, + "grad_norm": 0.30381586468602023, + "learning_rate": 1.690932911319614e-05, + "loss": 0.0044, + "step": 8935 + }, + { + "epoch": 3.633997559983733, + "grad_norm": 3.9989879703011493, + "learning_rate": 1.6908596624850823e-05, + "loss": 0.1544, + "step": 8936 + }, + { + "epoch": 3.634404229361529, + "grad_norm": 10.169029325520825, + "learning_rate": 1.690786406558528e-05, + "loss": 0.595, + "step": 8937 + }, + { + "epoch": 3.634810898739325, + "grad_norm": 1.3421240624613402, + "learning_rate": 1.690713143540703e-05, + "loss": 0.0208, + "step": 8938 + }, + { + "epoch": 3.635217568117121, + "grad_norm": 14.204125052308415, + "learning_rate": 1.690639873432361e-05, + "loss": 0.1401, + "step": 8939 + }, + { + "epoch": 3.6356242374949166, + "grad_norm": 3.961031671282036, + "learning_rate": 1.6905665962342525e-05, + "loss": 0.0947, + "step": 8940 + }, + { + "epoch": 3.6360309068727124, + "grad_norm": 9.103077252078847, + "learning_rate": 1.6904933119471302e-05, + "loss": 0.4079, + "step": 8941 + }, + { + "epoch": 3.6364375762505086, + "grad_norm": 16.665865029388126, + "learning_rate": 1.690420020571747e-05, + "loss": 0.6452, + "step": 8942 + }, + { + "epoch": 3.6368442456283043, + "grad_norm": 12.73832068017532, + "learning_rate": 1.6903467221088547e-05, + "loss": 0.5958, + "step": 8943 + }, + { + "epoch": 3.6372509150061, + "grad_norm": 2.0286555515387787, + "learning_rate": 1.6902734165592054e-05, + "loss": 0.0499, + "step": 8944 + }, + { + "epoch": 3.637657584383896, + "grad_norm": 4.952368374954141, + "learning_rate": 1.6902001039235526e-05, + "loss": 0.1499, + "step": 8945 + }, + { + "epoch": 3.6380642537616916, + "grad_norm": 26.297978262443987, + "learning_rate": 1.690126784202648e-05, + "loss": 1.826, + "step": 8946 + }, + { + "epoch": 3.6384709231394874, + "grad_norm": 10.139262286745849, + "learning_rate": 1.690053457397245e-05, + "loss": 0.3021, + "step": 8947 + }, + { + "epoch": 3.6388775925172836, + "grad_norm": 1.0110430060158049, + "learning_rate": 1.6899801235080957e-05, + "loss": 0.0153, + "step": 8948 + }, + { + "epoch": 3.6392842618950794, + "grad_norm": 13.78057555423773, + "learning_rate": 1.689906782535953e-05, + "loss": 0.5603, + "step": 8949 + }, + { + "epoch": 3.639690931272875, + "grad_norm": 41.96112199162856, + "learning_rate": 1.6898334344815702e-05, + "loss": 0.3785, + "step": 8950 + }, + { + "epoch": 3.640097600650671, + "grad_norm": 9.747298722132038, + "learning_rate": 1.6897600793457e-05, + "loss": 0.268, + "step": 8951 + }, + { + "epoch": 3.640504270028467, + "grad_norm": 2.2791355809629694, + "learning_rate": 1.6896867171290954e-05, + "loss": 0.0462, + "step": 8952 + }, + { + "epoch": 3.640910939406263, + "grad_norm": 10.72511716077157, + "learning_rate": 1.6896133478325095e-05, + "loss": 0.2407, + "step": 8953 + }, + { + "epoch": 3.6413176087840586, + "grad_norm": 0.13209777517463736, + "learning_rate": 1.6895399714566957e-05, + "loss": 0.003, + "step": 8954 + }, + { + "epoch": 3.6417242781618544, + "grad_norm": 7.68651178442341, + "learning_rate": 1.6894665880024068e-05, + "loss": 0.6107, + "step": 8955 + }, + { + "epoch": 
3.64213094753965, + "grad_norm": 3.7680960397678445, + "learning_rate": 1.6893931974703966e-05, + "loss": 0.0868, + "step": 8956 + }, + { + "epoch": 3.642537616917446, + "grad_norm": 7.085863037306946, + "learning_rate": 1.6893197998614184e-05, + "loss": 0.3172, + "step": 8957 + }, + { + "epoch": 3.6429442862952417, + "grad_norm": 78.17207619128203, + "learning_rate": 1.689246395176225e-05, + "loss": 0.4425, + "step": 8958 + }, + { + "epoch": 3.643350955673038, + "grad_norm": 5.162897635413158, + "learning_rate": 1.6891729834155708e-05, + "loss": 0.268, + "step": 8959 + }, + { + "epoch": 3.6437576250508337, + "grad_norm": 1.2790132096160411, + "learning_rate": 1.689099564580209e-05, + "loss": 0.0261, + "step": 8960 + }, + { + "epoch": 3.6441642944286294, + "grad_norm": 4.607109241038117, + "learning_rate": 1.689026138670893e-05, + "loss": 0.075, + "step": 8961 + }, + { + "epoch": 3.6445709638064256, + "grad_norm": 26.889401323780717, + "learning_rate": 1.688952705688378e-05, + "loss": 0.7295, + "step": 8962 + }, + { + "epoch": 3.6449776331842214, + "grad_norm": 2.426060033314593, + "learning_rate": 1.688879265633416e-05, + "loss": 0.0525, + "step": 8963 + }, + { + "epoch": 3.645384302562017, + "grad_norm": 10.054560238600436, + "learning_rate": 1.6888058185067616e-05, + "loss": 0.7383, + "step": 8964 + }, + { + "epoch": 3.645790971939813, + "grad_norm": 9.93730994099406, + "learning_rate": 1.6887323643091686e-05, + "loss": 0.2887, + "step": 8965 + }, + { + "epoch": 3.6461976413176087, + "grad_norm": 5.203249371256837, + "learning_rate": 1.688658903041392e-05, + "loss": 0.1519, + "step": 8966 + }, + { + "epoch": 3.6466043106954045, + "grad_norm": 8.153813741185367, + "learning_rate": 1.6885854347041844e-05, + "loss": 0.4893, + "step": 8967 + }, + { + "epoch": 3.6470109800732002, + "grad_norm": 12.789771366186894, + "learning_rate": 1.688511959298301e-05, + "loss": 0.9354, + "step": 8968 + }, + { + "epoch": 3.6474176494509964, + "grad_norm": 6.735226965464302, + "learning_rate": 1.6884384768244958e-05, + "loss": 0.1348, + "step": 8969 + }, + { + "epoch": 3.647824318828792, + "grad_norm": 14.338681607954058, + "learning_rate": 1.688364987283523e-05, + "loss": 0.9204, + "step": 8970 + }, + { + "epoch": 3.648230988206588, + "grad_norm": 6.418383464739875, + "learning_rate": 1.6882914906761376e-05, + "loss": 0.3875, + "step": 8971 + }, + { + "epoch": 3.6486376575843837, + "grad_norm": 6.223873306469071, + "learning_rate": 1.6882179870030934e-05, + "loss": 0.3275, + "step": 8972 + }, + { + "epoch": 3.64904432696218, + "grad_norm": 4.902650356179725, + "learning_rate": 1.688144476265145e-05, + "loss": 0.315, + "step": 8973 + }, + { + "epoch": 3.6494509963399757, + "grad_norm": 7.336515107586929, + "learning_rate": 1.6880709584630475e-05, + "loss": 0.1689, + "step": 8974 + }, + { + "epoch": 3.6498576657177715, + "grad_norm": 18.11985490013428, + "learning_rate": 1.687997433597555e-05, + "loss": 0.7446, + "step": 8975 + }, + { + "epoch": 3.6502643350955672, + "grad_norm": 15.413816039016561, + "learning_rate": 1.687923901669423e-05, + "loss": 1.0495, + "step": 8976 + }, + { + "epoch": 3.650671004473363, + "grad_norm": 4.028163641007045, + "learning_rate": 1.6878503626794057e-05, + "loss": 0.0637, + "step": 8977 + }, + { + "epoch": 3.651077673851159, + "grad_norm": 10.472348694587653, + "learning_rate": 1.687776816628258e-05, + "loss": 0.2184, + "step": 8978 + }, + { + "epoch": 3.651484343228955, + "grad_norm": 8.291903207501663, + "learning_rate": 1.6877032635167356e-05, + "loss": 0.2527, + "step": 
8979 + }, + { + "epoch": 3.6518910126067508, + "grad_norm": 0.4508684123870904, + "learning_rate": 1.687629703345593e-05, + "loss": 0.0068, + "step": 8980 + }, + { + "epoch": 3.6522976819845465, + "grad_norm": 2.307779040576351, + "learning_rate": 1.6875561361155853e-05, + "loss": 0.0471, + "step": 8981 + }, + { + "epoch": 3.6527043513623423, + "grad_norm": 1.2059115557666078, + "learning_rate": 1.6874825618274676e-05, + "loss": 0.0151, + "step": 8982 + }, + { + "epoch": 3.6531110207401385, + "grad_norm": 10.580214560857689, + "learning_rate": 1.6874089804819957e-05, + "loss": 0.2231, + "step": 8983 + }, + { + "epoch": 3.6535176901179343, + "grad_norm": 5.122822214145999, + "learning_rate": 1.6873353920799246e-05, + "loss": 0.1315, + "step": 8984 + }, + { + "epoch": 3.65392435949573, + "grad_norm": 9.86753037894067, + "learning_rate": 1.6872617966220097e-05, + "loss": 0.2933, + "step": 8985 + }, + { + "epoch": 3.654331028873526, + "grad_norm": 6.691888416815299, + "learning_rate": 1.6871881941090066e-05, + "loss": 0.1766, + "step": 8986 + }, + { + "epoch": 3.6547376982513216, + "grad_norm": 12.941242514915055, + "learning_rate": 1.687114584541671e-05, + "loss": 0.381, + "step": 8987 + }, + { + "epoch": 3.6551443676291173, + "grad_norm": 8.104914670757912, + "learning_rate": 1.687040967920758e-05, + "loss": 0.37, + "step": 8988 + }, + { + "epoch": 3.6555510370069135, + "grad_norm": 8.176595127460502, + "learning_rate": 1.686967344247024e-05, + "loss": 0.163, + "step": 8989 + }, + { + "epoch": 3.6559577063847093, + "grad_norm": 0.6659505340387813, + "learning_rate": 1.6868937135212244e-05, + "loss": 0.0115, + "step": 8990 + }, + { + "epoch": 3.656364375762505, + "grad_norm": 0.34139757792185044, + "learning_rate": 1.6868200757441146e-05, + "loss": 0.0047, + "step": 8991 + }, + { + "epoch": 3.656771045140301, + "grad_norm": 22.071116696787644, + "learning_rate": 1.6867464309164517e-05, + "loss": 0.5608, + "step": 8992 + }, + { + "epoch": 3.657177714518097, + "grad_norm": 15.49216406754185, + "learning_rate": 1.686672779038991e-05, + "loss": 0.8876, + "step": 8993 + }, + { + "epoch": 3.657584383895893, + "grad_norm": 8.098582769901352, + "learning_rate": 1.6865991201124886e-05, + "loss": 0.5601, + "step": 8994 + }, + { + "epoch": 3.6579910532736886, + "grad_norm": 12.837963987344265, + "learning_rate": 1.6865254541377005e-05, + "loss": 0.1417, + "step": 8995 + }, + { + "epoch": 3.6583977226514843, + "grad_norm": 9.576853012802252, + "learning_rate": 1.6864517811153832e-05, + "loss": 0.4026, + "step": 8996 + }, + { + "epoch": 3.65880439202928, + "grad_norm": 1.0821076089248542, + "learning_rate": 1.6863781010462928e-05, + "loss": 0.0139, + "step": 8997 + }, + { + "epoch": 3.659211061407076, + "grad_norm": 9.549211928674852, + "learning_rate": 1.6863044139311856e-05, + "loss": 0.496, + "step": 8998 + }, + { + "epoch": 3.6596177307848716, + "grad_norm": 10.37719938266734, + "learning_rate": 1.6862307197708188e-05, + "loss": 0.3975, + "step": 8999 + }, + { + "epoch": 3.660024400162668, + "grad_norm": 7.973896866413189, + "learning_rate": 1.6861570185659477e-05, + "loss": 0.3044, + "step": 9000 + }, + { + "epoch": 3.6604310695404636, + "grad_norm": 4.398433978348673, + "learning_rate": 1.6860833103173295e-05, + "loss": 0.0444, + "step": 9001 + }, + { + "epoch": 3.6608377389182594, + "grad_norm": 1.650406259806206, + "learning_rate": 1.686009595025721e-05, + "loss": 0.0277, + "step": 9002 + }, + { + "epoch": 3.6612444082960556, + "grad_norm": 9.122969401648625, + "learning_rate": 
1.6859358726918788e-05, + "loss": 0.383, + "step": 9003 + }, + { + "epoch": 3.6616510776738513, + "grad_norm": 4.826267123777985, + "learning_rate": 1.6858621433165594e-05, + "loss": 0.1206, + "step": 9004 + }, + { + "epoch": 3.662057747051647, + "grad_norm": 13.595673161837926, + "learning_rate": 1.6857884069005202e-05, + "loss": 0.4826, + "step": 9005 + }, + { + "epoch": 3.662464416429443, + "grad_norm": 5.430609987988128, + "learning_rate": 1.6857146634445175e-05, + "loss": 0.064, + "step": 9006 + }, + { + "epoch": 3.6628710858072386, + "grad_norm": 7.448211437285405, + "learning_rate": 1.685640912949309e-05, + "loss": 0.0748, + "step": 9007 + }, + { + "epoch": 3.6632777551850344, + "grad_norm": 8.996212196408894, + "learning_rate": 1.6855671554156516e-05, + "loss": 0.4661, + "step": 9008 + }, + { + "epoch": 3.66368442456283, + "grad_norm": 0.14247777193263658, + "learning_rate": 1.685493390844302e-05, + "loss": 0.0022, + "step": 9009 + }, + { + "epoch": 3.6640910939406264, + "grad_norm": 15.783885652625782, + "learning_rate": 1.6854196192360177e-05, + "loss": 1.2653, + "step": 9010 + }, + { + "epoch": 3.664497763318422, + "grad_norm": 1.972027180593782, + "learning_rate": 1.685345840591556e-05, + "loss": 0.0214, + "step": 9011 + }, + { + "epoch": 3.664904432696218, + "grad_norm": 7.966737793733513, + "learning_rate": 1.685272054911675e-05, + "loss": 0.2787, + "step": 9012 + }, + { + "epoch": 3.6653111020740137, + "grad_norm": 12.179031527752345, + "learning_rate": 1.6851982621971307e-05, + "loss": 0.3401, + "step": 9013 + }, + { + "epoch": 3.66571777145181, + "grad_norm": 5.713443983168855, + "learning_rate": 1.6851244624486813e-05, + "loss": 0.2375, + "step": 9014 + }, + { + "epoch": 3.6661244408296056, + "grad_norm": 11.536190726458694, + "learning_rate": 1.685050655667085e-05, + "loss": 0.5617, + "step": 9015 + }, + { + "epoch": 3.6665311102074014, + "grad_norm": 10.162697863819034, + "learning_rate": 1.684976841853099e-05, + "loss": 0.2499, + "step": 9016 + }, + { + "epoch": 3.666937779585197, + "grad_norm": 9.190931169400798, + "learning_rate": 1.6849030210074802e-05, + "loss": 0.1954, + "step": 9017 + }, + { + "epoch": 3.667344448962993, + "grad_norm": 11.94578183202061, + "learning_rate": 1.684829193130988e-05, + "loss": 0.3744, + "step": 9018 + }, + { + "epoch": 3.6677511183407887, + "grad_norm": 4.93009041381423, + "learning_rate": 1.684755358224379e-05, + "loss": 0.0998, + "step": 9019 + }, + { + "epoch": 3.668157787718585, + "grad_norm": 4.069812497063172, + "learning_rate": 1.6846815162884114e-05, + "loss": 0.0655, + "step": 9020 + }, + { + "epoch": 3.6685644570963807, + "grad_norm": 0.7463062113805702, + "learning_rate": 1.684607667323844e-05, + "loss": 0.0137, + "step": 9021 + }, + { + "epoch": 3.6689711264741764, + "grad_norm": 0.9022393301348393, + "learning_rate": 1.6845338113314343e-05, + "loss": 0.0065, + "step": 9022 + }, + { + "epoch": 3.669377795851972, + "grad_norm": 13.219020488042393, + "learning_rate": 1.6844599483119402e-05, + "loss": 0.954, + "step": 9023 + }, + { + "epoch": 3.6697844652297684, + "grad_norm": 9.79194224021395, + "learning_rate": 1.6843860782661205e-05, + "loss": 0.2803, + "step": 9024 + }, + { + "epoch": 3.670191134607564, + "grad_norm": 10.084328444036231, + "learning_rate": 1.6843122011947334e-05, + "loss": 0.2135, + "step": 9025 + }, + { + "epoch": 3.67059780398536, + "grad_norm": 2.8698047434695457, + "learning_rate": 1.6842383170985367e-05, + "loss": 0.0268, + "step": 9026 + }, + { + "epoch": 3.6710044733631557, + "grad_norm": 
6.8239343363653155, + "learning_rate": 1.6841644259782896e-05, + "loss": 0.171, + "step": 9027 + }, + { + "epoch": 3.6714111427409515, + "grad_norm": 0.48981797396457144, + "learning_rate": 1.6840905278347502e-05, + "loss": 0.0075, + "step": 9028 + }, + { + "epoch": 3.6718178121187472, + "grad_norm": 7.248189780569632, + "learning_rate": 1.6840166226686775e-05, + "loss": 0.1768, + "step": 9029 + }, + { + "epoch": 3.6722244814965435, + "grad_norm": 8.539385706981603, + "learning_rate": 1.6839427104808293e-05, + "loss": 0.2328, + "step": 9030 + }, + { + "epoch": 3.6726311508743392, + "grad_norm": 0.40016311395717635, + "learning_rate": 1.6838687912719654e-05, + "loss": 0.0073, + "step": 9031 + }, + { + "epoch": 3.673037820252135, + "grad_norm": 8.63971687163454, + "learning_rate": 1.6837948650428443e-05, + "loss": 0.4613, + "step": 9032 + }, + { + "epoch": 3.6734444896299308, + "grad_norm": 2.624986282003944, + "learning_rate": 1.6837209317942243e-05, + "loss": 0.0343, + "step": 9033 + }, + { + "epoch": 3.673851159007727, + "grad_norm": 5.2487985078668205, + "learning_rate": 1.6836469915268653e-05, + "loss": 0.1297, + "step": 9034 + }, + { + "epoch": 3.6742578283855227, + "grad_norm": 1.20849381504422, + "learning_rate": 1.6835730442415257e-05, + "loss": 0.0129, + "step": 9035 + }, + { + "epoch": 3.6746644977633185, + "grad_norm": 6.80171912116228, + "learning_rate": 1.6834990899389646e-05, + "loss": 0.3716, + "step": 9036 + }, + { + "epoch": 3.6750711671411143, + "grad_norm": 18.500666580563276, + "learning_rate": 1.6834251286199417e-05, + "loss": 1.1454, + "step": 9037 + }, + { + "epoch": 3.67547783651891, + "grad_norm": 14.620993952725238, + "learning_rate": 1.6833511602852153e-05, + "loss": 0.4529, + "step": 9038 + }, + { + "epoch": 3.675884505896706, + "grad_norm": 0.8124873462007447, + "learning_rate": 1.683277184935546e-05, + "loss": 0.0168, + "step": 9039 + }, + { + "epoch": 3.6762911752745016, + "grad_norm": 13.73201959948814, + "learning_rate": 1.683203202571692e-05, + "loss": 0.6986, + "step": 9040 + }, + { + "epoch": 3.6766978446522978, + "grad_norm": 12.348525482173613, + "learning_rate": 1.6831292131944136e-05, + "loss": 0.8518, + "step": 9041 + }, + { + "epoch": 3.6771045140300935, + "grad_norm": 7.605208143435812, + "learning_rate": 1.68305521680447e-05, + "loss": 0.2792, + "step": 9042 + }, + { + "epoch": 3.6775111834078893, + "grad_norm": 9.147591707285113, + "learning_rate": 1.682981213402621e-05, + "loss": 0.3622, + "step": 9043 + }, + { + "epoch": 3.6779178527856855, + "grad_norm": 1.6013016302021807, + "learning_rate": 1.6829072029896253e-05, + "loss": 0.0238, + "step": 9044 + }, + { + "epoch": 3.6783245221634813, + "grad_norm": 0.9873811343214646, + "learning_rate": 1.6828331855662442e-05, + "loss": 0.0125, + "step": 9045 + }, + { + "epoch": 3.678731191541277, + "grad_norm": 7.908847796222247, + "learning_rate": 1.6827591611332365e-05, + "loss": 0.3075, + "step": 9046 + }, + { + "epoch": 3.679137860919073, + "grad_norm": 4.06312705024083, + "learning_rate": 1.6826851296913626e-05, + "loss": 0.0618, + "step": 9047 + }, + { + "epoch": 3.6795445302968686, + "grad_norm": 9.534278008832096, + "learning_rate": 1.6826110912413823e-05, + "loss": 0.3194, + "step": 9048 + }, + { + "epoch": 3.6799511996746643, + "grad_norm": 6.794914975638935, + "learning_rate": 1.6825370457840554e-05, + "loss": 0.2852, + "step": 9049 + }, + { + "epoch": 3.68035786905246, + "grad_norm": 4.5319021656213785, + "learning_rate": 1.682462993320142e-05, + "loss": 0.2122, + "step": 9050 + }, + { + 
"epoch": 3.6807645384302563, + "grad_norm": 25.036712774279057, + "learning_rate": 1.682388933850403e-05, + "loss": 0.1439, + "step": 9051 + }, + { + "epoch": 3.681171207808052, + "grad_norm": 1.5602659027744388, + "learning_rate": 1.6823148673755976e-05, + "loss": 0.0308, + "step": 9052 + }, + { + "epoch": 3.681577877185848, + "grad_norm": 0.15849778335972184, + "learning_rate": 1.682240793896487e-05, + "loss": 0.0027, + "step": 9053 + }, + { + "epoch": 3.6819845465636436, + "grad_norm": 1.477982792574328, + "learning_rate": 1.6821667134138314e-05, + "loss": 0.0242, + "step": 9054 + }, + { + "epoch": 3.68239121594144, + "grad_norm": 0.8021965762991548, + "learning_rate": 1.682092625928391e-05, + "loss": 0.0115, + "step": 9055 + }, + { + "epoch": 3.6827978853192356, + "grad_norm": 5.256279502327755, + "learning_rate": 1.6820185314409263e-05, + "loss": 0.1346, + "step": 9056 + }, + { + "epoch": 3.6832045546970313, + "grad_norm": 16.195328924881057, + "learning_rate": 1.6819444299521984e-05, + "loss": 0.7339, + "step": 9057 + }, + { + "epoch": 3.683611224074827, + "grad_norm": 21.623911802662406, + "learning_rate": 1.6818703214629675e-05, + "loss": 0.487, + "step": 9058 + }, + { + "epoch": 3.684017893452623, + "grad_norm": 11.478332950371076, + "learning_rate": 1.6817962059739946e-05, + "loss": 0.7378, + "step": 9059 + }, + { + "epoch": 3.6844245628304186, + "grad_norm": 2.611714645212836, + "learning_rate": 1.681722083486041e-05, + "loss": 0.1077, + "step": 9060 + }, + { + "epoch": 3.684831232208215, + "grad_norm": 0.929784580160068, + "learning_rate": 1.6816479539998666e-05, + "loss": 0.0118, + "step": 9061 + }, + { + "epoch": 3.6852379015860106, + "grad_norm": 2.326329952131877, + "learning_rate": 1.6815738175162334e-05, + "loss": 0.0466, + "step": 9062 + }, + { + "epoch": 3.6856445709638064, + "grad_norm": 6.216788285277512, + "learning_rate": 1.6814996740359014e-05, + "loss": 0.1609, + "step": 9063 + }, + { + "epoch": 3.686051240341602, + "grad_norm": 9.083627196441732, + "learning_rate": 1.6814255235596325e-05, + "loss": 0.3225, + "step": 9064 + }, + { + "epoch": 3.6864579097193984, + "grad_norm": 0.9370101570153777, + "learning_rate": 1.681351366088187e-05, + "loss": 0.0166, + "step": 9065 + }, + { + "epoch": 3.686864579097194, + "grad_norm": 7.837079272237108, + "learning_rate": 1.681277201622328e-05, + "loss": 0.4368, + "step": 9066 + }, + { + "epoch": 3.68727124847499, + "grad_norm": 3.8325993231322797, + "learning_rate": 1.6812030301628148e-05, + "loss": 0.1113, + "step": 9067 + }, + { + "epoch": 3.6876779178527856, + "grad_norm": 13.902588860761867, + "learning_rate": 1.6811288517104097e-05, + "loss": 0.8562, + "step": 9068 + }, + { + "epoch": 3.6880845872305814, + "grad_norm": 3.1990110056027294, + "learning_rate": 1.6810546662658747e-05, + "loss": 0.0536, + "step": 9069 + }, + { + "epoch": 3.688491256608377, + "grad_norm": 0.15592766006274816, + "learning_rate": 1.68098047382997e-05, + "loss": 0.0029, + "step": 9070 + }, + { + "epoch": 3.6888979259861734, + "grad_norm": 7.50131967899837, + "learning_rate": 1.6809062744034585e-05, + "loss": 0.3095, + "step": 9071 + }, + { + "epoch": 3.689304595363969, + "grad_norm": 8.107834963175609, + "learning_rate": 1.6808320679871013e-05, + "loss": 0.3521, + "step": 9072 + }, + { + "epoch": 3.689711264741765, + "grad_norm": 14.821714619211788, + "learning_rate": 1.6807578545816604e-05, + "loss": 0.5294, + "step": 9073 + }, + { + "epoch": 3.6901179341195607, + "grad_norm": 3.1358941916238736, + "learning_rate": 1.6806836341878972e-05, + 
"loss": 0.0495, + "step": 9074 + }, + { + "epoch": 3.690524603497357, + "grad_norm": 1.8554489675207628, + "learning_rate": 1.680609406806574e-05, + "loss": 0.0351, + "step": 9075 + }, + { + "epoch": 3.6909312728751527, + "grad_norm": 4.2412118059566035, + "learning_rate": 1.6805351724384527e-05, + "loss": 0.0682, + "step": 9076 + }, + { + "epoch": 3.6913379422529484, + "grad_norm": 15.417137937387821, + "learning_rate": 1.6804609310842958e-05, + "loss": 0.5771, + "step": 9077 + }, + { + "epoch": 3.691744611630744, + "grad_norm": 5.817765647520494, + "learning_rate": 1.680386682744864e-05, + "loss": 0.2965, + "step": 9078 + }, + { + "epoch": 3.69215128100854, + "grad_norm": 4.168846462225637, + "learning_rate": 1.6803124274209214e-05, + "loss": 0.0823, + "step": 9079 + }, + { + "epoch": 3.6925579503863357, + "grad_norm": 11.253920966896462, + "learning_rate": 1.680238165113229e-05, + "loss": 0.4674, + "step": 9080 + }, + { + "epoch": 3.6929646197641315, + "grad_norm": 8.196112212822738, + "learning_rate": 1.6801638958225496e-05, + "loss": 0.4473, + "step": 9081 + }, + { + "epoch": 3.6933712891419277, + "grad_norm": 11.176199550527064, + "learning_rate": 1.6800896195496454e-05, + "loss": 0.5423, + "step": 9082 + }, + { + "epoch": 3.6937779585197235, + "grad_norm": 2.5456125315915212, + "learning_rate": 1.6800153362952788e-05, + "loss": 0.0395, + "step": 9083 + }, + { + "epoch": 3.6941846278975192, + "grad_norm": 4.728268630097375, + "learning_rate": 1.6799410460602125e-05, + "loss": 0.0936, + "step": 9084 + }, + { + "epoch": 3.6945912972753154, + "grad_norm": 4.067270325659976, + "learning_rate": 1.6798667488452095e-05, + "loss": 0.0519, + "step": 9085 + }, + { + "epoch": 3.694997966653111, + "grad_norm": 8.988579721791973, + "learning_rate": 1.679792444651032e-05, + "loss": 0.228, + "step": 9086 + }, + { + "epoch": 3.695404636030907, + "grad_norm": 4.8954333828782755, + "learning_rate": 1.679718133478443e-05, + "loss": 0.1279, + "step": 9087 + }, + { + "epoch": 3.6958113054087027, + "grad_norm": 8.635064115156148, + "learning_rate": 1.679643815328205e-05, + "loss": 0.2387, + "step": 9088 + }, + { + "epoch": 3.6962179747864985, + "grad_norm": 5.797369794071626, + "learning_rate": 1.6795694902010813e-05, + "loss": 0.4063, + "step": 9089 + }, + { + "epoch": 3.6966246441642943, + "grad_norm": 1.8141296113579408, + "learning_rate": 1.679495158097835e-05, + "loss": 0.0332, + "step": 9090 + }, + { + "epoch": 3.69703131354209, + "grad_norm": 0.5804066447727244, + "learning_rate": 1.6794208190192284e-05, + "loss": 0.0095, + "step": 9091 + }, + { + "epoch": 3.6974379829198862, + "grad_norm": 6.194440121502246, + "learning_rate": 1.679346472966026e-05, + "loss": 0.1542, + "step": 9092 + }, + { + "epoch": 3.697844652297682, + "grad_norm": 6.5370674433273726, + "learning_rate": 1.6792721199389892e-05, + "loss": 0.3442, + "step": 9093 + }, + { + "epoch": 3.6982513216754778, + "grad_norm": 6.901627827659407, + "learning_rate": 1.6791977599388826e-05, + "loss": 0.1946, + "step": 9094 + }, + { + "epoch": 3.6986579910532735, + "grad_norm": 2.673184406078781, + "learning_rate": 1.6791233929664694e-05, + "loss": 0.0786, + "step": 9095 + }, + { + "epoch": 3.6990646604310697, + "grad_norm": 3.8685037174392503, + "learning_rate": 1.6790490190225125e-05, + "loss": 0.066, + "step": 9096 + }, + { + "epoch": 3.6994713298088655, + "grad_norm": 2.4903836805215533, + "learning_rate": 1.6789746381077757e-05, + "loss": 0.0405, + "step": 9097 + }, + { + "epoch": 3.6998779991866613, + "grad_norm": 5.031462708978035, + 
"learning_rate": 1.6789002502230226e-05, + "loss": 0.1056, + "step": 9098 + }, + { + "epoch": 3.700284668564457, + "grad_norm": 0.7905158981211247, + "learning_rate": 1.6788258553690166e-05, + "loss": 0.0108, + "step": 9099 + }, + { + "epoch": 3.700691337942253, + "grad_norm": 11.933291263773667, + "learning_rate": 1.678751453546522e-05, + "loss": 0.2463, + "step": 9100 + }, + { + "epoch": 3.7010980073200486, + "grad_norm": 4.699347436548824, + "learning_rate": 1.6786770447563015e-05, + "loss": 0.0695, + "step": 9101 + }, + { + "epoch": 3.7015046766978448, + "grad_norm": 6.873442481000778, + "learning_rate": 1.67860262899912e-05, + "loss": 0.189, + "step": 9102 + }, + { + "epoch": 3.7019113460756405, + "grad_norm": 10.833720973860046, + "learning_rate": 1.6785282062757407e-05, + "loss": 0.4868, + "step": 9103 + }, + { + "epoch": 3.7023180154534363, + "grad_norm": 6.126271484708428, + "learning_rate": 1.678453776586928e-05, + "loss": 0.2196, + "step": 9104 + }, + { + "epoch": 3.702724684831232, + "grad_norm": 8.389181224475877, + "learning_rate": 1.6783793399334456e-05, + "loss": 0.3935, + "step": 9105 + }, + { + "epoch": 3.7031313542090283, + "grad_norm": 4.888188693085159, + "learning_rate": 1.678304896316058e-05, + "loss": 0.1391, + "step": 9106 + }, + { + "epoch": 3.703538023586824, + "grad_norm": 5.004084745680172, + "learning_rate": 1.6782304457355295e-05, + "loss": 0.1599, + "step": 9107 + }, + { + "epoch": 3.70394469296462, + "grad_norm": 7.576230419788127, + "learning_rate": 1.678155988192624e-05, + "loss": 0.2549, + "step": 9108 + }, + { + "epoch": 3.7043513623424156, + "grad_norm": 11.148497147841635, + "learning_rate": 1.678081523688106e-05, + "loss": 0.5899, + "step": 9109 + }, + { + "epoch": 3.7047580317202113, + "grad_norm": 5.005432206517516, + "learning_rate": 1.6780070522227396e-05, + "loss": 0.1359, + "step": 9110 + }, + { + "epoch": 3.705164701098007, + "grad_norm": 4.093476124082045, + "learning_rate": 1.67793257379729e-05, + "loss": 0.0727, + "step": 9111 + }, + { + "epoch": 3.7055713704758033, + "grad_norm": 9.918244905015188, + "learning_rate": 1.6778580884125212e-05, + "loss": 0.3603, + "step": 9112 + }, + { + "epoch": 3.705978039853599, + "grad_norm": 5.28852804852977, + "learning_rate": 1.6777835960691978e-05, + "loss": 0.1334, + "step": 9113 + }, + { + "epoch": 3.706384709231395, + "grad_norm": 2.142989047504025, + "learning_rate": 1.677709096768085e-05, + "loss": 0.0336, + "step": 9114 + }, + { + "epoch": 3.7067913786091906, + "grad_norm": 1.2869531053346281, + "learning_rate": 1.677634590509947e-05, + "loss": 0.0141, + "step": 9115 + }, + { + "epoch": 3.707198047986987, + "grad_norm": 0.162041084030911, + "learning_rate": 1.6775600772955483e-05, + "loss": 0.0026, + "step": 9116 + }, + { + "epoch": 3.7076047173647826, + "grad_norm": 2.4774086561107396, + "learning_rate": 1.677485557125655e-05, + "loss": 0.042, + "step": 9117 + }, + { + "epoch": 3.7080113867425784, + "grad_norm": 16.210635503805484, + "learning_rate": 1.6774110300010318e-05, + "loss": 0.7365, + "step": 9118 + }, + { + "epoch": 3.708418056120374, + "grad_norm": 6.259606252769411, + "learning_rate": 1.677336495922443e-05, + "loss": 0.1093, + "step": 9119 + }, + { + "epoch": 3.70882472549817, + "grad_norm": 5.492445086973414, + "learning_rate": 1.6772619548906543e-05, + "loss": 0.0977, + "step": 9120 + }, + { + "epoch": 3.7092313948759656, + "grad_norm": 3.7424397792611948, + "learning_rate": 1.6771874069064305e-05, + "loss": 0.0698, + "step": 9121 + }, + { + "epoch": 3.709638064253762, + 
"grad_norm": 15.234505814944338, + "learning_rate": 1.6771128519705375e-05, + "loss": 0.8336, + "step": 9122 + }, + { + "epoch": 3.7100447336315576, + "grad_norm": 23.390619946863396, + "learning_rate": 1.6770382900837404e-05, + "loss": 0.5325, + "step": 9123 + }, + { + "epoch": 3.7104514030093534, + "grad_norm": 7.285239833065952, + "learning_rate": 1.676963721246804e-05, + "loss": 0.3039, + "step": 9124 + }, + { + "epoch": 3.710858072387149, + "grad_norm": 5.296919716665765, + "learning_rate": 1.6768891454604944e-05, + "loss": 0.1829, + "step": 9125 + }, + { + "epoch": 3.7112647417649454, + "grad_norm": 24.16777401530151, + "learning_rate": 1.6768145627255774e-05, + "loss": 0.1701, + "step": 9126 + }, + { + "epoch": 3.711671411142741, + "grad_norm": 10.54809635640458, + "learning_rate": 1.676739973042818e-05, + "loss": 0.3749, + "step": 9127 + }, + { + "epoch": 3.712078080520537, + "grad_norm": 7.217850935605985, + "learning_rate": 1.6766653764129826e-05, + "loss": 0.1847, + "step": 9128 + }, + { + "epoch": 3.7124847498983327, + "grad_norm": 9.73000385884437, + "learning_rate": 1.6765907728368362e-05, + "loss": 0.2767, + "step": 9129 + }, + { + "epoch": 3.7128914192761284, + "grad_norm": 12.736517570898942, + "learning_rate": 1.676516162315145e-05, + "loss": 0.5062, + "step": 9130 + }, + { + "epoch": 3.713298088653924, + "grad_norm": 2.4610281951895185, + "learning_rate": 1.6764415448486747e-05, + "loss": 0.054, + "step": 9131 + }, + { + "epoch": 3.71370475803172, + "grad_norm": 1.79636720004012, + "learning_rate": 1.6763669204381917e-05, + "loss": 0.0258, + "step": 9132 + }, + { + "epoch": 3.714111427409516, + "grad_norm": 11.067755960212407, + "learning_rate": 1.676292289084462e-05, + "loss": 0.4449, + "step": 9133 + }, + { + "epoch": 3.714518096787312, + "grad_norm": 8.498685134287868, + "learning_rate": 1.6762176507882515e-05, + "loss": 0.6307, + "step": 9134 + }, + { + "epoch": 3.7149247661651077, + "grad_norm": 3.4455999471880587, + "learning_rate": 1.6761430055503264e-05, + "loss": 0.1117, + "step": 9135 + }, + { + "epoch": 3.7153314355429035, + "grad_norm": 23.215934275919995, + "learning_rate": 1.6760683533714532e-05, + "loss": 0.383, + "step": 9136 + }, + { + "epoch": 3.7157381049206997, + "grad_norm": 8.202863271573998, + "learning_rate": 1.675993694252398e-05, + "loss": 0.1905, + "step": 9137 + }, + { + "epoch": 3.7161447742984954, + "grad_norm": 5.197532613945416, + "learning_rate": 1.6759190281939273e-05, + "loss": 0.115, + "step": 9138 + }, + { + "epoch": 3.716551443676291, + "grad_norm": 5.8363462757218745, + "learning_rate": 1.6758443551968078e-05, + "loss": 0.1265, + "step": 9139 + }, + { + "epoch": 3.716958113054087, + "grad_norm": 7.214650427699444, + "learning_rate": 1.6757696752618057e-05, + "loss": 0.2663, + "step": 9140 + }, + { + "epoch": 3.7173647824318827, + "grad_norm": 1.3276290673625655, + "learning_rate": 1.6756949883896874e-05, + "loss": 0.0164, + "step": 9141 + }, + { + "epoch": 3.7177714518096785, + "grad_norm": 13.666816089091196, + "learning_rate": 1.6756202945812204e-05, + "loss": 0.5594, + "step": 9142 + }, + { + "epoch": 3.7181781211874747, + "grad_norm": 11.326499197887992, + "learning_rate": 1.675545593837171e-05, + "loss": 0.6451, + "step": 9143 + }, + { + "epoch": 3.7185847905652705, + "grad_norm": 10.52223189792821, + "learning_rate": 1.675470886158306e-05, + "loss": 0.3722, + "step": 9144 + }, + { + "epoch": 3.7189914599430662, + "grad_norm": 7.459416774183333, + "learning_rate": 1.6753961715453923e-05, + "loss": 0.1958, + "step": 9145 + }, + 
{ + "epoch": 3.719398129320862, + "grad_norm": 8.003136320396587, + "learning_rate": 1.675321449999197e-05, + "loss": 0.3531, + "step": 9146 + }, + { + "epoch": 3.719804798698658, + "grad_norm": 11.130914041037192, + "learning_rate": 1.6752467215204874e-05, + "loss": 0.9395, + "step": 9147 + }, + { + "epoch": 3.720211468076454, + "grad_norm": 10.06100243672098, + "learning_rate": 1.6751719861100304e-05, + "loss": 0.2882, + "step": 9148 + }, + { + "epoch": 3.7206181374542497, + "grad_norm": 86.28583865725021, + "learning_rate": 1.6750972437685925e-05, + "loss": 1.7861, + "step": 9149 + }, + { + "epoch": 3.7210248068320455, + "grad_norm": 3.6768595334087033, + "learning_rate": 1.6750224944969424e-05, + "loss": 0.068, + "step": 9150 + }, + { + "epoch": 3.7214314762098413, + "grad_norm": 7.529806620044863, + "learning_rate": 1.6749477382958462e-05, + "loss": 0.3457, + "step": 9151 + }, + { + "epoch": 3.721838145587637, + "grad_norm": 11.389581025617586, + "learning_rate": 1.674872975166072e-05, + "loss": 0.2675, + "step": 9152 + }, + { + "epoch": 3.7222448149654332, + "grad_norm": 1.5298528671499476, + "learning_rate": 1.674798205108387e-05, + "loss": 0.0327, + "step": 9153 + }, + { + "epoch": 3.722651484343229, + "grad_norm": 10.302857702882081, + "learning_rate": 1.6747234281235587e-05, + "loss": 0.5002, + "step": 9154 + }, + { + "epoch": 3.7230581537210248, + "grad_norm": 16.83334806049367, + "learning_rate": 1.674648644212355e-05, + "loss": 0.7374, + "step": 9155 + }, + { + "epoch": 3.7234648230988205, + "grad_norm": 28.845402453276968, + "learning_rate": 1.6745738533755432e-05, + "loss": 0.778, + "step": 9156 + }, + { + "epoch": 3.7238714924766168, + "grad_norm": 6.978971017753225, + "learning_rate": 1.6744990556138915e-05, + "loss": 0.4135, + "step": 9157 + }, + { + "epoch": 3.7242781618544125, + "grad_norm": 9.405588322818375, + "learning_rate": 1.6744242509281673e-05, + "loss": 0.4209, + "step": 9158 + }, + { + "epoch": 3.7246848312322083, + "grad_norm": 7.037260488580126, + "learning_rate": 1.674349439319139e-05, + "loss": 0.2795, + "step": 9159 + }, + { + "epoch": 3.725091500610004, + "grad_norm": 1.0201708258960678, + "learning_rate": 1.674274620787574e-05, + "loss": 0.0238, + "step": 9160 + }, + { + "epoch": 3.7254981699878, + "grad_norm": 7.738379112806219, + "learning_rate": 1.674199795334241e-05, + "loss": 0.1393, + "step": 9161 + }, + { + "epoch": 3.7259048393655956, + "grad_norm": 5.18398528805653, + "learning_rate": 1.674124962959908e-05, + "loss": 0.308, + "step": 9162 + }, + { + "epoch": 3.726311508743392, + "grad_norm": 4.752377790000574, + "learning_rate": 1.6740501236653422e-05, + "loss": 0.1509, + "step": 9163 + }, + { + "epoch": 3.7267181781211876, + "grad_norm": 5.331808219339315, + "learning_rate": 1.673975277451313e-05, + "loss": 0.1085, + "step": 9164 + }, + { + "epoch": 3.7271248474989833, + "grad_norm": 97.9854479639779, + "learning_rate": 1.6739004243185884e-05, + "loss": 1.1975, + "step": 9165 + }, + { + "epoch": 3.727531516876779, + "grad_norm": 11.392552428737302, + "learning_rate": 1.673825564267937e-05, + "loss": 0.4136, + "step": 9166 + }, + { + "epoch": 3.7279381862545753, + "grad_norm": 1.8722106338624185, + "learning_rate": 1.6737506973001268e-05, + "loss": 0.0684, + "step": 9167 + }, + { + "epoch": 3.728344855632371, + "grad_norm": 20.434008377351915, + "learning_rate": 1.6736758234159267e-05, + "loss": 0.7688, + "step": 9168 + }, + { + "epoch": 3.728751525010167, + "grad_norm": 11.721542507126257, + "learning_rate": 1.6736009426161054e-05, + "loss": 
0.491, + "step": 9169 + }, + { + "epoch": 3.7291581943879626, + "grad_norm": 15.076675513064155, + "learning_rate": 1.673526054901431e-05, + "loss": 0.7094, + "step": 9170 + }, + { + "epoch": 3.7295648637657584, + "grad_norm": 6.698834734384064, + "learning_rate": 1.6734511602726725e-05, + "loss": 0.3454, + "step": 9171 + }, + { + "epoch": 3.729971533143554, + "grad_norm": 10.84395212502459, + "learning_rate": 1.6733762587305996e-05, + "loss": 1.0946, + "step": 9172 + }, + { + "epoch": 3.73037820252135, + "grad_norm": 7.867795973954468, + "learning_rate": 1.67330135027598e-05, + "loss": 0.2146, + "step": 9173 + }, + { + "epoch": 3.730784871899146, + "grad_norm": 3.017780706768176, + "learning_rate": 1.6732264349095834e-05, + "loss": 0.0978, + "step": 9174 + }, + { + "epoch": 3.731191541276942, + "grad_norm": 5.700109820514557, + "learning_rate": 1.6731515126321786e-05, + "loss": 0.1994, + "step": 9175 + }, + { + "epoch": 3.7315982106547376, + "grad_norm": 5.122482507409884, + "learning_rate": 1.6730765834445347e-05, + "loss": 0.1473, + "step": 9176 + }, + { + "epoch": 3.7320048800325334, + "grad_norm": 0.24930844734851207, + "learning_rate": 1.673001647347421e-05, + "loss": 0.0049, + "step": 9177 + }, + { + "epoch": 3.7324115494103296, + "grad_norm": 4.98078739259567, + "learning_rate": 1.6729267043416065e-05, + "loss": 0.1534, + "step": 9178 + }, + { + "epoch": 3.7328182187881254, + "grad_norm": 6.231691670314644, + "learning_rate": 1.6728517544278605e-05, + "loss": 0.2109, + "step": 9179 + }, + { + "epoch": 3.733224888165921, + "grad_norm": 5.136890573673785, + "learning_rate": 1.672776797606953e-05, + "loss": 0.305, + "step": 9180 + }, + { + "epoch": 3.733631557543717, + "grad_norm": 9.554194689594024, + "learning_rate": 1.6727018338796526e-05, + "loss": 0.4147, + "step": 9181 + }, + { + "epoch": 3.7340382269215127, + "grad_norm": 1.2942619279150356, + "learning_rate": 1.67262686324673e-05, + "loss": 0.0178, + "step": 9182 + }, + { + "epoch": 3.7344448962993084, + "grad_norm": 4.385615231830351, + "learning_rate": 1.672551885708954e-05, + "loss": 0.1335, + "step": 9183 + }, + { + "epoch": 3.7348515656771046, + "grad_norm": 7.304928449810633, + "learning_rate": 1.6724769012670937e-05, + "loss": 0.2408, + "step": 9184 + }, + { + "epoch": 3.7352582350549004, + "grad_norm": 9.77232725445207, + "learning_rate": 1.6724019099219203e-05, + "loss": 0.2982, + "step": 9185 + }, + { + "epoch": 3.735664904432696, + "grad_norm": 3.3720252241933837, + "learning_rate": 1.6723269116742026e-05, + "loss": 0.0686, + "step": 9186 + }, + { + "epoch": 3.736071573810492, + "grad_norm": 3.035482081955714, + "learning_rate": 1.6722519065247107e-05, + "loss": 0.0464, + "step": 9187 + }, + { + "epoch": 3.736478243188288, + "grad_norm": 10.282566577232451, + "learning_rate": 1.6721768944742147e-05, + "loss": 0.0922, + "step": 9188 + }, + { + "epoch": 3.736884912566084, + "grad_norm": 18.028704828891954, + "learning_rate": 1.6721018755234844e-05, + "loss": 0.6922, + "step": 9189 + }, + { + "epoch": 3.7372915819438797, + "grad_norm": 1.2640428810083937, + "learning_rate": 1.6720268496732906e-05, + "loss": 0.0237, + "step": 9190 + }, + { + "epoch": 3.7376982513216754, + "grad_norm": 6.314206353106977, + "learning_rate": 1.6719518169244024e-05, + "loss": 0.2154, + "step": 9191 + }, + { + "epoch": 3.738104920699471, + "grad_norm": 8.994130341464945, + "learning_rate": 1.671876777277591e-05, + "loss": 0.3665, + "step": 9192 + }, + { + "epoch": 3.738511590077267, + "grad_norm": 7.2195035454189895, + "learning_rate": 
1.6718017307336264e-05, + "loss": 0.4302, + "step": 9193 + }, + { + "epoch": 3.738918259455063, + "grad_norm": 9.916578690441616, + "learning_rate": 1.6717266772932784e-05, + "loss": 0.3282, + "step": 9194 + }, + { + "epoch": 3.739324928832859, + "grad_norm": 5.249904586465669, + "learning_rate": 1.6716516169573186e-05, + "loss": 0.0746, + "step": 9195 + }, + { + "epoch": 3.7397315982106547, + "grad_norm": 5.549019937813154, + "learning_rate": 1.6715765497265163e-05, + "loss": 0.1673, + "step": 9196 + }, + { + "epoch": 3.7401382675884505, + "grad_norm": 4.424387858387136, + "learning_rate": 1.671501475601643e-05, + "loss": 0.1118, + "step": 9197 + }, + { + "epoch": 3.7405449369662467, + "grad_norm": 10.737818813462, + "learning_rate": 1.6714263945834694e-05, + "loss": 0.4888, + "step": 9198 + }, + { + "epoch": 3.7409516063440424, + "grad_norm": 9.0736033223345, + "learning_rate": 1.6713513066727657e-05, + "loss": 0.5252, + "step": 9199 + }, + { + "epoch": 3.741358275721838, + "grad_norm": 11.21155962036153, + "learning_rate": 1.671276211870303e-05, + "loss": 0.7739, + "step": 9200 + }, + { + "epoch": 3.741764945099634, + "grad_norm": 7.480582331040698, + "learning_rate": 1.671201110176852e-05, + "loss": 0.1937, + "step": 9201 + }, + { + "epoch": 3.7421716144774297, + "grad_norm": 5.914016911798335, + "learning_rate": 1.6711260015931837e-05, + "loss": 0.2377, + "step": 9202 + }, + { + "epoch": 3.7425782838552255, + "grad_norm": 1.703330799238763, + "learning_rate": 1.6710508861200695e-05, + "loss": 0.0298, + "step": 9203 + }, + { + "epoch": 3.7429849532330217, + "grad_norm": 8.634448044749597, + "learning_rate": 1.67097576375828e-05, + "loss": 0.1606, + "step": 9204 + }, + { + "epoch": 3.7433916226108175, + "grad_norm": 6.997783502820414, + "learning_rate": 1.6709006345085866e-05, + "loss": 0.2113, + "step": 9205 + }, + { + "epoch": 3.7437982919886132, + "grad_norm": 2.668097492206534, + "learning_rate": 1.670825498371761e-05, + "loss": 0.0566, + "step": 9206 + }, + { + "epoch": 3.744204961366409, + "grad_norm": 4.624230465338007, + "learning_rate": 1.6707503553485735e-05, + "loss": 0.0857, + "step": 9207 + }, + { + "epoch": 3.744611630744205, + "grad_norm": 19.494866805085096, + "learning_rate": 1.670675205439796e-05, + "loss": 0.3612, + "step": 9208 + }, + { + "epoch": 3.745018300122001, + "grad_norm": 8.329980638622292, + "learning_rate": 1.6706000486462005e-05, + "loss": 0.4204, + "step": 9209 + }, + { + "epoch": 3.7454249694997968, + "grad_norm": 19.30623016847288, + "learning_rate": 1.6705248849685575e-05, + "loss": 1.5922, + "step": 9210 + }, + { + "epoch": 3.7458316388775925, + "grad_norm": 3.9463057391448535, + "learning_rate": 1.6704497144076395e-05, + "loss": 0.1052, + "step": 9211 + }, + { + "epoch": 3.7462383082553883, + "grad_norm": 11.949903728690122, + "learning_rate": 1.6703745369642173e-05, + "loss": 0.5418, + "step": 9212 + }, + { + "epoch": 3.746644977633184, + "grad_norm": 6.488129235705405, + "learning_rate": 1.6702993526390634e-05, + "loss": 0.3706, + "step": 9213 + }, + { + "epoch": 3.74705164701098, + "grad_norm": 5.919078475468782, + "learning_rate": 1.670224161432949e-05, + "loss": 0.0761, + "step": 9214 + }, + { + "epoch": 3.747458316388776, + "grad_norm": 7.052571242942108, + "learning_rate": 1.6701489633466467e-05, + "loss": 0.5986, + "step": 9215 + }, + { + "epoch": 3.747864985766572, + "grad_norm": 8.67624446455406, + "learning_rate": 1.6700737583809275e-05, + "loss": 0.232, + "step": 9216 + }, + { + "epoch": 3.7482716551443676, + "grad_norm": 
4.128754909003234, + "learning_rate": 1.6699985465365642e-05, + "loss": 0.0733, + "step": 9217 + }, + { + "epoch": 3.7486783245221633, + "grad_norm": 5.497536582804882, + "learning_rate": 1.6699233278143287e-05, + "loss": 0.231, + "step": 9218 + }, + { + "epoch": 3.7490849938999595, + "grad_norm": 3.946103773166171, + "learning_rate": 1.669848102214993e-05, + "loss": 0.2103, + "step": 9219 + }, + { + "epoch": 3.7494916632777553, + "grad_norm": 7.8974449076300575, + "learning_rate": 1.6697728697393292e-05, + "loss": 0.2926, + "step": 9220 + }, + { + "epoch": 3.749898332655551, + "grad_norm": 1.821467453181919, + "learning_rate": 1.66969763038811e-05, + "loss": 0.0347, + "step": 9221 + }, + { + "epoch": 3.750305002033347, + "grad_norm": 6.970637335190256, + "learning_rate": 1.669622384162108e-05, + "loss": 0.3515, + "step": 9222 + }, + { + "epoch": 3.7507116714111426, + "grad_norm": 8.768603718187522, + "learning_rate": 1.6695471310620945e-05, + "loss": 0.2752, + "step": 9223 + }, + { + "epoch": 3.7511183407889384, + "grad_norm": 4.771384438117326, + "learning_rate": 1.6694718710888427e-05, + "loss": 0.0957, + "step": 9224 + }, + { + "epoch": 3.7515250101667346, + "grad_norm": 9.67374151950873, + "learning_rate": 1.669396604243126e-05, + "loss": 0.533, + "step": 9225 + }, + { + "epoch": 3.7519316795445303, + "grad_norm": 3.6627624816255304, + "learning_rate": 1.6693213305257156e-05, + "loss": 0.0751, + "step": 9226 + }, + { + "epoch": 3.752338348922326, + "grad_norm": 4.517016830533736, + "learning_rate": 1.6692460499373848e-05, + "loss": 0.0798, + "step": 9227 + }, + { + "epoch": 3.752745018300122, + "grad_norm": 11.031332945039049, + "learning_rate": 1.669170762478907e-05, + "loss": 0.4872, + "step": 9228 + }, + { + "epoch": 3.753151687677918, + "grad_norm": 5.479990226334477, + "learning_rate": 1.669095468151054e-05, + "loss": 0.3826, + "step": 9229 + }, + { + "epoch": 3.753558357055714, + "grad_norm": 7.549357308040771, + "learning_rate": 1.6690201669545996e-05, + "loss": 0.2465, + "step": 9230 + }, + { + "epoch": 3.7539650264335096, + "grad_norm": 10.32459573620345, + "learning_rate": 1.6689448588903164e-05, + "loss": 0.4903, + "step": 9231 + }, + { + "epoch": 3.7543716958113054, + "grad_norm": 8.927931894601398, + "learning_rate": 1.6688695439589776e-05, + "loss": 0.3106, + "step": 9232 + }, + { + "epoch": 3.754778365189101, + "grad_norm": 0.37152605778323305, + "learning_rate": 1.668794222161356e-05, + "loss": 0.0041, + "step": 9233 + }, + { + "epoch": 3.755185034566897, + "grad_norm": 7.2208498464448, + "learning_rate": 1.6687188934982254e-05, + "loss": 0.2622, + "step": 9234 + }, + { + "epoch": 3.755591703944693, + "grad_norm": 0.18490777847115045, + "learning_rate": 1.668643557970359e-05, + "loss": 0.0036, + "step": 9235 + }, + { + "epoch": 3.755998373322489, + "grad_norm": 4.14505666933421, + "learning_rate": 1.6685682155785296e-05, + "loss": 0.0831, + "step": 9236 + }, + { + "epoch": 3.7564050427002846, + "grad_norm": 0.9165420001379889, + "learning_rate": 1.668492866323511e-05, + "loss": 0.0169, + "step": 9237 + }, + { + "epoch": 3.7568117120780804, + "grad_norm": 11.739081164414086, + "learning_rate": 1.668417510206077e-05, + "loss": 0.8659, + "step": 9238 + }, + { + "epoch": 3.7572183814558766, + "grad_norm": 14.36754131581561, + "learning_rate": 1.6683421472270004e-05, + "loss": 0.3331, + "step": 9239 + }, + { + "epoch": 3.7576250508336724, + "grad_norm": 1.7921455783200728, + "learning_rate": 1.6682667773870556e-05, + "loss": 0.0252, + "step": 9240 + }, + { + "epoch": 
3.758031720211468, + "grad_norm": 10.097658951277527, + "learning_rate": 1.6681914006870157e-05, + "loss": 0.1947, + "step": 9241 + }, + { + "epoch": 3.758438389589264, + "grad_norm": 1.1465720514000108, + "learning_rate": 1.668116017127655e-05, + "loss": 0.0179, + "step": 9242 + }, + { + "epoch": 3.7588450589670597, + "grad_norm": 4.904675872496029, + "learning_rate": 1.6680406267097474e-05, + "loss": 0.2598, + "step": 9243 + }, + { + "epoch": 3.7592517283448554, + "grad_norm": 16.16279781385065, + "learning_rate": 1.667965229434066e-05, + "loss": 0.6576, + "step": 9244 + }, + { + "epoch": 3.7596583977226516, + "grad_norm": 8.58086325885156, + "learning_rate": 1.667889825301386e-05, + "loss": 0.2801, + "step": 9245 + }, + { + "epoch": 3.7600650671004474, + "grad_norm": 14.829332010377113, + "learning_rate": 1.66781441431248e-05, + "loss": 0.8406, + "step": 9246 + }, + { + "epoch": 3.760471736478243, + "grad_norm": 8.379230648678934, + "learning_rate": 1.6677389964681233e-05, + "loss": 0.2774, + "step": 9247 + }, + { + "epoch": 3.760878405856039, + "grad_norm": 12.269608421319816, + "learning_rate": 1.66766357176909e-05, + "loss": 0.4209, + "step": 9248 + }, + { + "epoch": 3.761285075233835, + "grad_norm": 27.119344912876684, + "learning_rate": 1.667588140216154e-05, + "loss": 0.519, + "step": 9249 + }, + { + "epoch": 3.761691744611631, + "grad_norm": 9.675890640676876, + "learning_rate": 1.667512701810089e-05, + "loss": 0.2823, + "step": 9250 + }, + { + "epoch": 3.7620984139894267, + "grad_norm": 4.884082695615603, + "learning_rate": 1.6674372565516713e-05, + "loss": 0.1214, + "step": 9251 + }, + { + "epoch": 3.7625050833672224, + "grad_norm": 9.002289623593365, + "learning_rate": 1.6673618044416736e-05, + "loss": 0.3031, + "step": 9252 + }, + { + "epoch": 3.762911752745018, + "grad_norm": 6.563047315986063, + "learning_rate": 1.6672863454808713e-05, + "loss": 0.2997, + "step": 9253 + }, + { + "epoch": 3.763318422122814, + "grad_norm": 25.097117853586845, + "learning_rate": 1.6672108796700387e-05, + "loss": 1.0461, + "step": 9254 + }, + { + "epoch": 3.7637250915006097, + "grad_norm": 0.5587357118178451, + "learning_rate": 1.6671354070099507e-05, + "loss": 0.0108, + "step": 9255 + }, + { + "epoch": 3.764131760878406, + "grad_norm": 17.87981958996386, + "learning_rate": 1.667059927501382e-05, + "loss": 1.4364, + "step": 9256 + }, + { + "epoch": 3.7645384302562017, + "grad_norm": 6.52577394229092, + "learning_rate": 1.6669844411451073e-05, + "loss": 0.1834, + "step": 9257 + }, + { + "epoch": 3.7649450996339975, + "grad_norm": 2.878512783469194, + "learning_rate": 1.6669089479419018e-05, + "loss": 0.1365, + "step": 9258 + }, + { + "epoch": 3.7653517690117932, + "grad_norm": 3.997557401686048, + "learning_rate": 1.6668334478925402e-05, + "loss": 0.149, + "step": 9259 + }, + { + "epoch": 3.7657584383895895, + "grad_norm": 1.3632921288663467, + "learning_rate": 1.6667579409977975e-05, + "loss": 0.0241, + "step": 9260 + }, + { + "epoch": 3.7661651077673852, + "grad_norm": 6.565599205708798, + "learning_rate": 1.6666824272584493e-05, + "loss": 0.1632, + "step": 9261 + }, + { + "epoch": 3.766571777145181, + "grad_norm": 4.562382666748038, + "learning_rate": 1.6666069066752703e-05, + "loss": 0.1579, + "step": 9262 + }, + { + "epoch": 3.7669784465229768, + "grad_norm": 6.113123541159661, + "learning_rate": 1.6665313792490358e-05, + "loss": 0.1959, + "step": 9263 + }, + { + "epoch": 3.7673851159007725, + "grad_norm": 2.9283070917755136, + "learning_rate": 1.6664558449805213e-05, + "loss": 0.0602, + 
"step": 9264 + }, + { + "epoch": 3.7677917852785683, + "grad_norm": 7.6584272115933025, + "learning_rate": 1.6663803038705024e-05, + "loss": 0.1803, + "step": 9265 + }, + { + "epoch": 3.7681984546563645, + "grad_norm": 18.07410110161628, + "learning_rate": 1.666304755919754e-05, + "loss": 0.4663, + "step": 9266 + }, + { + "epoch": 3.7686051240341603, + "grad_norm": 9.911377686991655, + "learning_rate": 1.666229201129052e-05, + "loss": 0.3195, + "step": 9267 + }, + { + "epoch": 3.769011793411956, + "grad_norm": 9.945757387733213, + "learning_rate": 1.666153639499172e-05, + "loss": 0.1539, + "step": 9268 + }, + { + "epoch": 3.769418462789752, + "grad_norm": 9.585300057504819, + "learning_rate": 1.6660780710308894e-05, + "loss": 0.3749, + "step": 9269 + }, + { + "epoch": 3.769825132167548, + "grad_norm": 8.07826921993268, + "learning_rate": 1.66600249572498e-05, + "loss": 0.2501, + "step": 9270 + }, + { + "epoch": 3.7702318015453438, + "grad_norm": 8.05492292415323, + "learning_rate": 1.6659269135822204e-05, + "loss": 0.1371, + "step": 9271 + }, + { + "epoch": 3.7706384709231395, + "grad_norm": 10.542106483974209, + "learning_rate": 1.6658513246033857e-05, + "loss": 0.2629, + "step": 9272 + }, + { + "epoch": 3.7710451403009353, + "grad_norm": 9.352097089683973, + "learning_rate": 1.6657757287892517e-05, + "loss": 0.2709, + "step": 9273 + }, + { + "epoch": 3.771451809678731, + "grad_norm": 4.432023503444525, + "learning_rate": 1.665700126140595e-05, + "loss": 0.1085, + "step": 9274 + }, + { + "epoch": 3.771858479056527, + "grad_norm": 7.5694684583126755, + "learning_rate": 1.6656245166581915e-05, + "loss": 0.2418, + "step": 9275 + }, + { + "epoch": 3.772265148434323, + "grad_norm": 9.718301832979467, + "learning_rate": 1.6655489003428173e-05, + "loss": 0.26, + "step": 9276 + }, + { + "epoch": 3.772671817812119, + "grad_norm": 13.764983173791512, + "learning_rate": 1.6654732771952488e-05, + "loss": 0.4493, + "step": 9277 + }, + { + "epoch": 3.7730784871899146, + "grad_norm": 6.570981344106435, + "learning_rate": 1.665397647216262e-05, + "loss": 0.1948, + "step": 9278 + }, + { + "epoch": 3.7734851565677103, + "grad_norm": 1.0593637126643862, + "learning_rate": 1.6653220104066336e-05, + "loss": 0.0192, + "step": 9279 + }, + { + "epoch": 3.7738918259455065, + "grad_norm": 10.05758137025918, + "learning_rate": 1.6652463667671395e-05, + "loss": 0.2604, + "step": 9280 + }, + { + "epoch": 3.7742984953233023, + "grad_norm": 3.9043820973217374, + "learning_rate": 1.6651707162985572e-05, + "loss": 0.0938, + "step": 9281 + }, + { + "epoch": 3.774705164701098, + "grad_norm": 4.698769968321301, + "learning_rate": 1.6650950590016622e-05, + "loss": 0.2384, + "step": 9282 + }, + { + "epoch": 3.775111834078894, + "grad_norm": 1.6628795808373165, + "learning_rate": 1.665019394877232e-05, + "loss": 0.0176, + "step": 9283 + }, + { + "epoch": 3.7755185034566896, + "grad_norm": 1.9885303026563699, + "learning_rate": 1.6649437239260432e-05, + "loss": 0.0143, + "step": 9284 + }, + { + "epoch": 3.7759251728344854, + "grad_norm": 10.885982180371336, + "learning_rate": 1.664868046148872e-05, + "loss": 0.3357, + "step": 9285 + }, + { + "epoch": 3.7763318422122816, + "grad_norm": 1.1211733945654785, + "learning_rate": 1.664792361546496e-05, + "loss": 0.0189, + "step": 9286 + }, + { + "epoch": 3.7767385115900773, + "grad_norm": 9.296991554749775, + "learning_rate": 1.664716670119692e-05, + "loss": 0.5137, + "step": 9287 + }, + { + "epoch": 3.777145180967873, + "grad_norm": 1.6882238406117016, + "learning_rate": 
1.6646409718692363e-05, + "loss": 0.0269, + "step": 9288 + }, + { + "epoch": 3.777551850345669, + "grad_norm": 11.627132090628677, + "learning_rate": 1.6645652667959067e-05, + "loss": 0.7026, + "step": 9289 + }, + { + "epoch": 3.777958519723465, + "grad_norm": 5.148891227826898, + "learning_rate": 1.66448955490048e-05, + "loss": 0.1012, + "step": 9290 + }, + { + "epoch": 3.778365189101261, + "grad_norm": 8.549812764262551, + "learning_rate": 1.664413836183734e-05, + "loss": 0.3755, + "step": 9291 + }, + { + "epoch": 3.7787718584790566, + "grad_norm": 0.3792515591089541, + "learning_rate": 1.6643381106464454e-05, + "loss": 0.0071, + "step": 9292 + }, + { + "epoch": 3.7791785278568524, + "grad_norm": 19.3438024418816, + "learning_rate": 1.664262378289392e-05, + "loss": 0.7717, + "step": 9293 + }, + { + "epoch": 3.779585197234648, + "grad_norm": 13.056512813185059, + "learning_rate": 1.6641866391133504e-05, + "loss": 0.4493, + "step": 9294 + }, + { + "epoch": 3.779991866612444, + "grad_norm": 9.393344826837147, + "learning_rate": 1.664110893119099e-05, + "loss": 0.3148, + "step": 9295 + }, + { + "epoch": 3.7803985359902397, + "grad_norm": 13.903213199999469, + "learning_rate": 1.664035140307415e-05, + "loss": 0.6325, + "step": 9296 + }, + { + "epoch": 3.780805205368036, + "grad_norm": 1.1438307166107236, + "learning_rate": 1.663959380679076e-05, + "loss": 0.0177, + "step": 9297 + }, + { + "epoch": 3.7812118747458316, + "grad_norm": 4.978024446963421, + "learning_rate": 1.6638836142348602e-05, + "loss": 0.3231, + "step": 9298 + }, + { + "epoch": 3.7816185441236274, + "grad_norm": 5.612552294024221, + "learning_rate": 1.6638078409755447e-05, + "loss": 0.141, + "step": 9299 + }, + { + "epoch": 3.782025213501423, + "grad_norm": 0.10607916070093114, + "learning_rate": 1.6637320609019073e-05, + "loss": 0.0016, + "step": 9300 + }, + { + "epoch": 3.7824318828792194, + "grad_norm": 3.809912773799094, + "learning_rate": 1.6636562740147268e-05, + "loss": 0.0838, + "step": 9301 + }, + { + "epoch": 3.782838552257015, + "grad_norm": 14.921074932679131, + "learning_rate": 1.6635804803147803e-05, + "loss": 0.6566, + "step": 9302 + }, + { + "epoch": 3.783245221634811, + "grad_norm": 7.562723620267104, + "learning_rate": 1.663504679802846e-05, + "loss": 0.0865, + "step": 9303 + }, + { + "epoch": 3.7836518910126067, + "grad_norm": 0.43299665540318555, + "learning_rate": 1.6634288724797028e-05, + "loss": 0.0075, + "step": 9304 + }, + { + "epoch": 3.7840585603904024, + "grad_norm": 12.109369389009714, + "learning_rate": 1.663353058346128e-05, + "loss": 0.4324, + "step": 9305 + }, + { + "epoch": 3.784465229768198, + "grad_norm": 9.089653679942865, + "learning_rate": 1.6632772374029002e-05, + "loss": 0.5363, + "step": 9306 + }, + { + "epoch": 3.7848718991459944, + "grad_norm": 5.3361684872051125, + "learning_rate": 1.6632014096507977e-05, + "loss": 0.2447, + "step": 9307 + }, + { + "epoch": 3.78527856852379, + "grad_norm": 0.4324790345863017, + "learning_rate": 1.663125575090599e-05, + "loss": 0.0109, + "step": 9308 + }, + { + "epoch": 3.785685237901586, + "grad_norm": 2.291146951686492, + "learning_rate": 1.6630497337230824e-05, + "loss": 0.0341, + "step": 9309 + }, + { + "epoch": 3.7860919072793817, + "grad_norm": 2.983522479376737, + "learning_rate": 1.662973885549027e-05, + "loss": 0.0628, + "step": 9310 + }, + { + "epoch": 3.786498576657178, + "grad_norm": 1.7919570031824232, + "learning_rate": 1.6628980305692105e-05, + "loss": 0.0269, + "step": 9311 + }, + { + "epoch": 3.7869052460349737, + "grad_norm": 
2.916695043490639, + "learning_rate": 1.6628221687844125e-05, + "loss": 0.0516, + "step": 9312 + }, + { + "epoch": 3.7873119154127695, + "grad_norm": 1.176509594268297, + "learning_rate": 1.662746300195411e-05, + "loss": 0.0197, + "step": 9313 + }, + { + "epoch": 3.7877185847905652, + "grad_norm": 12.420036162420315, + "learning_rate": 1.6626704248029853e-05, + "loss": 0.5785, + "step": 9314 + }, + { + "epoch": 3.788125254168361, + "grad_norm": 2.2359103312135202, + "learning_rate": 1.6625945426079144e-05, + "loss": 0.1157, + "step": 9315 + }, + { + "epoch": 3.7885319235461568, + "grad_norm": 5.617561527003797, + "learning_rate": 1.6625186536109768e-05, + "loss": 0.1956, + "step": 9316 + }, + { + "epoch": 3.788938592923953, + "grad_norm": 7.8828243611381685, + "learning_rate": 1.6624427578129518e-05, + "loss": 0.4552, + "step": 9317 + }, + { + "epoch": 3.7893452623017487, + "grad_norm": 6.640500254160583, + "learning_rate": 1.6623668552146183e-05, + "loss": 0.4894, + "step": 9318 + }, + { + "epoch": 3.7897519316795445, + "grad_norm": 4.908938295701597, + "learning_rate": 1.6622909458167563e-05, + "loss": 0.103, + "step": 9319 + }, + { + "epoch": 3.7901586010573403, + "grad_norm": 44.88752373121373, + "learning_rate": 1.662215029620144e-05, + "loss": 1.3091, + "step": 9320 + }, + { + "epoch": 3.7905652704351365, + "grad_norm": 1.413690668643675, + "learning_rate": 1.6621391066255607e-05, + "loss": 0.0354, + "step": 9321 + }, + { + "epoch": 3.7909719398129322, + "grad_norm": 0.28126222303274645, + "learning_rate": 1.6620631768337867e-05, + "loss": 0.0041, + "step": 9322 + }, + { + "epoch": 3.791378609190728, + "grad_norm": 3.827704513076314, + "learning_rate": 1.6619872402456012e-05, + "loss": 0.1375, + "step": 9323 + }, + { + "epoch": 3.7917852785685238, + "grad_norm": 0.31778045859555154, + "learning_rate": 1.6619112968617833e-05, + "loss": 0.0058, + "step": 9324 + }, + { + "epoch": 3.7921919479463195, + "grad_norm": 10.78926583969636, + "learning_rate": 1.6618353466831124e-05, + "loss": 0.3398, + "step": 9325 + }, + { + "epoch": 3.7925986173241153, + "grad_norm": 10.483764200888038, + "learning_rate": 1.661759389710369e-05, + "loss": 0.7671, + "step": 9326 + }, + { + "epoch": 3.7930052867019115, + "grad_norm": 5.709891027256153, + "learning_rate": 1.6616834259443325e-05, + "loss": 0.292, + "step": 9327 + }, + { + "epoch": 3.7934119560797073, + "grad_norm": 7.350553510221107, + "learning_rate": 1.6616074553857823e-05, + "loss": 0.408, + "step": 9328 + }, + { + "epoch": 3.793818625457503, + "grad_norm": 13.837328540151951, + "learning_rate": 1.6615314780354986e-05, + "loss": 0.4566, + "step": 9329 + }, + { + "epoch": 3.794225294835299, + "grad_norm": 10.588228279455857, + "learning_rate": 1.6614554938942618e-05, + "loss": 0.3522, + "step": 9330 + }, + { + "epoch": 3.794631964213095, + "grad_norm": 6.745120141100647, + "learning_rate": 1.661379502962851e-05, + "loss": 0.2929, + "step": 9331 + }, + { + "epoch": 3.7950386335908908, + "grad_norm": 7.409564339731654, + "learning_rate": 1.6613035052420468e-05, + "loss": 0.4499, + "step": 9332 + }, + { + "epoch": 3.7954453029686865, + "grad_norm": 5.249030342363651, + "learning_rate": 1.6612275007326295e-05, + "loss": 0.1634, + "step": 9333 + }, + { + "epoch": 3.7958519723464823, + "grad_norm": 8.3792702954798, + "learning_rate": 1.6611514894353788e-05, + "loss": 0.6343, + "step": 9334 + }, + { + "epoch": 3.796258641724278, + "grad_norm": 8.321368488300482, + "learning_rate": 1.6610754713510754e-05, + "loss": 0.2321, + "step": 9335 + }, + { + 
"epoch": 3.796665311102074, + "grad_norm": 9.362072618406433, + "learning_rate": 1.6609994464805e-05, + "loss": 0.3247, + "step": 9336 + }, + { + "epoch": 3.7970719804798696, + "grad_norm": 1.5320573422361343, + "learning_rate": 1.6609234148244318e-05, + "loss": 0.0282, + "step": 9337 + }, + { + "epoch": 3.797478649857666, + "grad_norm": 7.733266003892554, + "learning_rate": 1.660847376383653e-05, + "loss": 0.5994, + "step": 9338 + }, + { + "epoch": 3.7978853192354616, + "grad_norm": 0.6207590183929539, + "learning_rate": 1.6607713311589427e-05, + "loss": 0.009, + "step": 9339 + }, + { + "epoch": 3.7982919886132573, + "grad_norm": 7.914495294702258, + "learning_rate": 1.6606952791510823e-05, + "loss": 0.3093, + "step": 9340 + }, + { + "epoch": 3.7986986579910536, + "grad_norm": 8.090504424881729, + "learning_rate": 1.660619220360852e-05, + "loss": 0.4061, + "step": 9341 + }, + { + "epoch": 3.7991053273688493, + "grad_norm": 6.578501024692099, + "learning_rate": 1.6605431547890333e-05, + "loss": 0.3009, + "step": 9342 + }, + { + "epoch": 3.799511996746645, + "grad_norm": 7.9788436910914085, + "learning_rate": 1.6604670824364067e-05, + "loss": 0.3787, + "step": 9343 + }, + { + "epoch": 3.799918666124441, + "grad_norm": 7.717949266830985, + "learning_rate": 1.660391003303753e-05, + "loss": 0.4503, + "step": 9344 + }, + { + "epoch": 3.8003253355022366, + "grad_norm": 7.932017970938481, + "learning_rate": 1.6603149173918535e-05, + "loss": 0.6833, + "step": 9345 + }, + { + "epoch": 3.8007320048800324, + "grad_norm": 5.235957603458041, + "learning_rate": 1.6602388247014888e-05, + "loss": 0.2228, + "step": 9346 + }, + { + "epoch": 3.801138674257828, + "grad_norm": 10.954615986361171, + "learning_rate": 1.6601627252334402e-05, + "loss": 0.4951, + "step": 9347 + }, + { + "epoch": 3.8015453436356244, + "grad_norm": 6.918647045740993, + "learning_rate": 1.660086618988489e-05, + "loss": 0.0813, + "step": 9348 + }, + { + "epoch": 3.80195201301342, + "grad_norm": 3.1251213541160814, + "learning_rate": 1.6600105059674167e-05, + "loss": 0.0664, + "step": 9349 + }, + { + "epoch": 3.802358682391216, + "grad_norm": 2.421379383389131, + "learning_rate": 1.659934386171004e-05, + "loss": 0.0423, + "step": 9350 + }, + { + "epoch": 3.8027653517690116, + "grad_norm": 3.9385238831180107, + "learning_rate": 1.659858259600033e-05, + "loss": 0.1096, + "step": 9351 + }, + { + "epoch": 3.803172021146808, + "grad_norm": 12.332709503734515, + "learning_rate": 1.6597821262552847e-05, + "loss": 0.5271, + "step": 9352 + }, + { + "epoch": 3.8035786905246036, + "grad_norm": 3.6459350419262018, + "learning_rate": 1.659705986137541e-05, + "loss": 0.0705, + "step": 9353 + }, + { + "epoch": 3.8039853599023994, + "grad_norm": 1.851780304708908, + "learning_rate": 1.659629839247583e-05, + "loss": 0.066, + "step": 9354 + }, + { + "epoch": 3.804392029280195, + "grad_norm": 15.584424102656891, + "learning_rate": 1.659553685586193e-05, + "loss": 0.8827, + "step": 9355 + }, + { + "epoch": 3.804798698657991, + "grad_norm": 5.358540436889927, + "learning_rate": 1.6594775251541522e-05, + "loss": 0.0945, + "step": 9356 + }, + { + "epoch": 3.8052053680357867, + "grad_norm": 7.962219030780525, + "learning_rate": 1.659401357952243e-05, + "loss": 0.2406, + "step": 9357 + }, + { + "epoch": 3.805612037413583, + "grad_norm": 6.489376501439594, + "learning_rate": 1.6593251839812468e-05, + "loss": 0.1333, + "step": 9358 + }, + { + "epoch": 3.8060187067913787, + "grad_norm": 3.132726036026693, + "learning_rate": 1.6592490032419458e-05, + "loss": 0.058, 
+ "step": 9359 + }, + { + "epoch": 3.8064253761691744, + "grad_norm": 0.08151231507453563, + "learning_rate": 1.6591728157351222e-05, + "loss": 0.0018, + "step": 9360 + }, + { + "epoch": 3.80683204554697, + "grad_norm": 9.484826350072693, + "learning_rate": 1.6590966214615575e-05, + "loss": 0.4338, + "step": 9361 + }, + { + "epoch": 3.8072387149247664, + "grad_norm": 4.6018707158203, + "learning_rate": 1.6590204204220345e-05, + "loss": 0.2628, + "step": 9362 + }, + { + "epoch": 3.807645384302562, + "grad_norm": 3.6773764689498134, + "learning_rate": 1.6589442126173353e-05, + "loss": 0.0577, + "step": 9363 + }, + { + "epoch": 3.808052053680358, + "grad_norm": 7.634256494272209, + "learning_rate": 1.6588679980482417e-05, + "loss": 0.293, + "step": 9364 + }, + { + "epoch": 3.8084587230581537, + "grad_norm": 1.8693110807505662, + "learning_rate": 1.658791776715537e-05, + "loss": 0.0261, + "step": 9365 + }, + { + "epoch": 3.8088653924359495, + "grad_norm": 4.760922121120005, + "learning_rate": 1.658715548620003e-05, + "loss": 0.0871, + "step": 9366 + }, + { + "epoch": 3.8092720618137452, + "grad_norm": 4.033367690153475, + "learning_rate": 1.6586393137624224e-05, + "loss": 0.0728, + "step": 9367 + }, + { + "epoch": 3.8096787311915414, + "grad_norm": 4.881709495703302, + "learning_rate": 1.6585630721435774e-05, + "loss": 0.0945, + "step": 9368 + }, + { + "epoch": 3.810085400569337, + "grad_norm": 11.299487076850934, + "learning_rate": 1.6584868237642514e-05, + "loss": 0.3653, + "step": 9369 + }, + { + "epoch": 3.810492069947133, + "grad_norm": 9.921977098758163, + "learning_rate": 1.6584105686252267e-05, + "loss": 0.524, + "step": 9370 + }, + { + "epoch": 3.8108987393249287, + "grad_norm": 1.112140780547463, + "learning_rate": 1.658334306727286e-05, + "loss": 0.0148, + "step": 9371 + }, + { + "epoch": 3.811305408702725, + "grad_norm": 0.7025900169568248, + "learning_rate": 1.6582580380712122e-05, + "loss": 0.0116, + "step": 9372 + }, + { + "epoch": 3.8117120780805207, + "grad_norm": 5.486281220612279, + "learning_rate": 1.6581817626577885e-05, + "loss": 0.172, + "step": 9373 + }, + { + "epoch": 3.8121187474583165, + "grad_norm": 1.7400197609959605, + "learning_rate": 1.6581054804877978e-05, + "loss": 0.0258, + "step": 9374 + }, + { + "epoch": 3.8125254168361122, + "grad_norm": 6.465277852872232, + "learning_rate": 1.6580291915620234e-05, + "loss": 0.3068, + "step": 9375 + }, + { + "epoch": 3.812932086213908, + "grad_norm": 4.505366434314204, + "learning_rate": 1.6579528958812476e-05, + "loss": 0.0928, + "step": 9376 + }, + { + "epoch": 3.8133387555917038, + "grad_norm": 2.265121272614264, + "learning_rate": 1.6578765934462545e-05, + "loss": 0.0378, + "step": 9377 + }, + { + "epoch": 3.8137454249694995, + "grad_norm": 3.720048566184742, + "learning_rate": 1.657800284257827e-05, + "loss": 0.0727, + "step": 9378 + }, + { + "epoch": 3.8141520943472957, + "grad_norm": 6.719590772305393, + "learning_rate": 1.6577239683167486e-05, + "loss": 0.1391, + "step": 9379 + }, + { + "epoch": 3.8145587637250915, + "grad_norm": 12.211274840087746, + "learning_rate": 1.6576476456238024e-05, + "loss": 0.3139, + "step": 9380 + }, + { + "epoch": 3.8149654331028873, + "grad_norm": 13.36449430996524, + "learning_rate": 1.6575713161797723e-05, + "loss": 0.9916, + "step": 9381 + }, + { + "epoch": 3.8153721024806835, + "grad_norm": 12.661088771169721, + "learning_rate": 1.6574949799854415e-05, + "loss": 0.3341, + "step": 9382 + }, + { + "epoch": 3.8157787718584792, + "grad_norm": 7.124078457424106, + "learning_rate": 
1.6574186370415944e-05, + "loss": 0.2738, + "step": 9383 + }, + { + "epoch": 3.816185441236275, + "grad_norm": 10.512314255142904, + "learning_rate": 1.6573422873490136e-05, + "loss": 0.1683, + "step": 9384 + }, + { + "epoch": 3.8165921106140708, + "grad_norm": 6.449383230674425, + "learning_rate": 1.6572659309084836e-05, + "loss": 0.1278, + "step": 9385 + }, + { + "epoch": 3.8169987799918665, + "grad_norm": 3.23848140681652, + "learning_rate": 1.657189567720788e-05, + "loss": 0.0684, + "step": 9386 + }, + { + "epoch": 3.8174054493696623, + "grad_norm": 8.248534023108924, + "learning_rate": 1.6571131977867106e-05, + "loss": 0.4417, + "step": 9387 + }, + { + "epoch": 3.817812118747458, + "grad_norm": 0.49435176220424343, + "learning_rate": 1.6570368211070357e-05, + "loss": 0.0184, + "step": 9388 + }, + { + "epoch": 3.8182187881252543, + "grad_norm": 5.233554092264589, + "learning_rate": 1.656960437682547e-05, + "loss": 0.1485, + "step": 9389 + }, + { + "epoch": 3.81862545750305, + "grad_norm": 1.818779218970494, + "learning_rate": 1.6568840475140285e-05, + "loss": 0.0374, + "step": 9390 + }, + { + "epoch": 3.819032126880846, + "grad_norm": 19.76419975057585, + "learning_rate": 1.6568076506022652e-05, + "loss": 0.9792, + "step": 9391 + }, + { + "epoch": 3.8194387962586416, + "grad_norm": 7.746082463372867, + "learning_rate": 1.6567312469480404e-05, + "loss": 0.3002, + "step": 9392 + }, + { + "epoch": 3.819845465636438, + "grad_norm": 7.356382167418049, + "learning_rate": 1.6566548365521387e-05, + "loss": 0.2765, + "step": 9393 + }, + { + "epoch": 3.8202521350142336, + "grad_norm": 7.189911924797197, + "learning_rate": 1.6565784194153453e-05, + "loss": 0.2607, + "step": 9394 + }, + { + "epoch": 3.8206588043920293, + "grad_norm": 8.32310874584821, + "learning_rate": 1.6565019955384433e-05, + "loss": 0.4194, + "step": 9395 + }, + { + "epoch": 3.821065473769825, + "grad_norm": 15.285195616494228, + "learning_rate": 1.6564255649222182e-05, + "loss": 0.3189, + "step": 9396 + }, + { + "epoch": 3.821472143147621, + "grad_norm": 2.9391171554375815, + "learning_rate": 1.6563491275674544e-05, + "loss": 0.0611, + "step": 9397 + }, + { + "epoch": 3.8218788125254166, + "grad_norm": 7.8361407146769135, + "learning_rate": 1.6562726834749365e-05, + "loss": 0.2756, + "step": 9398 + }, + { + "epoch": 3.822285481903213, + "grad_norm": 7.328846529475971, + "learning_rate": 1.6561962326454488e-05, + "loss": 0.1791, + "step": 9399 + }, + { + "epoch": 3.8226921512810086, + "grad_norm": 3.2456497574159386, + "learning_rate": 1.6561197750797768e-05, + "loss": 0.0743, + "step": 9400 + }, + { + "epoch": 3.8230988206588044, + "grad_norm": 14.400677570623133, + "learning_rate": 1.6560433107787052e-05, + "loss": 0.9459, + "step": 9401 + }, + { + "epoch": 3.8235054900366, + "grad_norm": 8.87647348138639, + "learning_rate": 1.6559668397430186e-05, + "loss": 0.3294, + "step": 9402 + }, + { + "epoch": 3.8239121594143963, + "grad_norm": 7.073703174666116, + "learning_rate": 1.6558903619735024e-05, + "loss": 0.2338, + "step": 9403 + }, + { + "epoch": 3.824318828792192, + "grad_norm": 11.292733370103655, + "learning_rate": 1.6558138774709412e-05, + "loss": 0.2495, + "step": 9404 + }, + { + "epoch": 3.824725498169988, + "grad_norm": 7.778586156975311, + "learning_rate": 1.655737386236121e-05, + "loss": 0.2346, + "step": 9405 + }, + { + "epoch": 3.8251321675477836, + "grad_norm": 6.546828745774286, + "learning_rate": 1.655660888269826e-05, + "loss": 0.6152, + "step": 9406 + }, + { + "epoch": 3.8255388369255794, + "grad_norm": 
0.3776850237198339, + "learning_rate": 1.6555843835728424e-05, + "loss": 0.0049, + "step": 9407 + }, + { + "epoch": 3.825945506303375, + "grad_norm": 6.654645378395725, + "learning_rate": 1.655507872145955e-05, + "loss": 0.2493, + "step": 9408 + }, + { + "epoch": 3.8263521756811714, + "grad_norm": 8.891358684415163, + "learning_rate": 1.6554313539899496e-05, + "loss": 0.5704, + "step": 9409 + }, + { + "epoch": 3.826758845058967, + "grad_norm": 6.359045778494689, + "learning_rate": 1.6553548291056114e-05, + "loss": 0.3168, + "step": 9410 + }, + { + "epoch": 3.827165514436763, + "grad_norm": 1.9454687593963962, + "learning_rate": 1.655278297493726e-05, + "loss": 0.1061, + "step": 9411 + }, + { + "epoch": 3.8275721838145587, + "grad_norm": 10.447095223681156, + "learning_rate": 1.655201759155079e-05, + "loss": 0.4545, + "step": 9412 + }, + { + "epoch": 3.827978853192355, + "grad_norm": 10.58108773856773, + "learning_rate": 1.6551252140904558e-05, + "loss": 0.4947, + "step": 9413 + }, + { + "epoch": 3.8283855225701506, + "grad_norm": 0.9511644314201374, + "learning_rate": 1.6550486623006433e-05, + "loss": 0.0215, + "step": 9414 + }, + { + "epoch": 3.8287921919479464, + "grad_norm": 2.918828191589879, + "learning_rate": 1.6549721037864263e-05, + "loss": 0.0617, + "step": 9415 + }, + { + "epoch": 3.829198861325742, + "grad_norm": 9.008584262721989, + "learning_rate": 1.6548955385485904e-05, + "loss": 0.45, + "step": 9416 + }, + { + "epoch": 3.829605530703538, + "grad_norm": 0.333009777273485, + "learning_rate": 1.654818966587923e-05, + "loss": 0.0051, + "step": 9417 + }, + { + "epoch": 3.8300122000813337, + "grad_norm": 11.50895701121164, + "learning_rate": 1.654742387905209e-05, + "loss": 0.448, + "step": 9418 + }, + { + "epoch": 3.8304188694591295, + "grad_norm": 4.60739340256407, + "learning_rate": 1.654665802501235e-05, + "loss": 0.1114, + "step": 9419 + }, + { + "epoch": 3.8308255388369257, + "grad_norm": 3.3973258493419474, + "learning_rate": 1.654589210376787e-05, + "loss": 0.0903, + "step": 9420 + }, + { + "epoch": 3.8312322082147214, + "grad_norm": 0.8014295726417138, + "learning_rate": 1.6545126115326514e-05, + "loss": 0.0109, + "step": 9421 + }, + { + "epoch": 3.831638877592517, + "grad_norm": 2.0683607963296757, + "learning_rate": 1.6544360059696142e-05, + "loss": 0.0223, + "step": 9422 + }, + { + "epoch": 3.8320455469703134, + "grad_norm": 4.3372306723731215, + "learning_rate": 1.6543593936884623e-05, + "loss": 0.1307, + "step": 9423 + }, + { + "epoch": 3.832452216348109, + "grad_norm": 0.6702253412477281, + "learning_rate": 1.654282774689982e-05, + "loss": 0.015, + "step": 9424 + }, + { + "epoch": 3.832858885725905, + "grad_norm": 4.511380085017339, + "learning_rate": 1.6542061489749594e-05, + "loss": 0.1702, + "step": 9425 + }, + { + "epoch": 3.8332655551037007, + "grad_norm": 15.991845854057551, + "learning_rate": 1.6541295165441815e-05, + "loss": 0.5731, + "step": 9426 + }, + { + "epoch": 3.8336722244814965, + "grad_norm": 7.610486357290723, + "learning_rate": 1.6540528773984347e-05, + "loss": 0.2147, + "step": 9427 + }, + { + "epoch": 3.8340788938592922, + "grad_norm": 0.15763452357339153, + "learning_rate": 1.6539762315385064e-05, + "loss": 0.0026, + "step": 9428 + }, + { + "epoch": 3.834485563237088, + "grad_norm": 15.169670590602918, + "learning_rate": 1.6538995789651827e-05, + "loss": 0.7382, + "step": 9429 + }, + { + "epoch": 3.834892232614884, + "grad_norm": 2.0870620758596403, + "learning_rate": 1.6538229196792506e-05, + "loss": 0.0347, + "step": 9430 + }, + { + "epoch": 
3.83529890199268, + "grad_norm": 4.327926987213197, + "learning_rate": 1.6537462536814972e-05, + "loss": 0.1289, + "step": 9431 + }, + { + "epoch": 3.8357055713704757, + "grad_norm": 15.775931922766604, + "learning_rate": 1.6536695809727096e-05, + "loss": 0.425, + "step": 9432 + }, + { + "epoch": 3.8361122407482715, + "grad_norm": 19.415207757116693, + "learning_rate": 1.6535929015536747e-05, + "loss": 0.8935, + "step": 9433 + }, + { + "epoch": 3.8365189101260677, + "grad_norm": 5.91366335979455, + "learning_rate": 1.6535162154251798e-05, + "loss": 0.1711, + "step": 9434 + }, + { + "epoch": 3.8369255795038635, + "grad_norm": 8.301846389199246, + "learning_rate": 1.653439522588012e-05, + "loss": 0.3618, + "step": 9435 + }, + { + "epoch": 3.8373322488816592, + "grad_norm": 3.550626560380955, + "learning_rate": 1.6533628230429588e-05, + "loss": 0.1164, + "step": 9436 + }, + { + "epoch": 3.837738918259455, + "grad_norm": 10.545942533519012, + "learning_rate": 1.653286116790807e-05, + "loss": 0.3376, + "step": 9437 + }, + { + "epoch": 3.8381455876372508, + "grad_norm": 9.477107730038705, + "learning_rate": 1.6532094038323447e-05, + "loss": 0.332, + "step": 9438 + }, + { + "epoch": 3.8385522570150465, + "grad_norm": 8.755316309773187, + "learning_rate": 1.6531326841683592e-05, + "loss": 0.2609, + "step": 9439 + }, + { + "epoch": 3.8389589263928428, + "grad_norm": 8.647652011981098, + "learning_rate": 1.6530559577996376e-05, + "loss": 0.4707, + "step": 9440 + }, + { + "epoch": 3.8393655957706385, + "grad_norm": 5.321911632507104, + "learning_rate": 1.6529792247269683e-05, + "loss": 0.1008, + "step": 9441 + }, + { + "epoch": 3.8397722651484343, + "grad_norm": 8.326883879378249, + "learning_rate": 1.6529024849511385e-05, + "loss": 0.4049, + "step": 9442 + }, + { + "epoch": 3.84017893452623, + "grad_norm": 11.776743901177136, + "learning_rate": 1.652825738472936e-05, + "loss": 0.826, + "step": 9443 + }, + { + "epoch": 3.8405856039040263, + "grad_norm": 2.244582150100734, + "learning_rate": 1.652748985293149e-05, + "loss": 0.0218, + "step": 9444 + }, + { + "epoch": 3.840992273281822, + "grad_norm": 14.132944787557166, + "learning_rate": 1.6526722254125648e-05, + "loss": 0.964, + "step": 9445 + }, + { + "epoch": 3.841398942659618, + "grad_norm": 8.966746773202793, + "learning_rate": 1.652595458831972e-05, + "loss": 0.3597, + "step": 9446 + }, + { + "epoch": 3.8418056120374136, + "grad_norm": 7.392650306581072, + "learning_rate": 1.652518685552158e-05, + "loss": 0.1287, + "step": 9447 + }, + { + "epoch": 3.8422122814152093, + "grad_norm": 7.7646259776637265, + "learning_rate": 1.6524419055739117e-05, + "loss": 0.3492, + "step": 9448 + }, + { + "epoch": 3.842618950793005, + "grad_norm": 18.57736142965918, + "learning_rate": 1.6523651188980205e-05, + "loss": 0.7616, + "step": 9449 + }, + { + "epoch": 3.8430256201708013, + "grad_norm": 7.179591681002641, + "learning_rate": 1.6522883255252732e-05, + "loss": 0.2233, + "step": 9450 + }, + { + "epoch": 3.843432289548597, + "grad_norm": 2.2645237120273034, + "learning_rate": 1.6522115254564578e-05, + "loss": 0.0441, + "step": 9451 + }, + { + "epoch": 3.843838958926393, + "grad_norm": 22.272766452396112, + "learning_rate": 1.6521347186923633e-05, + "loss": 0.622, + "step": 9452 + }, + { + "epoch": 3.8442456283041886, + "grad_norm": 2.122071517114192, + "learning_rate": 1.6520579052337774e-05, + "loss": 0.0429, + "step": 9453 + }, + { + "epoch": 3.844652297681985, + "grad_norm": 3.4674465471407543, + "learning_rate": 1.6519810850814886e-05, + "loss": 0.0691, + 
"step": 9454 + }, + { + "epoch": 3.8450589670597806, + "grad_norm": 20.750764206880405, + "learning_rate": 1.6519042582362867e-05, + "loss": 0.5937, + "step": 9455 + }, + { + "epoch": 3.8454656364375763, + "grad_norm": 0.45699576570601314, + "learning_rate": 1.6518274246989586e-05, + "loss": 0.0077, + "step": 9456 + }, + { + "epoch": 3.845872305815372, + "grad_norm": 8.726132739261848, + "learning_rate": 1.6517505844702942e-05, + "loss": 0.2522, + "step": 9457 + }, + { + "epoch": 3.846278975193168, + "grad_norm": 9.704516122022152, + "learning_rate": 1.6516737375510822e-05, + "loss": 0.3679, + "step": 9458 + }, + { + "epoch": 3.8466856445709636, + "grad_norm": 6.08760527449008, + "learning_rate": 1.651596883942111e-05, + "loss": 0.158, + "step": 9459 + }, + { + "epoch": 3.8470923139487594, + "grad_norm": 1.7345278386544167, + "learning_rate": 1.6515200236441706e-05, + "loss": 0.0227, + "step": 9460 + }, + { + "epoch": 3.8474989833265556, + "grad_norm": 5.759129762840657, + "learning_rate": 1.6514431566580488e-05, + "loss": 0.1089, + "step": 9461 + }, + { + "epoch": 3.8479056527043514, + "grad_norm": 7.593946770288956, + "learning_rate": 1.6513662829845348e-05, + "loss": 0.1338, + "step": 9462 + }, + { + "epoch": 3.848312322082147, + "grad_norm": 14.170337106682636, + "learning_rate": 1.6512894026244187e-05, + "loss": 0.5643, + "step": 9463 + }, + { + "epoch": 3.8487189914599433, + "grad_norm": 8.415087292362443, + "learning_rate": 1.6512125155784886e-05, + "loss": 0.3248, + "step": 9464 + }, + { + "epoch": 3.849125660837739, + "grad_norm": 5.671266409878349, + "learning_rate": 1.6511356218475342e-05, + "loss": 0.268, + "step": 9465 + }, + { + "epoch": 3.849532330215535, + "grad_norm": 8.941173157803483, + "learning_rate": 1.651058721432345e-05, + "loss": 0.3074, + "step": 9466 + }, + { + "epoch": 3.8499389995933306, + "grad_norm": 1.4964965338500542, + "learning_rate": 1.6509818143337106e-05, + "loss": 0.0246, + "step": 9467 + }, + { + "epoch": 3.8503456689711264, + "grad_norm": 5.883619225794127, + "learning_rate": 1.6509049005524205e-05, + "loss": 0.1245, + "step": 9468 + }, + { + "epoch": 3.850752338348922, + "grad_norm": 7.154240057651555, + "learning_rate": 1.650827980089263e-05, + "loss": 0.1674, + "step": 9469 + }, + { + "epoch": 3.851159007726718, + "grad_norm": 6.980166276257931, + "learning_rate": 1.6507510529450298e-05, + "loss": 0.3122, + "step": 9470 + }, + { + "epoch": 3.851565677104514, + "grad_norm": 8.53396402312515, + "learning_rate": 1.6506741191205092e-05, + "loss": 0.2653, + "step": 9471 + }, + { + "epoch": 3.85197234648231, + "grad_norm": 1.4326849045313466, + "learning_rate": 1.650597178616491e-05, + "loss": 0.0231, + "step": 9472 + }, + { + "epoch": 3.8523790158601057, + "grad_norm": 1.3336948600555891, + "learning_rate": 1.6505202314337655e-05, + "loss": 0.0249, + "step": 9473 + }, + { + "epoch": 3.8527856852379014, + "grad_norm": 11.78960862974003, + "learning_rate": 1.6504432775731226e-05, + "loss": 0.6372, + "step": 9474 + }, + { + "epoch": 3.8531923546156976, + "grad_norm": 3.8997548524652705, + "learning_rate": 1.650366317035352e-05, + "loss": 0.1416, + "step": 9475 + }, + { + "epoch": 3.8535990239934934, + "grad_norm": 0.3822768461988308, + "learning_rate": 1.6502893498212435e-05, + "loss": 0.0049, + "step": 9476 + }, + { + "epoch": 3.854005693371289, + "grad_norm": 6.958204527399812, + "learning_rate": 1.650212375931588e-05, + "loss": 0.0259, + "step": 9477 + }, + { + "epoch": 3.854412362749085, + "grad_norm": 19.794965868366063, + "learning_rate": 
1.6501353953671748e-05, + "loss": 0.3838, + "step": 9478 + }, + { + "epoch": 3.8548190321268807, + "grad_norm": 3.2374869742646957, + "learning_rate": 1.6500584081287948e-05, + "loss": 0.0573, + "step": 9479 + }, + { + "epoch": 3.8552257015046765, + "grad_norm": 7.8250074118566815, + "learning_rate": 1.6499814142172382e-05, + "loss": 0.2459, + "step": 9480 + }, + { + "epoch": 3.8556323708824727, + "grad_norm": 2.296791758736213, + "learning_rate": 1.649904413633295e-05, + "loss": 0.0566, + "step": 9481 + }, + { + "epoch": 3.8560390402602684, + "grad_norm": 1.43212664862831, + "learning_rate": 1.649827406377756e-05, + "loss": 0.0215, + "step": 9482 + }, + { + "epoch": 3.856445709638064, + "grad_norm": 10.872799951405874, + "learning_rate": 1.6497503924514115e-05, + "loss": 0.7358, + "step": 9483 + }, + { + "epoch": 3.85685237901586, + "grad_norm": 4.191700603868233, + "learning_rate": 1.6496733718550525e-05, + "loss": 0.0738, + "step": 9484 + }, + { + "epoch": 3.857259048393656, + "grad_norm": 0.19241580924196086, + "learning_rate": 1.6495963445894692e-05, + "loss": 0.0041, + "step": 9485 + }, + { + "epoch": 3.857665717771452, + "grad_norm": 6.617831876396141, + "learning_rate": 1.6495193106554524e-05, + "loss": 0.118, + "step": 9486 + }, + { + "epoch": 3.8580723871492477, + "grad_norm": 1.202565651705694, + "learning_rate": 1.649442270053793e-05, + "loss": 0.0249, + "step": 9487 + }, + { + "epoch": 3.8584790565270435, + "grad_norm": 6.920742317625291, + "learning_rate": 1.6493652227852818e-05, + "loss": 0.1783, + "step": 9488 + }, + { + "epoch": 3.8588857259048392, + "grad_norm": 4.620196980585793, + "learning_rate": 1.64928816885071e-05, + "loss": 0.0713, + "step": 9489 + }, + { + "epoch": 3.859292395282635, + "grad_norm": 12.68876622149112, + "learning_rate": 1.649211108250868e-05, + "loss": 0.8915, + "step": 9490 + }, + { + "epoch": 3.8596990646604312, + "grad_norm": 4.221406542743624, + "learning_rate": 1.6491340409865476e-05, + "loss": 0.2051, + "step": 9491 + }, + { + "epoch": 3.860105734038227, + "grad_norm": 2.0199327286550584, + "learning_rate": 1.6490569670585396e-05, + "loss": 0.0332, + "step": 9492 + }, + { + "epoch": 3.8605124034160228, + "grad_norm": 4.100263229825749, + "learning_rate": 1.6489798864676353e-05, + "loss": 0.1071, + "step": 9493 + }, + { + "epoch": 3.8609190727938185, + "grad_norm": 7.50068126671018, + "learning_rate": 1.6489027992146254e-05, + "loss": 0.3381, + "step": 9494 + }, + { + "epoch": 3.8613257421716147, + "grad_norm": 0.1602470495602799, + "learning_rate": 1.6488257053003017e-05, + "loss": 0.0027, + "step": 9495 + }, + { + "epoch": 3.8617324115494105, + "grad_norm": 0.8656661013359045, + "learning_rate": 1.6487486047254558e-05, + "loss": 0.0105, + "step": 9496 + }, + { + "epoch": 3.8621390809272063, + "grad_norm": 17.49250808667384, + "learning_rate": 1.6486714974908787e-05, + "loss": 1.197, + "step": 9497 + }, + { + "epoch": 3.862545750305002, + "grad_norm": 1.1094372080853223, + "learning_rate": 1.6485943835973626e-05, + "loss": 0.0163, + "step": 9498 + }, + { + "epoch": 3.862952419682798, + "grad_norm": 2.2790813948918873, + "learning_rate": 1.6485172630456986e-05, + "loss": 0.0722, + "step": 9499 + }, + { + "epoch": 3.8633590890605936, + "grad_norm": 5.039769269862432, + "learning_rate": 1.6484401358366782e-05, + "loss": 0.107, + "step": 9500 + }, + { + "epoch": 3.8637657584383893, + "grad_norm": 1.9050309868947084, + "learning_rate": 1.648363001971094e-05, + "loss": 0.0462, + "step": 9501 + }, + { + "epoch": 3.8641724278161855, + "grad_norm": 
11.458319616639171, + "learning_rate": 1.6482858614497364e-05, + "loss": 0.2582, + "step": 9502 + }, + { + "epoch": 3.8645790971939813, + "grad_norm": 3.620984477027064, + "learning_rate": 1.648208714273399e-05, + "loss": 0.0732, + "step": 9503 + }, + { + "epoch": 3.864985766571777, + "grad_norm": 11.195174110885615, + "learning_rate": 1.6481315604428724e-05, + "loss": 0.5334, + "step": 9504 + }, + { + "epoch": 3.8653924359495733, + "grad_norm": 2.507349088400623, + "learning_rate": 1.6480543999589497e-05, + "loss": 0.0551, + "step": 9505 + }, + { + "epoch": 3.865799105327369, + "grad_norm": 9.950287627212083, + "learning_rate": 1.6479772328224218e-05, + "loss": 0.5025, + "step": 9506 + }, + { + "epoch": 3.866205774705165, + "grad_norm": 0.9963838133899983, + "learning_rate": 1.6479000590340818e-05, + "loss": 0.0173, + "step": 9507 + }, + { + "epoch": 3.8666124440829606, + "grad_norm": 12.49043876713354, + "learning_rate": 1.6478228785947216e-05, + "loss": 0.4332, + "step": 9508 + }, + { + "epoch": 3.8670191134607563, + "grad_norm": 13.848373535206083, + "learning_rate": 1.6477456915051335e-05, + "loss": 0.577, + "step": 9509 + }, + { + "epoch": 3.867425782838552, + "grad_norm": 5.771539827990508, + "learning_rate": 1.64766849776611e-05, + "loss": 0.2855, + "step": 9510 + }, + { + "epoch": 3.867832452216348, + "grad_norm": 8.42564666477359, + "learning_rate": 1.647591297378443e-05, + "loss": 0.3917, + "step": 9511 + }, + { + "epoch": 3.868239121594144, + "grad_norm": 1.0170681617065378, + "learning_rate": 1.647514090342926e-05, + "loss": 0.0138, + "step": 9512 + }, + { + "epoch": 3.86864579097194, + "grad_norm": 4.0594720819392975, + "learning_rate": 1.6474368766603505e-05, + "loss": 0.0556, + "step": 9513 + }, + { + "epoch": 3.8690524603497356, + "grad_norm": 2.3027801553609732, + "learning_rate": 1.64735965633151e-05, + "loss": 0.0219, + "step": 9514 + }, + { + "epoch": 3.8694591297275314, + "grad_norm": 9.630907450045195, + "learning_rate": 1.647282429357196e-05, + "loss": 0.3188, + "step": 9515 + }, + { + "epoch": 3.8698657991053276, + "grad_norm": 7.560695679741742, + "learning_rate": 1.647205195738203e-05, + "loss": 0.2558, + "step": 9516 + }, + { + "epoch": 3.8702724684831233, + "grad_norm": 9.238218051722459, + "learning_rate": 1.6471279554753226e-05, + "loss": 0.3367, + "step": 9517 + }, + { + "epoch": 3.870679137860919, + "grad_norm": 1.305524077745095, + "learning_rate": 1.647050708569348e-05, + "loss": 0.0205, + "step": 9518 + }, + { + "epoch": 3.871085807238715, + "grad_norm": 0.5017655187830798, + "learning_rate": 1.6469734550210724e-05, + "loss": 0.0086, + "step": 9519 + }, + { + "epoch": 3.8714924766165106, + "grad_norm": 4.10062901296428, + "learning_rate": 1.6468961948312884e-05, + "loss": 0.0781, + "step": 9520 + }, + { + "epoch": 3.8718991459943064, + "grad_norm": 2.0375569345439644, + "learning_rate": 1.6468189280007895e-05, + "loss": 0.03, + "step": 9521 + }, + { + "epoch": 3.8723058153721026, + "grad_norm": 7.439775482139995, + "learning_rate": 1.646741654530369e-05, + "loss": 0.2243, + "step": 9522 + }, + { + "epoch": 3.8727124847498984, + "grad_norm": 7.519972425038122, + "learning_rate": 1.6466643744208196e-05, + "loss": 0.2602, + "step": 9523 + }, + { + "epoch": 3.873119154127694, + "grad_norm": 3.8467916189077727, + "learning_rate": 1.6465870876729352e-05, + "loss": 0.0675, + "step": 9524 + }, + { + "epoch": 3.87352582350549, + "grad_norm": 19.311040221678084, + "learning_rate": 1.6465097942875085e-05, + "loss": 0.5561, + "step": 9525 + }, + { + "epoch": 
3.873932492883286, + "grad_norm": 19.916487599625974, + "learning_rate": 1.6464324942653334e-05, + "loss": 0.5944, + "step": 9526 + }, + { + "epoch": 3.874339162261082, + "grad_norm": 11.89879730635356, + "learning_rate": 1.646355187607204e-05, + "loss": 0.4776, + "step": 9527 + }, + { + "epoch": 3.8747458316388776, + "grad_norm": 8.744286549692351, + "learning_rate": 1.646277874313913e-05, + "loss": 0.5082, + "step": 9528 + }, + { + "epoch": 3.8751525010166734, + "grad_norm": 3.5232446202687338, + "learning_rate": 1.646200554386254e-05, + "loss": 0.0768, + "step": 9529 + }, + { + "epoch": 3.875559170394469, + "grad_norm": 8.798422327961726, + "learning_rate": 1.6461232278250213e-05, + "loss": 0.2296, + "step": 9530 + }, + { + "epoch": 3.875965839772265, + "grad_norm": 3.8354566552170817, + "learning_rate": 1.646045894631009e-05, + "loss": 0.081, + "step": 9531 + }, + { + "epoch": 3.876372509150061, + "grad_norm": 0.1823847338720654, + "learning_rate": 1.64596855480501e-05, + "loss": 0.0031, + "step": 9532 + }, + { + "epoch": 3.876779178527857, + "grad_norm": 0.16444734287682738, + "learning_rate": 1.6458912083478186e-05, + "loss": 0.0031, + "step": 9533 + }, + { + "epoch": 3.8771858479056527, + "grad_norm": 7.823569417730407, + "learning_rate": 1.645813855260229e-05, + "loss": 0.4213, + "step": 9534 + }, + { + "epoch": 3.8775925172834484, + "grad_norm": 6.157453612853693, + "learning_rate": 1.645736495543035e-05, + "loss": 0.2557, + "step": 9535 + }, + { + "epoch": 3.8779991866612447, + "grad_norm": 0.8730787230176189, + "learning_rate": 1.645659129197031e-05, + "loss": 0.0149, + "step": 9536 + }, + { + "epoch": 3.8784058560390404, + "grad_norm": 5.4508827405124975, + "learning_rate": 1.645581756223011e-05, + "loss": 0.3455, + "step": 9537 + }, + { + "epoch": 3.878812525416836, + "grad_norm": 2.1143587962419903, + "learning_rate": 1.6455043766217695e-05, + "loss": 0.0565, + "step": 9538 + }, + { + "epoch": 3.879219194794632, + "grad_norm": 5.278535527615447, + "learning_rate": 1.6454269903941006e-05, + "loss": 0.1041, + "step": 9539 + }, + { + "epoch": 3.8796258641724277, + "grad_norm": 10.703840583461918, + "learning_rate": 1.6453495975407987e-05, + "loss": 0.9337, + "step": 9540 + }, + { + "epoch": 3.8800325335502235, + "grad_norm": 16.822232652838043, + "learning_rate": 1.6452721980626584e-05, + "loss": 0.4173, + "step": 9541 + }, + { + "epoch": 3.8804392029280192, + "grad_norm": 4.672697156489649, + "learning_rate": 1.6451947919604744e-05, + "loss": 0.0523, + "step": 9542 + }, + { + "epoch": 3.8808458723058155, + "grad_norm": 7.682414532974722, + "learning_rate": 1.6451173792350413e-05, + "loss": 0.2075, + "step": 9543 + }, + { + "epoch": 3.8812525416836112, + "grad_norm": 1.70354666813542, + "learning_rate": 1.6450399598871533e-05, + "loss": 0.0279, + "step": 9544 + }, + { + "epoch": 3.881659211061407, + "grad_norm": 1.7556151699214155, + "learning_rate": 1.6449625339176056e-05, + "loss": 0.0406, + "step": 9545 + }, + { + "epoch": 3.882065880439203, + "grad_norm": 6.745783712577357, + "learning_rate": 1.644885101327193e-05, + "loss": 0.3014, + "step": 9546 + }, + { + "epoch": 3.882472549816999, + "grad_norm": 10.112291986032906, + "learning_rate": 1.64480766211671e-05, + "loss": 0.4144, + "step": 9547 + }, + { + "epoch": 3.8828792191947947, + "grad_norm": 7.03884830847557, + "learning_rate": 1.6447302162869518e-05, + "loss": 0.5033, + "step": 9548 + }, + { + "epoch": 3.8832858885725905, + "grad_norm": 7.848994396182368, + "learning_rate": 1.6446527638387136e-05, + "loss": 0.2682, + 
"step": 9549 + }, + { + "epoch": 3.8836925579503863, + "grad_norm": 7.681217200847061, + "learning_rate": 1.6445753047727903e-05, + "loss": 0.2127, + "step": 9550 + }, + { + "epoch": 3.884099227328182, + "grad_norm": 9.055395882493272, + "learning_rate": 1.644497839089977e-05, + "loss": 0.36, + "step": 9551 + }, + { + "epoch": 3.884505896705978, + "grad_norm": 14.389627965751405, + "learning_rate": 1.6444203667910696e-05, + "loss": 0.4076, + "step": 9552 + }, + { + "epoch": 3.884912566083774, + "grad_norm": 10.880284987185405, + "learning_rate": 1.644342887876862e-05, + "loss": 0.829, + "step": 9553 + }, + { + "epoch": 3.8853192354615698, + "grad_norm": 5.6908716049334975, + "learning_rate": 1.6442654023481507e-05, + "loss": 0.1719, + "step": 9554 + }, + { + "epoch": 3.8857259048393655, + "grad_norm": 4.699016056590517, + "learning_rate": 1.644187910205731e-05, + "loss": 0.0599, + "step": 9555 + }, + { + "epoch": 3.8861325742171613, + "grad_norm": 9.8519691951308, + "learning_rate": 1.644110411450398e-05, + "loss": 0.3732, + "step": 9556 + }, + { + "epoch": 3.8865392435949575, + "grad_norm": 8.64143616421806, + "learning_rate": 1.6440329060829473e-05, + "loss": 0.2196, + "step": 9557 + }, + { + "epoch": 3.8869459129727533, + "grad_norm": 0.33034732732881755, + "learning_rate": 1.643955394104175e-05, + "loss": 0.0064, + "step": 9558 + }, + { + "epoch": 3.887352582350549, + "grad_norm": 10.330638379854479, + "learning_rate": 1.643877875514876e-05, + "loss": 0.2978, + "step": 9559 + }, + { + "epoch": 3.887759251728345, + "grad_norm": 6.932770695795658, + "learning_rate": 1.643800350315847e-05, + "loss": 0.1513, + "step": 9560 + }, + { + "epoch": 3.8881659211061406, + "grad_norm": 3.8383215789980825, + "learning_rate": 1.6437228185078833e-05, + "loss": 0.1863, + "step": 9561 + }, + { + "epoch": 3.8885725904839363, + "grad_norm": 6.105853992524702, + "learning_rate": 1.6436452800917807e-05, + "loss": 0.0715, + "step": 9562 + }, + { + "epoch": 3.8889792598617325, + "grad_norm": 10.479749434012481, + "learning_rate": 1.6435677350683356e-05, + "loss": 0.4061, + "step": 9563 + }, + { + "epoch": 3.8893859292395283, + "grad_norm": 7.930726444133855, + "learning_rate": 1.6434901834383435e-05, + "loss": 0.1484, + "step": 9564 + }, + { + "epoch": 3.889792598617324, + "grad_norm": 0.9826790859037342, + "learning_rate": 1.6434126252026014e-05, + "loss": 0.0123, + "step": 9565 + }, + { + "epoch": 3.89019926799512, + "grad_norm": 12.566556785862893, + "learning_rate": 1.6433350603619044e-05, + "loss": 0.5256, + "step": 9566 + }, + { + "epoch": 3.890605937372916, + "grad_norm": 6.550496187160811, + "learning_rate": 1.6432574889170493e-05, + "loss": 0.1094, + "step": 9567 + }, + { + "epoch": 3.891012606750712, + "grad_norm": 7.846637330062569, + "learning_rate": 1.643179910868832e-05, + "loss": 0.2157, + "step": 9568 + }, + { + "epoch": 3.8914192761285076, + "grad_norm": 5.976303721728325, + "learning_rate": 1.6431023262180497e-05, + "loss": 0.1972, + "step": 9569 + }, + { + "epoch": 3.8918259455063033, + "grad_norm": 3.844461400099237, + "learning_rate": 1.6430247349654983e-05, + "loss": 0.0856, + "step": 9570 + }, + { + "epoch": 3.892232614884099, + "grad_norm": 5.396038409548626, + "learning_rate": 1.6429471371119744e-05, + "loss": 0.0766, + "step": 9571 + }, + { + "epoch": 3.892639284261895, + "grad_norm": 3.6886697031739737, + "learning_rate": 1.6428695326582742e-05, + "loss": 0.088, + "step": 9572 + }, + { + "epoch": 3.893045953639691, + "grad_norm": 17.679144746049207, + "learning_rate": 
1.642791921605195e-05, + "loss": 0.6466, + "step": 9573 + }, + { + "epoch": 3.893452623017487, + "grad_norm": 7.560655824422695, + "learning_rate": 1.642714303953533e-05, + "loss": 0.3082, + "step": 9574 + }, + { + "epoch": 3.8938592923952826, + "grad_norm": 0.48235611595595634, + "learning_rate": 1.6426366797040852e-05, + "loss": 0.0067, + "step": 9575 + }, + { + "epoch": 3.8942659617730784, + "grad_norm": 17.38668630292055, + "learning_rate": 1.6425590488576485e-05, + "loss": 0.3008, + "step": 9576 + }, + { + "epoch": 3.8946726311508746, + "grad_norm": 12.930920202292356, + "learning_rate": 1.64248141141502e-05, + "loss": 0.5379, + "step": 9577 + }, + { + "epoch": 3.8950793005286704, + "grad_norm": 11.3674864460939, + "learning_rate": 1.642403767376996e-05, + "loss": 0.2277, + "step": 9578 + }, + { + "epoch": 3.895485969906466, + "grad_norm": 4.4933350511237595, + "learning_rate": 1.6423261167443746e-05, + "loss": 0.1312, + "step": 9579 + }, + { + "epoch": 3.895892639284262, + "grad_norm": 5.291569006859878, + "learning_rate": 1.6422484595179522e-05, + "loss": 0.0775, + "step": 9580 + }, + { + "epoch": 3.8962993086620576, + "grad_norm": 0.14128078709036046, + "learning_rate": 1.642170795698526e-05, + "loss": 0.0021, + "step": 9581 + }, + { + "epoch": 3.8967059780398534, + "grad_norm": 0.3797142660992219, + "learning_rate": 1.6420931252868932e-05, + "loss": 0.0044, + "step": 9582 + }, + { + "epoch": 3.897112647417649, + "grad_norm": 2.468978256516618, + "learning_rate": 1.6420154482838514e-05, + "loss": 0.0441, + "step": 9583 + }, + { + "epoch": 3.8975193167954454, + "grad_norm": 11.605066467648971, + "learning_rate": 1.641937764690198e-05, + "loss": 0.4575, + "step": 9584 + }, + { + "epoch": 3.897925986173241, + "grad_norm": 9.485648672414353, + "learning_rate": 1.6418600745067303e-05, + "loss": 0.3384, + "step": 9585 + }, + { + "epoch": 3.898332655551037, + "grad_norm": 6.402441983883268, + "learning_rate": 1.6417823777342458e-05, + "loss": 0.2018, + "step": 9586 + }, + { + "epoch": 3.898739324928833, + "grad_norm": 5.354220320285754, + "learning_rate": 1.6417046743735424e-05, + "loss": 0.1456, + "step": 9587 + }, + { + "epoch": 3.899145994306629, + "grad_norm": 13.790009004740998, + "learning_rate": 1.6416269644254174e-05, + "loss": 1.223, + "step": 9588 + }, + { + "epoch": 3.8995526636844247, + "grad_norm": 4.749141187755354, + "learning_rate": 1.641549247890669e-05, + "loss": 0.2688, + "step": 9589 + }, + { + "epoch": 3.8999593330622204, + "grad_norm": 10.777411695624268, + "learning_rate": 1.6414715247700943e-05, + "loss": 0.3608, + "step": 9590 + }, + { + "epoch": 3.900366002440016, + "grad_norm": 8.255352659222368, + "learning_rate": 1.6413937950644914e-05, + "loss": 0.3548, + "step": 9591 + }, + { + "epoch": 3.900772671817812, + "grad_norm": 7.941452374537804, + "learning_rate": 1.6413160587746592e-05, + "loss": 0.4532, + "step": 9592 + }, + { + "epoch": 3.9011793411956077, + "grad_norm": 7.7644039185299025, + "learning_rate": 1.641238315901394e-05, + "loss": 0.3659, + "step": 9593 + }, + { + "epoch": 3.901586010573404, + "grad_norm": 2.0566506190911324, + "learning_rate": 1.6411605664454952e-05, + "loss": 0.0407, + "step": 9594 + }, + { + "epoch": 3.9019926799511997, + "grad_norm": 19.641269715458478, + "learning_rate": 1.6410828104077607e-05, + "loss": 0.5471, + "step": 9595 + }, + { + "epoch": 3.9023993493289955, + "grad_norm": 7.0133463851221665, + "learning_rate": 1.641005047788988e-05, + "loss": 0.2201, + "step": 9596 + }, + { + "epoch": 3.9028060187067912, + "grad_norm": 
2.219616173664056, + "learning_rate": 1.6409272785899763e-05, + "loss": 0.0779, + "step": 9597 + }, + { + "epoch": 3.9032126880845874, + "grad_norm": 2.822234001609482, + "learning_rate": 1.6408495028115233e-05, + "loss": 0.036, + "step": 9598 + }, + { + "epoch": 3.903619357462383, + "grad_norm": 0.3886651723446771, + "learning_rate": 1.6407717204544273e-05, + "loss": 0.0072, + "step": 9599 + }, + { + "epoch": 3.904026026840179, + "grad_norm": 1.979150499415054, + "learning_rate": 1.6406939315194875e-05, + "loss": 0.0414, + "step": 9600 + }, + { + "epoch": 3.9044326962179747, + "grad_norm": 5.65213179944745, + "learning_rate": 1.6406161360075023e-05, + "loss": 0.4118, + "step": 9601 + }, + { + "epoch": 3.9048393655957705, + "grad_norm": 1.1388930300883868, + "learning_rate": 1.6405383339192694e-05, + "loss": 0.0275, + "step": 9602 + }, + { + "epoch": 3.9052460349735663, + "grad_norm": 7.558169563352415, + "learning_rate": 1.6404605252555886e-05, + "loss": 0.4882, + "step": 9603 + }, + { + "epoch": 3.9056527043513625, + "grad_norm": 3.0715172425505717, + "learning_rate": 1.6403827100172577e-05, + "loss": 0.0937, + "step": 9604 + }, + { + "epoch": 3.9060593737291582, + "grad_norm": 11.201862447282313, + "learning_rate": 1.6403048882050766e-05, + "loss": 0.6637, + "step": 9605 + }, + { + "epoch": 3.906466043106954, + "grad_norm": 0.2985423071676357, + "learning_rate": 1.6402270598198433e-05, + "loss": 0.0049, + "step": 9606 + }, + { + "epoch": 3.9068727124847498, + "grad_norm": 8.664591768833704, + "learning_rate": 1.640149224862357e-05, + "loss": 0.4177, + "step": 9607 + }, + { + "epoch": 3.907279381862546, + "grad_norm": 1.2339205372649953, + "learning_rate": 1.6400713833334164e-05, + "loss": 0.0222, + "step": 9608 + }, + { + "epoch": 3.9076860512403417, + "grad_norm": 4.182453393233989, + "learning_rate": 1.6399935352338216e-05, + "loss": 0.1349, + "step": 9609 + }, + { + "epoch": 3.9080927206181375, + "grad_norm": 2.7495867346390157, + "learning_rate": 1.6399156805643706e-05, + "loss": 0.0628, + "step": 9610 + }, + { + "epoch": 3.9084993899959333, + "grad_norm": 1.599939550947297, + "learning_rate": 1.6398378193258633e-05, + "loss": 0.0203, + "step": 9611 + }, + { + "epoch": 3.908906059373729, + "grad_norm": 6.991128124935389, + "learning_rate": 1.6397599515190986e-05, + "loss": 0.1847, + "step": 9612 + }, + { + "epoch": 3.909312728751525, + "grad_norm": 0.7481732046158102, + "learning_rate": 1.639682077144876e-05, + "loss": 0.0209, + "step": 9613 + }, + { + "epoch": 3.909719398129321, + "grad_norm": 1.0635778011100876, + "learning_rate": 1.639604196203995e-05, + "loss": 0.0176, + "step": 9614 + }, + { + "epoch": 3.9101260675071168, + "grad_norm": 9.106787558866499, + "learning_rate": 1.6395263086972548e-05, + "loss": 0.2338, + "step": 9615 + }, + { + "epoch": 3.9105327368849125, + "grad_norm": 4.351307616038368, + "learning_rate": 1.6394484146254553e-05, + "loss": 0.1277, + "step": 9616 + }, + { + "epoch": 3.9109394062627083, + "grad_norm": 4.737085908274733, + "learning_rate": 1.639370513989396e-05, + "loss": 0.1731, + "step": 9617 + }, + { + "epoch": 3.9113460756405045, + "grad_norm": 6.917534096562231, + "learning_rate": 1.6392926067898765e-05, + "loss": 0.1305, + "step": 9618 + }, + { + "epoch": 3.9117527450183003, + "grad_norm": 12.971439504934233, + "learning_rate": 1.639214693027697e-05, + "loss": 0.9769, + "step": 9619 + }, + { + "epoch": 3.912159414396096, + "grad_norm": 11.76591047397232, + "learning_rate": 1.6391367727036568e-05, + "loss": 0.9962, + "step": 9620 + }, + { + 
"epoch": 3.912566083773892, + "grad_norm": 6.732607423466046, + "learning_rate": 1.639058845818556e-05, + "loss": 0.222, + "step": 9621 + }, + { + "epoch": 3.9129727531516876, + "grad_norm": 5.618278860463539, + "learning_rate": 1.6389809123731942e-05, + "loss": 0.2629, + "step": 9622 + }, + { + "epoch": 3.9133794225294833, + "grad_norm": 7.070324803968651, + "learning_rate": 1.638902972368372e-05, + "loss": 0.1778, + "step": 9623 + }, + { + "epoch": 3.913786091907279, + "grad_norm": 10.038491580180828, + "learning_rate": 1.6388250258048892e-05, + "loss": 0.4108, + "step": 9624 + }, + { + "epoch": 3.9141927612850753, + "grad_norm": 10.646511592473294, + "learning_rate": 1.638747072683546e-05, + "loss": 0.4466, + "step": 9625 + }, + { + "epoch": 3.914599430662871, + "grad_norm": 4.339570068333941, + "learning_rate": 1.6386691130051425e-05, + "loss": 0.147, + "step": 9626 + }, + { + "epoch": 3.915006100040667, + "grad_norm": 7.037635811357097, + "learning_rate": 1.6385911467704795e-05, + "loss": 0.6239, + "step": 9627 + }, + { + "epoch": 3.915412769418463, + "grad_norm": 2.1748889650147722, + "learning_rate": 1.6385131739803565e-05, + "loss": 0.0358, + "step": 9628 + }, + { + "epoch": 3.915819438796259, + "grad_norm": 1.0322092107202667, + "learning_rate": 1.6384351946355744e-05, + "loss": 0.0204, + "step": 9629 + }, + { + "epoch": 3.9162261081740546, + "grad_norm": 5.857853882681602, + "learning_rate": 1.638357208736934e-05, + "loss": 0.1065, + "step": 9630 + }, + { + "epoch": 3.9166327775518504, + "grad_norm": 16.924553270867907, + "learning_rate": 1.6382792162852353e-05, + "loss": 0.3401, + "step": 9631 + }, + { + "epoch": 3.917039446929646, + "grad_norm": 6.520666046298671, + "learning_rate": 1.6382012172812796e-05, + "loss": 0.3187, + "step": 9632 + }, + { + "epoch": 3.917446116307442, + "grad_norm": 83.49404474825047, + "learning_rate": 1.638123211725867e-05, + "loss": 1.0187, + "step": 9633 + }, + { + "epoch": 3.9178527856852376, + "grad_norm": 10.879216828570028, + "learning_rate": 1.6380451996197986e-05, + "loss": 0.36, + "step": 9634 + }, + { + "epoch": 3.918259455063034, + "grad_norm": 3.161831884021969, + "learning_rate": 1.637967180963875e-05, + "loss": 0.0747, + "step": 9635 + }, + { + "epoch": 3.9186661244408296, + "grad_norm": 7.446084810426097, + "learning_rate": 1.637889155758897e-05, + "loss": 0.2425, + "step": 9636 + }, + { + "epoch": 3.9190727938186254, + "grad_norm": 9.112019832967693, + "learning_rate": 1.637811124005666e-05, + "loss": 0.4215, + "step": 9637 + }, + { + "epoch": 3.919479463196421, + "grad_norm": 3.2962774154923844, + "learning_rate": 1.637733085704983e-05, + "loss": 0.0533, + "step": 9638 + }, + { + "epoch": 3.9198861325742174, + "grad_norm": 7.055659905080989, + "learning_rate": 1.6376550408576484e-05, + "loss": 0.3868, + "step": 9639 + }, + { + "epoch": 3.920292801952013, + "grad_norm": 3.3038577434206498, + "learning_rate": 1.6375769894644644e-05, + "loss": 0.1027, + "step": 9640 + }, + { + "epoch": 3.920699471329809, + "grad_norm": 6.944323266468014, + "learning_rate": 1.6374989315262314e-05, + "loss": 0.1643, + "step": 9641 + }, + { + "epoch": 3.9211061407076047, + "grad_norm": 8.836353358816543, + "learning_rate": 1.6374208670437513e-05, + "loss": 0.4702, + "step": 9642 + }, + { + "epoch": 3.9215128100854004, + "grad_norm": 7.647458344883991, + "learning_rate": 1.637342796017825e-05, + "loss": 0.2432, + "step": 9643 + }, + { + "epoch": 3.921919479463196, + "grad_norm": 8.636405020281803, + "learning_rate": 1.637264718449254e-05, + "loss": 0.2467, 
+ "step": 9644 + }, + { + "epoch": 3.9223261488409924, + "grad_norm": 3.4762306895193036, + "learning_rate": 1.6371866343388402e-05, + "loss": 0.1694, + "step": 9645 + }, + { + "epoch": 3.922732818218788, + "grad_norm": 6.904881847196362, + "learning_rate": 1.6371085436873847e-05, + "loss": 0.4057, + "step": 9646 + }, + { + "epoch": 3.923139487596584, + "grad_norm": 12.421860012759158, + "learning_rate": 1.6370304464956892e-05, + "loss": 0.9867, + "step": 9647 + }, + { + "epoch": 3.9235461569743797, + "grad_norm": 10.840169053179695, + "learning_rate": 1.636952342764556e-05, + "loss": 0.5124, + "step": 9648 + }, + { + "epoch": 3.923952826352176, + "grad_norm": 0.18279352256204218, + "learning_rate": 1.636874232494786e-05, + "loss": 0.0033, + "step": 9649 + }, + { + "epoch": 3.9243594957299717, + "grad_norm": 0.8967308367425676, + "learning_rate": 1.6367961156871816e-05, + "loss": 0.013, + "step": 9650 + }, + { + "epoch": 3.9247661651077674, + "grad_norm": 3.155974487874849, + "learning_rate": 1.6367179923425447e-05, + "loss": 0.0829, + "step": 9651 + }, + { + "epoch": 3.925172834485563, + "grad_norm": 11.70855894294198, + "learning_rate": 1.6366398624616774e-05, + "loss": 0.4455, + "step": 9652 + }, + { + "epoch": 3.925579503863359, + "grad_norm": 6.268627133272473, + "learning_rate": 1.6365617260453813e-05, + "loss": 0.1504, + "step": 9653 + }, + { + "epoch": 3.9259861732411547, + "grad_norm": 0.14257854849758034, + "learning_rate": 1.636483583094458e-05, + "loss": 0.003, + "step": 9654 + }, + { + "epoch": 3.926392842618951, + "grad_norm": 6.398340491031355, + "learning_rate": 1.6364054336097112e-05, + "loss": 0.3729, + "step": 9655 + }, + { + "epoch": 3.9267995119967467, + "grad_norm": 13.404455092012038, + "learning_rate": 1.6363272775919422e-05, + "loss": 0.4211, + "step": 9656 + }, + { + "epoch": 3.9272061813745425, + "grad_norm": 2.8760378598706318, + "learning_rate": 1.6362491150419536e-05, + "loss": 0.0445, + "step": 9657 + }, + { + "epoch": 3.9276128507523382, + "grad_norm": 2.017659626964354, + "learning_rate": 1.6361709459605472e-05, + "loss": 0.0736, + "step": 9658 + }, + { + "epoch": 3.9280195201301344, + "grad_norm": 16.78116337502532, + "learning_rate": 1.6360927703485262e-05, + "loss": 0.6629, + "step": 9659 + }, + { + "epoch": 3.92842618950793, + "grad_norm": 2.4887585321632018, + "learning_rate": 1.6360145882066927e-05, + "loss": 0.0397, + "step": 9660 + }, + { + "epoch": 3.928832858885726, + "grad_norm": 8.323429277768486, + "learning_rate": 1.6359363995358493e-05, + "loss": 0.3516, + "step": 9661 + }, + { + "epoch": 3.9292395282635217, + "grad_norm": 5.549221712505297, + "learning_rate": 1.6358582043367986e-05, + "loss": 0.2348, + "step": 9662 + }, + { + "epoch": 3.9296461976413175, + "grad_norm": 4.68154002316484, + "learning_rate": 1.6357800026103436e-05, + "loss": 0.3079, + "step": 9663 + }, + { + "epoch": 3.9300528670191133, + "grad_norm": 12.76328963588078, + "learning_rate": 1.635701794357287e-05, + "loss": 0.5689, + "step": 9664 + }, + { + "epoch": 3.930459536396909, + "grad_norm": 6.679867352049805, + "learning_rate": 1.635623579578431e-05, + "loss": 0.1614, + "step": 9665 + }, + { + "epoch": 3.9308662057747052, + "grad_norm": 1.421471419647842, + "learning_rate": 1.6355453582745794e-05, + "loss": 0.0336, + "step": 9666 + }, + { + "epoch": 3.931272875152501, + "grad_norm": 8.22388106170213, + "learning_rate": 1.6354671304465348e-05, + "loss": 0.2573, + "step": 9667 + }, + { + "epoch": 3.9316795445302968, + "grad_norm": 0.20725076958352986, + "learning_rate": 
1.6353888960951005e-05, + "loss": 0.003, + "step": 9668 + }, + { + "epoch": 3.932086213908093, + "grad_norm": 10.425789147377953, + "learning_rate": 1.6353106552210788e-05, + "loss": 0.517, + "step": 9669 + }, + { + "epoch": 3.9324928832858888, + "grad_norm": 10.91303924234799, + "learning_rate": 1.635232407825274e-05, + "loss": 0.3473, + "step": 9670 + }, + { + "epoch": 3.9328995526636845, + "grad_norm": 8.987353000970495, + "learning_rate": 1.6351541539084886e-05, + "loss": 0.2758, + "step": 9671 + }, + { + "epoch": 3.9333062220414803, + "grad_norm": 3.3329126260202577, + "learning_rate": 1.635075893471526e-05, + "loss": 0.0694, + "step": 9672 + }, + { + "epoch": 3.933712891419276, + "grad_norm": 10.971884604328523, + "learning_rate": 1.6349976265151903e-05, + "loss": 0.4601, + "step": 9673 + }, + { + "epoch": 3.934119560797072, + "grad_norm": 6.70321149590991, + "learning_rate": 1.634919353040284e-05, + "loss": 0.2187, + "step": 9674 + }, + { + "epoch": 3.9345262301748676, + "grad_norm": 12.972320877376111, + "learning_rate": 1.634841073047611e-05, + "loss": 0.6392, + "step": 9675 + }, + { + "epoch": 3.934932899552664, + "grad_norm": 0.4700038224284125, + "learning_rate": 1.634762786537975e-05, + "loss": 0.0084, + "step": 9676 + }, + { + "epoch": 3.9353395689304596, + "grad_norm": 8.346588730553584, + "learning_rate": 1.6346844935121793e-05, + "loss": 0.1249, + "step": 9677 + }, + { + "epoch": 3.9357462383082553, + "grad_norm": 2.142106934794018, + "learning_rate": 1.634606193971028e-05, + "loss": 0.0404, + "step": 9678 + }, + { + "epoch": 3.936152907686051, + "grad_norm": 29.11616352185552, + "learning_rate": 1.6345278879153247e-05, + "loss": 0.7069, + "step": 9679 + }, + { + "epoch": 3.9365595770638473, + "grad_norm": 4.379710726988125, + "learning_rate": 1.6344495753458733e-05, + "loss": 0.0874, + "step": 9680 + }, + { + "epoch": 3.936966246441643, + "grad_norm": 6.099237197129434, + "learning_rate": 1.6343712562634778e-05, + "loss": 0.2306, + "step": 9681 + }, + { + "epoch": 3.937372915819439, + "grad_norm": 5.015753906263863, + "learning_rate": 1.634292930668942e-05, + "loss": 0.1478, + "step": 9682 + }, + { + "epoch": 3.9377795851972346, + "grad_norm": 0.7593539777668932, + "learning_rate": 1.63421459856307e-05, + "loss": 0.0153, + "step": 9683 + }, + { + "epoch": 3.9381862545750304, + "grad_norm": 8.181128795046881, + "learning_rate": 1.634136259946666e-05, + "loss": 0.1795, + "step": 9684 + }, + { + "epoch": 3.938592923952826, + "grad_norm": 10.337384720197585, + "learning_rate": 1.634057914820534e-05, + "loss": 0.3073, + "step": 9685 + }, + { + "epoch": 3.9389995933306223, + "grad_norm": 3.9976479043462505, + "learning_rate": 1.6339795631854783e-05, + "loss": 0.1027, + "step": 9686 + }, + { + "epoch": 3.939406262708418, + "grad_norm": 0.18497966207507954, + "learning_rate": 1.6339012050423035e-05, + "loss": 0.004, + "step": 9687 + }, + { + "epoch": 3.939812932086214, + "grad_norm": 1.3183731673985029, + "learning_rate": 1.633822840391814e-05, + "loss": 0.0218, + "step": 9688 + }, + { + "epoch": 3.9402196014640096, + "grad_norm": 13.796248441322843, + "learning_rate": 1.6337444692348137e-05, + "loss": 0.7578, + "step": 9689 + }, + { + "epoch": 3.940626270841806, + "grad_norm": 4.597080255553909, + "learning_rate": 1.6336660915721078e-05, + "loss": 0.066, + "step": 9690 + }, + { + "epoch": 3.9410329402196016, + "grad_norm": 2.7763878676140425, + "learning_rate": 1.6335877074045002e-05, + "loss": 0.0343, + "step": 9691 + }, + { + "epoch": 3.9414396095973974, + "grad_norm": 
0.22323909030141253, + "learning_rate": 1.633509316732796e-05, + "loss": 0.0041, + "step": 9692 + }, + { + "epoch": 3.941846278975193, + "grad_norm": 7.25516096295777, + "learning_rate": 1.6334309195578e-05, + "loss": 0.4066, + "step": 9693 + }, + { + "epoch": 3.942252948352989, + "grad_norm": 8.45495182521527, + "learning_rate": 1.6333525158803164e-05, + "loss": 0.2053, + "step": 9694 + }, + { + "epoch": 3.9426596177307847, + "grad_norm": 0.725371619689678, + "learning_rate": 1.6332741057011507e-05, + "loss": 0.0108, + "step": 9695 + }, + { + "epoch": 3.943066287108581, + "grad_norm": 6.290003980927579, + "learning_rate": 1.6331956890211077e-05, + "loss": 0.1456, + "step": 9696 + }, + { + "epoch": 3.9434729564863766, + "grad_norm": 2.434801788147963, + "learning_rate": 1.6331172658409925e-05, + "loss": 0.032, + "step": 9697 + }, + { + "epoch": 3.9438796258641724, + "grad_norm": 12.471393522779435, + "learning_rate": 1.6330388361616094e-05, + "loss": 0.2959, + "step": 9698 + }, + { + "epoch": 3.944286295241968, + "grad_norm": 0.13277754492507743, + "learning_rate": 1.6329603999837642e-05, + "loss": 0.0016, + "step": 9699 + }, + { + "epoch": 3.9446929646197644, + "grad_norm": 12.603251763772068, + "learning_rate": 1.632881957308262e-05, + "loss": 0.4678, + "step": 9700 + }, + { + "epoch": 3.94509963399756, + "grad_norm": 7.8374361304016595, + "learning_rate": 1.6328035081359083e-05, + "loss": 0.122, + "step": 9701 + }, + { + "epoch": 3.945506303375356, + "grad_norm": 9.686587531239917, + "learning_rate": 1.6327250524675076e-05, + "loss": 0.9902, + "step": 9702 + }, + { + "epoch": 3.9459129727531517, + "grad_norm": 2.8271243684037293, + "learning_rate": 1.632646590303866e-05, + "loss": 0.0377, + "step": 9703 + }, + { + "epoch": 3.9463196421309474, + "grad_norm": 4.9388056739317525, + "learning_rate": 1.6325681216457887e-05, + "loss": 0.2514, + "step": 9704 + }, + { + "epoch": 3.946726311508743, + "grad_norm": 5.566649344640633, + "learning_rate": 1.6324896464940812e-05, + "loss": 0.0963, + "step": 9705 + }, + { + "epoch": 3.947132980886539, + "grad_norm": 0.44340255908778686, + "learning_rate": 1.6324111648495494e-05, + "loss": 0.0071, + "step": 9706 + }, + { + "epoch": 3.947539650264335, + "grad_norm": 1.6806458188080284, + "learning_rate": 1.6323326767129986e-05, + "loss": 0.0341, + "step": 9707 + }, + { + "epoch": 3.947946319642131, + "grad_norm": 13.025132754074576, + "learning_rate": 1.6322541820852345e-05, + "loss": 0.9247, + "step": 9708 + }, + { + "epoch": 3.9483529890199267, + "grad_norm": 11.667624054380644, + "learning_rate": 1.6321756809670635e-05, + "loss": 0.4795, + "step": 9709 + }, + { + "epoch": 3.948759658397723, + "grad_norm": 0.09437897945481341, + "learning_rate": 1.6320971733592904e-05, + "loss": 0.0016, + "step": 9710 + }, + { + "epoch": 3.9491663277755187, + "grad_norm": 10.342512233873046, + "learning_rate": 1.632018659262722e-05, + "loss": 0.5172, + "step": 9711 + }, + { + "epoch": 3.9495729971533144, + "grad_norm": 4.5815302351397875, + "learning_rate": 1.631940138678164e-05, + "loss": 0.1159, + "step": 9712 + }, + { + "epoch": 3.94997966653111, + "grad_norm": 13.499838116539479, + "learning_rate": 1.631861611606422e-05, + "loss": 0.4497, + "step": 9713 + }, + { + "epoch": 3.950386335908906, + "grad_norm": 10.133729426846395, + "learning_rate": 1.6317830780483026e-05, + "loss": 0.3104, + "step": 9714 + }, + { + "epoch": 3.9507930052867017, + "grad_norm": 13.132873006269417, + "learning_rate": 1.6317045380046122e-05, + "loss": 0.663, + "step": 9715 + }, + { + 
"epoch": 3.9511996746644975, + "grad_norm": 3.6603197336835813, + "learning_rate": 1.631625991476157e-05, + "loss": 0.1754, + "step": 9716 + }, + { + "epoch": 3.9516063440422937, + "grad_norm": 11.927413412894389, + "learning_rate": 1.6315474384637426e-05, + "loss": 0.655, + "step": 9717 + }, + { + "epoch": 3.9520130134200895, + "grad_norm": 6.333941409113509, + "learning_rate": 1.631468878968176e-05, + "loss": 0.2681, + "step": 9718 + }, + { + "epoch": 3.9524196827978852, + "grad_norm": 0.7443450240420383, + "learning_rate": 1.6313903129902638e-05, + "loss": 0.0082, + "step": 9719 + }, + { + "epoch": 3.952826352175681, + "grad_norm": 9.650189128640799, + "learning_rate": 1.631311740530812e-05, + "loss": 0.3821, + "step": 9720 + }, + { + "epoch": 3.9532330215534772, + "grad_norm": 4.416173790170686, + "learning_rate": 1.6312331615906277e-05, + "loss": 0.2519, + "step": 9721 + }, + { + "epoch": 3.953639690931273, + "grad_norm": 10.536648246385964, + "learning_rate": 1.6311545761705173e-05, + "loss": 0.4921, + "step": 9722 + }, + { + "epoch": 3.9540463603090688, + "grad_norm": 4.80650313176905, + "learning_rate": 1.6310759842712874e-05, + "loss": 0.1066, + "step": 9723 + }, + { + "epoch": 3.9544530296868645, + "grad_norm": 5.699875099981466, + "learning_rate": 1.6309973858937447e-05, + "loss": 0.2853, + "step": 9724 + }, + { + "epoch": 3.9548596990646603, + "grad_norm": 7.919153602863058, + "learning_rate": 1.630918781038697e-05, + "loss": 0.2513, + "step": 9725 + }, + { + "epoch": 3.955266368442456, + "grad_norm": 9.61215024043312, + "learning_rate": 1.6308401697069495e-05, + "loss": 0.2056, + "step": 9726 + }, + { + "epoch": 3.9556730378202523, + "grad_norm": 8.065288603897756, + "learning_rate": 1.630761551899311e-05, + "loss": 0.4597, + "step": 9727 + }, + { + "epoch": 3.956079707198048, + "grad_norm": 5.609869714988996, + "learning_rate": 1.630682927616587e-05, + "loss": 0.1765, + "step": 9728 + }, + { + "epoch": 3.956486376575844, + "grad_norm": 8.52680221169193, + "learning_rate": 1.6306042968595857e-05, + "loss": 0.2378, + "step": 9729 + }, + { + "epoch": 3.9568930459536396, + "grad_norm": 13.146797397476075, + "learning_rate": 1.630525659629114e-05, + "loss": 0.8785, + "step": 9730 + }, + { + "epoch": 3.9572997153314358, + "grad_norm": 3.0851974144024097, + "learning_rate": 1.630447015925979e-05, + "loss": 0.0571, + "step": 9731 + }, + { + "epoch": 3.9577063847092315, + "grad_norm": 1.5797086901116346, + "learning_rate": 1.6303683657509882e-05, + "loss": 0.0378, + "step": 9732 + }, + { + "epoch": 3.9581130540870273, + "grad_norm": 7.7878967183245145, + "learning_rate": 1.6302897091049487e-05, + "loss": 0.2395, + "step": 9733 + }, + { + "epoch": 3.958519723464823, + "grad_norm": 4.415727882671689, + "learning_rate": 1.6302110459886678e-05, + "loss": 0.1562, + "step": 9734 + }, + { + "epoch": 3.958926392842619, + "grad_norm": 0.6333077267959488, + "learning_rate": 1.6301323764029538e-05, + "loss": 0.0125, + "step": 9735 + }, + { + "epoch": 3.9593330622204146, + "grad_norm": 1.3063683109022979, + "learning_rate": 1.6300537003486136e-05, + "loss": 0.0223, + "step": 9736 + }, + { + "epoch": 3.959739731598211, + "grad_norm": 5.53452088203061, + "learning_rate": 1.6299750178264554e-05, + "loss": 0.1401, + "step": 9737 + }, + { + "epoch": 3.9601464009760066, + "grad_norm": 12.21640735770636, + "learning_rate": 1.629896328837286e-05, + "loss": 0.5725, + "step": 9738 + }, + { + "epoch": 3.9605530703538023, + "grad_norm": 10.568910667190185, + "learning_rate": 1.629817633381914e-05, + "loss": 
0.6483, + "step": 9739 + }, + { + "epoch": 3.960959739731598, + "grad_norm": 14.039899033198855, + "learning_rate": 1.6297389314611473e-05, + "loss": 0.6152, + "step": 9740 + }, + { + "epoch": 3.9613664091093943, + "grad_norm": 8.67454316300214, + "learning_rate": 1.6296602230757932e-05, + "loss": 0.2989, + "step": 9741 + }, + { + "epoch": 3.96177307848719, + "grad_norm": 3.275845778959899, + "learning_rate": 1.6295815082266603e-05, + "loss": 0.0975, + "step": 9742 + }, + { + "epoch": 3.962179747864986, + "grad_norm": 8.59084838692615, + "learning_rate": 1.6295027869145562e-05, + "loss": 0.4223, + "step": 9743 + }, + { + "epoch": 3.9625864172427816, + "grad_norm": 5.179246328806217, + "learning_rate": 1.629424059140289e-05, + "loss": 0.1073, + "step": 9744 + }, + { + "epoch": 3.9629930866205774, + "grad_norm": 2.7595661724887917, + "learning_rate": 1.6293453249046674e-05, + "loss": 0.0777, + "step": 9745 + }, + { + "epoch": 3.963399755998373, + "grad_norm": 11.600652716035656, + "learning_rate": 1.629266584208499e-05, + "loss": 0.4564, + "step": 9746 + }, + { + "epoch": 3.9638064253761693, + "grad_norm": 6.366275538622771, + "learning_rate": 1.6291878370525925e-05, + "loss": 0.0783, + "step": 9747 + }, + { + "epoch": 3.964213094753965, + "grad_norm": 11.22966306350547, + "learning_rate": 1.6291090834377563e-05, + "loss": 0.8845, + "step": 9748 + }, + { + "epoch": 3.964619764131761, + "grad_norm": 5.0381109665477055, + "learning_rate": 1.6290303233647988e-05, + "loss": 0.1809, + "step": 9749 + }, + { + "epoch": 3.9650264335095566, + "grad_norm": 8.322126592626516, + "learning_rate": 1.6289515568345282e-05, + "loss": 0.2992, + "step": 9750 + }, + { + "epoch": 3.965433102887353, + "grad_norm": 10.425666195830248, + "learning_rate": 1.6288727838477534e-05, + "loss": 0.3549, + "step": 9751 + }, + { + "epoch": 3.9658397722651486, + "grad_norm": 7.868707107130863, + "learning_rate": 1.628794004405283e-05, + "loss": 0.2101, + "step": 9752 + }, + { + "epoch": 3.9662464416429444, + "grad_norm": 0.5005815946057547, + "learning_rate": 1.6287152185079255e-05, + "loss": 0.005, + "step": 9753 + }, + { + "epoch": 3.96665311102074, + "grad_norm": 3.988744243331996, + "learning_rate": 1.62863642615649e-05, + "loss": 0.0882, + "step": 9754 + }, + { + "epoch": 3.967059780398536, + "grad_norm": 9.793784016307896, + "learning_rate": 1.628557627351785e-05, + "loss": 0.3456, + "step": 9755 + }, + { + "epoch": 3.9674664497763317, + "grad_norm": 0.5114584716145003, + "learning_rate": 1.62847882209462e-05, + "loss": 0.0075, + "step": 9756 + }, + { + "epoch": 3.9678731191541274, + "grad_norm": 3.024150219802729, + "learning_rate": 1.6284000103858035e-05, + "loss": 0.0586, + "step": 9757 + }, + { + "epoch": 3.9682797885319236, + "grad_norm": 10.780644406020276, + "learning_rate": 1.6283211922261447e-05, + "loss": 0.3822, + "step": 9758 + }, + { + "epoch": 3.9686864579097194, + "grad_norm": 0.4571516919190603, + "learning_rate": 1.6282423676164526e-05, + "loss": 0.0077, + "step": 9759 + }, + { + "epoch": 3.969093127287515, + "grad_norm": 11.573648617591045, + "learning_rate": 1.628163536557536e-05, + "loss": 0.6469, + "step": 9760 + }, + { + "epoch": 3.969499796665311, + "grad_norm": 14.545793315575466, + "learning_rate": 1.628084699050205e-05, + "loss": 0.7915, + "step": 9761 + }, + { + "epoch": 3.969906466043107, + "grad_norm": 5.958028570609359, + "learning_rate": 1.6280058550952684e-05, + "loss": 0.1352, + "step": 9762 + }, + { + "epoch": 3.970313135420903, + "grad_norm": 8.631277967770412, + "learning_rate": 
1.6279270046935352e-05, + "loss": 0.5997, + "step": 9763 + }, + { + "epoch": 3.9707198047986987, + "grad_norm": 10.83409307819723, + "learning_rate": 1.627848147845816e-05, + "loss": 0.3898, + "step": 9764 + }, + { + "epoch": 3.9711264741764944, + "grad_norm": 9.773980346685931, + "learning_rate": 1.627769284552919e-05, + "loss": 0.2814, + "step": 9765 + }, + { + "epoch": 3.97153314355429, + "grad_norm": 4.751427504438641, + "learning_rate": 1.6276904148156546e-05, + "loss": 0.0882, + "step": 9766 + }, + { + "epoch": 3.971939812932086, + "grad_norm": 10.199847431285777, + "learning_rate": 1.6276115386348318e-05, + "loss": 0.614, + "step": 9767 + }, + { + "epoch": 3.972346482309882, + "grad_norm": 7.468272472098254, + "learning_rate": 1.627532656011261e-05, + "loss": 0.1921, + "step": 9768 + }, + { + "epoch": 3.972753151687678, + "grad_norm": 2.517718809217945, + "learning_rate": 1.6274537669457517e-05, + "loss": 0.0337, + "step": 9769 + }, + { + "epoch": 3.9731598210654737, + "grad_norm": 4.699717733866407, + "learning_rate": 1.6273748714391136e-05, + "loss": 0.1033, + "step": 9770 + }, + { + "epoch": 3.9735664904432695, + "grad_norm": 9.278256397631438, + "learning_rate": 1.6272959694921567e-05, + "loss": 0.5068, + "step": 9771 + }, + { + "epoch": 3.9739731598210657, + "grad_norm": 7.049424355516584, + "learning_rate": 1.627217061105691e-05, + "loss": 0.4203, + "step": 9772 + }, + { + "epoch": 3.9743798291988615, + "grad_norm": 0.5829153498633933, + "learning_rate": 1.6271381462805266e-05, + "loss": 0.0092, + "step": 9773 + }, + { + "epoch": 3.9747864985766572, + "grad_norm": 14.646238161411727, + "learning_rate": 1.6270592250174733e-05, + "loss": 0.6265, + "step": 9774 + }, + { + "epoch": 3.975193167954453, + "grad_norm": 0.1113234643314483, + "learning_rate": 1.6269802973173416e-05, + "loss": 0.0019, + "step": 9775 + }, + { + "epoch": 3.9755998373322488, + "grad_norm": 18.30617398711521, + "learning_rate": 1.6269013631809412e-05, + "loss": 0.6547, + "step": 9776 + }, + { + "epoch": 3.9760065067100445, + "grad_norm": 0.7355291923534071, + "learning_rate": 1.6268224226090834e-05, + "loss": 0.0101, + "step": 9777 + }, + { + "epoch": 3.9764131760878407, + "grad_norm": 7.612083703470809, + "learning_rate": 1.626743475602578e-05, + "loss": 0.3122, + "step": 9778 + }, + { + "epoch": 3.9768198454656365, + "grad_norm": 11.116559819108053, + "learning_rate": 1.626664522162235e-05, + "loss": 0.3265, + "step": 9779 + }, + { + "epoch": 3.9772265148434323, + "grad_norm": 0.19751968489841693, + "learning_rate": 1.6265855622888652e-05, + "loss": 0.0031, + "step": 9780 + }, + { + "epoch": 3.977633184221228, + "grad_norm": 2.561865402992509, + "learning_rate": 1.6265065959832795e-05, + "loss": 0.0883, + "step": 9781 + }, + { + "epoch": 3.9780398535990242, + "grad_norm": 12.447026843341495, + "learning_rate": 1.626427623246288e-05, + "loss": 0.5538, + "step": 9782 + }, + { + "epoch": 3.97844652297682, + "grad_norm": 6.106006115746987, + "learning_rate": 1.6263486440787017e-05, + "loss": 0.104, + "step": 9783 + }, + { + "epoch": 3.9788531923546158, + "grad_norm": 4.517435616370118, + "learning_rate": 1.6262696584813317e-05, + "loss": 0.1094, + "step": 9784 + }, + { + "epoch": 3.9792598617324115, + "grad_norm": 2.854598186203805, + "learning_rate": 1.626190666454988e-05, + "loss": 0.0558, + "step": 9785 + }, + { + "epoch": 3.9796665311102073, + "grad_norm": 7.532745049207673, + "learning_rate": 1.6261116680004823e-05, + "loss": 0.3778, + "step": 9786 + }, + { + "epoch": 3.980073200488003, + "grad_norm": 
5.756227633065644, + "learning_rate": 1.626032663118625e-05, + "loss": 0.1174, + "step": 9787 + }, + { + "epoch": 3.9804798698657993, + "grad_norm": 8.916919037424137, + "learning_rate": 1.6259536518102275e-05, + "loss": 0.3978, + "step": 9788 + }, + { + "epoch": 3.980886539243595, + "grad_norm": 0.6111768950889973, + "learning_rate": 1.6258746340761006e-05, + "loss": 0.007, + "step": 9789 + }, + { + "epoch": 3.981293208621391, + "grad_norm": 4.093856705483397, + "learning_rate": 1.625795609917056e-05, + "loss": 0.0893, + "step": 9790 + }, + { + "epoch": 3.9816998779991866, + "grad_norm": 6.177090490442136, + "learning_rate": 1.625716579333904e-05, + "loss": 0.1537, + "step": 9791 + }, + { + "epoch": 3.9821065473769828, + "grad_norm": 117.43690079009072, + "learning_rate": 1.6256375423274565e-05, + "loss": 0.9399, + "step": 9792 + }, + { + "epoch": 3.9825132167547785, + "grad_norm": 8.289880471875765, + "learning_rate": 1.6255584988985248e-05, + "loss": 0.4059, + "step": 9793 + }, + { + "epoch": 3.9829198861325743, + "grad_norm": 1.9323778142767851, + "learning_rate": 1.62547944904792e-05, + "loss": 0.0341, + "step": 9794 + }, + { + "epoch": 3.98332655551037, + "grad_norm": 17.37986064750217, + "learning_rate": 1.6254003927764547e-05, + "loss": 0.3799, + "step": 9795 + }, + { + "epoch": 3.983733224888166, + "grad_norm": 6.4273587762057955, + "learning_rate": 1.625321330084939e-05, + "loss": 0.1338, + "step": 9796 + }, + { + "epoch": 3.9841398942659616, + "grad_norm": 6.760569220750853, + "learning_rate": 1.625242260974185e-05, + "loss": 0.1564, + "step": 9797 + }, + { + "epoch": 3.9845465636437574, + "grad_norm": 0.1425157550508289, + "learning_rate": 1.6251631854450046e-05, + "loss": 0.0018, + "step": 9798 + }, + { + "epoch": 3.9849532330215536, + "grad_norm": 5.692627148547044, + "learning_rate": 1.6250841034982097e-05, + "loss": 0.1013, + "step": 9799 + }, + { + "epoch": 3.9853599023993493, + "grad_norm": 11.8282793862032, + "learning_rate": 1.6250050151346118e-05, + "loss": 0.4186, + "step": 9800 + }, + { + "epoch": 3.985766571777145, + "grad_norm": 7.7413471426975295, + "learning_rate": 1.6249259203550228e-05, + "loss": 0.3083, + "step": 9801 + }, + { + "epoch": 3.986173241154941, + "grad_norm": 10.327361828130202, + "learning_rate": 1.6248468191602547e-05, + "loss": 0.3588, + "step": 9802 + }, + { + "epoch": 3.986579910532737, + "grad_norm": 4.700887963981864, + "learning_rate": 1.6247677115511195e-05, + "loss": 0.1826, + "step": 9803 + }, + { + "epoch": 3.986986579910533, + "grad_norm": 8.24232925210093, + "learning_rate": 1.6246885975284294e-05, + "loss": 0.359, + "step": 9804 + }, + { + "epoch": 3.9873932492883286, + "grad_norm": 11.315368162458512, + "learning_rate": 1.6246094770929965e-05, + "loss": 0.6035, + "step": 9805 + }, + { + "epoch": 3.9877999186661244, + "grad_norm": 0.2915791084744203, + "learning_rate": 1.6245303502456326e-05, + "loss": 0.0052, + "step": 9806 + }, + { + "epoch": 3.98820658804392, + "grad_norm": 7.37209108147195, + "learning_rate": 1.624451216987151e-05, + "loss": 0.1605, + "step": 9807 + }, + { + "epoch": 3.988613257421716, + "grad_norm": 10.727126954765973, + "learning_rate": 1.6243720773183627e-05, + "loss": 0.3384, + "step": 9808 + }, + { + "epoch": 3.989019926799512, + "grad_norm": 11.098611609274618, + "learning_rate": 1.6242929312400814e-05, + "loss": 0.4344, + "step": 9809 + }, + { + "epoch": 3.989426596177308, + "grad_norm": 4.84765152485269, + "learning_rate": 1.6242137787531183e-05, + "loss": 0.0876, + "step": 9810 + }, + { + "epoch": 
3.9898332655551036, + "grad_norm": 5.108312103265355, + "learning_rate": 1.624134619858287e-05, + "loss": 0.3443, + "step": 9811 + }, + { + "epoch": 3.9902399349328994, + "grad_norm": 14.086470546781527, + "learning_rate": 1.6240554545563997e-05, + "loss": 0.4807, + "step": 9812 + }, + { + "epoch": 3.9906466043106956, + "grad_norm": 4.263560195047578, + "learning_rate": 1.6239762828482688e-05, + "loss": 0.0825, + "step": 9813 + }, + { + "epoch": 3.9910532736884914, + "grad_norm": 3.2818227459308624, + "learning_rate": 1.6238971047347074e-05, + "loss": 0.1029, + "step": 9814 + }, + { + "epoch": 3.991459943066287, + "grad_norm": 3.7312512129050277, + "learning_rate": 1.6238179202165283e-05, + "loss": 0.0728, + "step": 9815 + }, + { + "epoch": 3.991866612444083, + "grad_norm": 14.826620828069785, + "learning_rate": 1.6237387292945443e-05, + "loss": 0.3623, + "step": 9816 + }, + { + "epoch": 3.9922732818218787, + "grad_norm": 3.7307519448223725, + "learning_rate": 1.6236595319695685e-05, + "loss": 0.0561, + "step": 9817 + }, + { + "epoch": 3.9926799511996744, + "grad_norm": 0.6240448756468984, + "learning_rate": 1.6235803282424135e-05, + "loss": 0.0256, + "step": 9818 + }, + { + "epoch": 3.9930866205774707, + "grad_norm": 7.138559927744263, + "learning_rate": 1.6235011181138927e-05, + "loss": 0.2809, + "step": 9819 + }, + { + "epoch": 3.9934932899552664, + "grad_norm": 10.650936605816339, + "learning_rate": 1.623421901584819e-05, + "loss": 0.3309, + "step": 9820 + }, + { + "epoch": 3.993899959333062, + "grad_norm": 11.87307727378095, + "learning_rate": 1.6233426786560057e-05, + "loss": 0.4429, + "step": 9821 + }, + { + "epoch": 3.994306628710858, + "grad_norm": 5.243918695184786, + "learning_rate": 1.6232634493282665e-05, + "loss": 0.0962, + "step": 9822 + }, + { + "epoch": 3.994713298088654, + "grad_norm": 2.6302086221822454, + "learning_rate": 1.6231842136024138e-05, + "loss": 0.0474, + "step": 9823 + }, + { + "epoch": 3.99511996746645, + "grad_norm": 8.30325962278263, + "learning_rate": 1.6231049714792617e-05, + "loss": 0.4187, + "step": 9824 + }, + { + "epoch": 3.9955266368442457, + "grad_norm": 11.038208451928892, + "learning_rate": 1.6230257229596234e-05, + "loss": 0.4412, + "step": 9825 + }, + { + "epoch": 3.9959333062220415, + "grad_norm": 4.648851849467225, + "learning_rate": 1.6229464680443124e-05, + "loss": 0.1684, + "step": 9826 + }, + { + "epoch": 3.9963399755998372, + "grad_norm": 9.637693845750915, + "learning_rate": 1.6228672067341426e-05, + "loss": 0.3375, + "step": 9827 + }, + { + "epoch": 3.996746644977633, + "grad_norm": 1.4406059128899191, + "learning_rate": 1.6227879390299274e-05, + "loss": 0.0304, + "step": 9828 + }, + { + "epoch": 3.997153314355429, + "grad_norm": 10.820889786656796, + "learning_rate": 1.6227086649324806e-05, + "loss": 0.3735, + "step": 9829 + }, + { + "epoch": 3.997559983733225, + "grad_norm": 1.1616601664805488, + "learning_rate": 1.622629384442616e-05, + "loss": 0.0207, + "step": 9830 + }, + { + "epoch": 3.9979666531110207, + "grad_norm": 10.346065863619478, + "learning_rate": 1.6225500975611474e-05, + "loss": 0.2984, + "step": 9831 + }, + { + "epoch": 3.9983733224888165, + "grad_norm": 3.2356805035607747, + "learning_rate": 1.6224708042888882e-05, + "loss": 0.055, + "step": 9832 + }, + { + "epoch": 3.9987799918666127, + "grad_norm": 9.21506259503912, + "learning_rate": 1.6223915046266535e-05, + "loss": 0.3406, + "step": 9833 + }, + { + "epoch": 3.9991866612444085, + "grad_norm": 22.176060076948463, + "learning_rate": 1.6223121985752564e-05, + 
"loss": 0.2998, + "step": 9834 + }, + { + "epoch": 3.9995933306222042, + "grad_norm": 8.816925928532422, + "learning_rate": 1.6222328861355118e-05, + "loss": 0.448, + "step": 9835 + }, + { + "epoch": 4.0, + "grad_norm": 11.877509254073596, + "learning_rate": 1.622153567308233e-05, + "loss": 0.9007, + "step": 9836 + }, + { + "epoch": 4.000406669377796, + "grad_norm": 9.821638730394024, + "learning_rate": 1.622074242094235e-05, + "loss": 0.4578, + "step": 9837 + }, + { + "epoch": 4.0008133387555915, + "grad_norm": 3.788250667849451, + "learning_rate": 1.6219949104943317e-05, + "loss": 0.0583, + "step": 9838 + }, + { + "epoch": 4.001220008133387, + "grad_norm": 3.1922120244148546, + "learning_rate": 1.621915572509338e-05, + "loss": 0.0583, + "step": 9839 + }, + { + "epoch": 4.001626677511183, + "grad_norm": 9.940228704531922, + "learning_rate": 1.6218362281400676e-05, + "loss": 0.4298, + "step": 9840 + }, + { + "epoch": 4.002033346888979, + "grad_norm": 6.613165349953047, + "learning_rate": 1.621756877387335e-05, + "loss": 0.2217, + "step": 9841 + }, + { + "epoch": 4.0024400162667755, + "grad_norm": 5.824369184783811, + "learning_rate": 1.6216775202519556e-05, + "loss": 0.2051, + "step": 9842 + }, + { + "epoch": 4.002846685644571, + "grad_norm": 7.912102639392716, + "learning_rate": 1.6215981567347435e-05, + "loss": 0.5049, + "step": 9843 + }, + { + "epoch": 4.003253355022367, + "grad_norm": 11.128806457025247, + "learning_rate": 1.6215187868365135e-05, + "loss": 0.7172, + "step": 9844 + }, + { + "epoch": 4.003660024400163, + "grad_norm": 3.401118145487956, + "learning_rate": 1.6214394105580802e-05, + "loss": 0.0652, + "step": 9845 + }, + { + "epoch": 4.0040666937779585, + "grad_norm": 0.30271971094236827, + "learning_rate": 1.6213600279002587e-05, + "loss": 0.0055, + "step": 9846 + }, + { + "epoch": 4.004473363155754, + "grad_norm": 8.084969357654218, + "learning_rate": 1.6212806388638636e-05, + "loss": 0.355, + "step": 9847 + }, + { + "epoch": 4.00488003253355, + "grad_norm": 9.069291372550142, + "learning_rate": 1.6212012434497103e-05, + "loss": 0.5377, + "step": 9848 + }, + { + "epoch": 4.005286701911346, + "grad_norm": 5.277641241544494, + "learning_rate": 1.6211218416586136e-05, + "loss": 0.0995, + "step": 9849 + }, + { + "epoch": 4.005693371289142, + "grad_norm": 8.901603464264513, + "learning_rate": 1.6210424334913882e-05, + "loss": 0.2561, + "step": 9850 + }, + { + "epoch": 4.006100040666937, + "grad_norm": 4.46856704061811, + "learning_rate": 1.62096301894885e-05, + "loss": 0.1964, + "step": 9851 + }, + { + "epoch": 4.006506710044734, + "grad_norm": 4.534218606983624, + "learning_rate": 1.620883598031814e-05, + "loss": 0.0803, + "step": 9852 + }, + { + "epoch": 4.00691337942253, + "grad_norm": 2.3054692574112456, + "learning_rate": 1.6208041707410954e-05, + "loss": 0.0316, + "step": 9853 + }, + { + "epoch": 4.0073200488003256, + "grad_norm": 5.218530297830032, + "learning_rate": 1.620724737077509e-05, + "loss": 0.1146, + "step": 9854 + }, + { + "epoch": 4.007726718178121, + "grad_norm": 0.7374862188196485, + "learning_rate": 1.6206452970418712e-05, + "loss": 0.0146, + "step": 9855 + }, + { + "epoch": 4.008133387555917, + "grad_norm": 4.679470615712598, + "learning_rate": 1.6205658506349972e-05, + "loss": 0.0747, + "step": 9856 + }, + { + "epoch": 4.008540056933713, + "grad_norm": 2.147547271610194, + "learning_rate": 1.6204863978577022e-05, + "loss": 0.0464, + "step": 9857 + }, + { + "epoch": 4.008946726311509, + "grad_norm": 5.3857069360196705, + "learning_rate": 
1.6204069387108017e-05, + "loss": 0.1179, + "step": 9858 + }, + { + "epoch": 4.009353395689304, + "grad_norm": 3.0694592658127755, + "learning_rate": 1.6203274731951124e-05, + "loss": 0.0773, + "step": 9859 + }, + { + "epoch": 4.0097600650671, + "grad_norm": 8.182294951346075, + "learning_rate": 1.620248001311449e-05, + "loss": 0.2336, + "step": 9860 + }, + { + "epoch": 4.010166734444896, + "grad_norm": 6.514517470795888, + "learning_rate": 1.620168523060628e-05, + "loss": 0.3757, + "step": 9861 + }, + { + "epoch": 4.010573403822693, + "grad_norm": 15.284727415663797, + "learning_rate": 1.6200890384434647e-05, + "loss": 0.2814, + "step": 9862 + }, + { + "epoch": 4.010980073200488, + "grad_norm": 4.989930235695698, + "learning_rate": 1.6200095474607753e-05, + "loss": 0.166, + "step": 9863 + }, + { + "epoch": 4.011386742578284, + "grad_norm": 10.812221364592418, + "learning_rate": 1.6199300501133758e-05, + "loss": 0.3661, + "step": 9864 + }, + { + "epoch": 4.01179341195608, + "grad_norm": 1.2626036222353876, + "learning_rate": 1.6198505464020828e-05, + "loss": 0.0206, + "step": 9865 + }, + { + "epoch": 4.012200081333876, + "grad_norm": 2.0749229153640165, + "learning_rate": 1.6197710363277117e-05, + "loss": 0.0282, + "step": 9866 + }, + { + "epoch": 4.012606750711671, + "grad_norm": 5.836947127158103, + "learning_rate": 1.619691519891079e-05, + "loss": 0.1842, + "step": 9867 + }, + { + "epoch": 4.013013420089467, + "grad_norm": 6.2802889649773554, + "learning_rate": 1.6196119970930006e-05, + "loss": 0.2215, + "step": 9868 + }, + { + "epoch": 4.013420089467263, + "grad_norm": 7.8643642702787755, + "learning_rate": 1.6195324679342937e-05, + "loss": 0.226, + "step": 9869 + }, + { + "epoch": 4.013826758845059, + "grad_norm": 0.9540266361229516, + "learning_rate": 1.619452932415774e-05, + "loss": 0.0157, + "step": 9870 + }, + { + "epoch": 4.0142334282228544, + "grad_norm": 2.104674263654033, + "learning_rate": 1.619373390538258e-05, + "loss": 0.0395, + "step": 9871 + }, + { + "epoch": 4.014640097600651, + "grad_norm": 9.760478186293149, + "learning_rate": 1.6192938423025628e-05, + "loss": 0.3665, + "step": 9872 + }, + { + "epoch": 4.015046766978447, + "grad_norm": 0.06401808868339585, + "learning_rate": 1.6192142877095043e-05, + "loss": 0.0008, + "step": 9873 + }, + { + "epoch": 4.015453436356243, + "grad_norm": 13.728187870061968, + "learning_rate": 1.6191347267599e-05, + "loss": 0.3878, + "step": 9874 + }, + { + "epoch": 4.015860105734038, + "grad_norm": 2.8868118486955274, + "learning_rate": 1.6190551594545656e-05, + "loss": 0.0282, + "step": 9875 + }, + { + "epoch": 4.016266775111834, + "grad_norm": 1.1122390526116164, + "learning_rate": 1.6189755857943185e-05, + "loss": 0.0154, + "step": 9876 + }, + { + "epoch": 4.01667344448963, + "grad_norm": 6.974121694078041, + "learning_rate": 1.6188960057799757e-05, + "loss": 0.4112, + "step": 9877 + }, + { + "epoch": 4.017080113867426, + "grad_norm": 0.058508115311864034, + "learning_rate": 1.6188164194123536e-05, + "loss": 0.0011, + "step": 9878 + }, + { + "epoch": 4.0174867832452215, + "grad_norm": 13.026892800940537, + "learning_rate": 1.6187368266922697e-05, + "loss": 0.4535, + "step": 9879 + }, + { + "epoch": 4.017893452623017, + "grad_norm": 10.978921531892457, + "learning_rate": 1.618657227620541e-05, + "loss": 0.215, + "step": 9880 + }, + { + "epoch": 4.018300122000813, + "grad_norm": 13.560312621554628, + "learning_rate": 1.6185776221979845e-05, + "loss": 0.5769, + "step": 9881 + }, + { + "epoch": 4.018706791378609, + "grad_norm": 
13.640369829377988, + "learning_rate": 1.618498010425417e-05, + "loss": 0.6814, + "step": 9882 + }, + { + "epoch": 4.019113460756405, + "grad_norm": 8.341156592768066, + "learning_rate": 1.6184183923036567e-05, + "loss": 0.4196, + "step": 9883 + }, + { + "epoch": 4.019520130134201, + "grad_norm": 5.027892370538959, + "learning_rate": 1.61833876783352e-05, + "loss": 0.0915, + "step": 9884 + }, + { + "epoch": 4.019926799511997, + "grad_norm": 0.7849252668950389, + "learning_rate": 1.618259137015825e-05, + "loss": 0.0089, + "step": 9885 + }, + { + "epoch": 4.020333468889793, + "grad_norm": 3.9891265263018045, + "learning_rate": 1.618179499851388e-05, + "loss": 0.0824, + "step": 9886 + }, + { + "epoch": 4.0207401382675885, + "grad_norm": 1.7974759143069956, + "learning_rate": 1.6180998563410278e-05, + "loss": 0.0315, + "step": 9887 + }, + { + "epoch": 4.021146807645384, + "grad_norm": 2.2309398837796666, + "learning_rate": 1.6180202064855615e-05, + "loss": 0.0359, + "step": 9888 + }, + { + "epoch": 4.02155347702318, + "grad_norm": 14.470928919952721, + "learning_rate": 1.6179405502858067e-05, + "loss": 0.201, + "step": 9889 + }, + { + "epoch": 4.021960146400976, + "grad_norm": 0.5755122810806772, + "learning_rate": 1.617860887742581e-05, + "loss": 0.0066, + "step": 9890 + }, + { + "epoch": 4.0223668157787715, + "grad_norm": 5.351930467658464, + "learning_rate": 1.617781218856703e-05, + "loss": 0.2077, + "step": 9891 + }, + { + "epoch": 4.022773485156567, + "grad_norm": 3.140920769970986, + "learning_rate": 1.6177015436289892e-05, + "loss": 0.057, + "step": 9892 + }, + { + "epoch": 4.023180154534364, + "grad_norm": 13.18361893658528, + "learning_rate": 1.6176218620602584e-05, + "loss": 0.2665, + "step": 9893 + }, + { + "epoch": 4.02358682391216, + "grad_norm": 5.466901005963262, + "learning_rate": 1.6175421741513284e-05, + "loss": 0.1443, + "step": 9894 + }, + { + "epoch": 4.0239934932899555, + "grad_norm": 10.902723527353295, + "learning_rate": 1.617462479903017e-05, + "loss": 0.558, + "step": 9895 + }, + { + "epoch": 4.024400162667751, + "grad_norm": 3.27408937440907, + "learning_rate": 1.6173827793161426e-05, + "loss": 0.0589, + "step": 9896 + }, + { + "epoch": 4.024806832045547, + "grad_norm": 3.229801918196125, + "learning_rate": 1.6173030723915232e-05, + "loss": 0.081, + "step": 9897 + }, + { + "epoch": 4.025213501423343, + "grad_norm": 0.025764474046666343, + "learning_rate": 1.617223359129977e-05, + "loss": 0.0004, + "step": 9898 + }, + { + "epoch": 4.0256201708011385, + "grad_norm": 9.191196343095768, + "learning_rate": 1.6171436395323223e-05, + "loss": 0.3862, + "step": 9899 + }, + { + "epoch": 4.026026840178934, + "grad_norm": 10.678514245391899, + "learning_rate": 1.617063913599378e-05, + "loss": 0.5851, + "step": 9900 + }, + { + "epoch": 4.02643350955673, + "grad_norm": 1.7705047812206756, + "learning_rate": 1.6169841813319614e-05, + "loss": 0.0428, + "step": 9901 + }, + { + "epoch": 4.026840178934526, + "grad_norm": 6.002642447948786, + "learning_rate": 1.6169044427308926e-05, + "loss": 0.2345, + "step": 9902 + }, + { + "epoch": 4.0272468483123225, + "grad_norm": 10.993512548565517, + "learning_rate": 1.616824697796988e-05, + "loss": 0.2722, + "step": 9903 + }, + { + "epoch": 4.027653517690118, + "grad_norm": 4.638104686619045, + "learning_rate": 1.6167449465310686e-05, + "loss": 0.1585, + "step": 9904 + }, + { + "epoch": 4.028060187067914, + "grad_norm": 7.457343460399276, + "learning_rate": 1.616665188933951e-05, + "loss": 0.1704, + "step": 9905 + }, + { + "epoch": 
4.02846685644571, + "grad_norm": 11.223257746445952, + "learning_rate": 1.6165854250064554e-05, + "loss": 0.2669, + "step": 9906 + }, + { + "epoch": 4.0288735258235056, + "grad_norm": 6.010575755315148, + "learning_rate": 1.6165056547494e-05, + "loss": 0.1408, + "step": 9907 + }, + { + "epoch": 4.029280195201301, + "grad_norm": 4.618767523302326, + "learning_rate": 1.6164258781636036e-05, + "loss": 0.1082, + "step": 9908 + }, + { + "epoch": 4.029686864579097, + "grad_norm": 6.037858688404247, + "learning_rate": 1.616346095249886e-05, + "loss": 0.117, + "step": 9909 + }, + { + "epoch": 4.030093533956893, + "grad_norm": 7.061416826793918, + "learning_rate": 1.6162663060090647e-05, + "loss": 0.1753, + "step": 9910 + }, + { + "epoch": 4.030500203334689, + "grad_norm": 8.18337763049142, + "learning_rate": 1.61618651044196e-05, + "loss": 0.2434, + "step": 9911 + }, + { + "epoch": 4.030906872712484, + "grad_norm": 5.215391847809238, + "learning_rate": 1.6161067085493905e-05, + "loss": 0.1357, + "step": 9912 + }, + { + "epoch": 4.031313542090281, + "grad_norm": 5.382533687560211, + "learning_rate": 1.6160269003321755e-05, + "loss": 0.2144, + "step": 9913 + }, + { + "epoch": 4.031720211468077, + "grad_norm": 12.31809879037638, + "learning_rate": 1.6159470857911345e-05, + "loss": 0.2881, + "step": 9914 + }, + { + "epoch": 4.032126880845873, + "grad_norm": 0.7777014653393822, + "learning_rate": 1.6158672649270864e-05, + "loss": 0.0152, + "step": 9915 + }, + { + "epoch": 4.032533550223668, + "grad_norm": 1.0298915233635122, + "learning_rate": 1.6157874377408512e-05, + "loss": 0.0198, + "step": 9916 + }, + { + "epoch": 4.032940219601464, + "grad_norm": 3.8091218719750968, + "learning_rate": 1.6157076042332477e-05, + "loss": 0.0718, + "step": 9917 + }, + { + "epoch": 4.03334688897926, + "grad_norm": 9.71740902285562, + "learning_rate": 1.615627764405096e-05, + "loss": 0.8561, + "step": 9918 + }, + { + "epoch": 4.033753558357056, + "grad_norm": 11.952380721678159, + "learning_rate": 1.6155479182572155e-05, + "loss": 0.5449, + "step": 9919 + }, + { + "epoch": 4.034160227734851, + "grad_norm": 3.9070161747411567, + "learning_rate": 1.6154680657904258e-05, + "loss": 0.074, + "step": 9920 + }, + { + "epoch": 4.034566897112647, + "grad_norm": 7.875139307107271, + "learning_rate": 1.6153882070055465e-05, + "loss": 0.2112, + "step": 9921 + }, + { + "epoch": 4.034973566490443, + "grad_norm": 8.218407294703319, + "learning_rate": 1.615308341903398e-05, + "loss": 0.2043, + "step": 9922 + }, + { + "epoch": 4.035380235868239, + "grad_norm": 3.4789724857768616, + "learning_rate": 1.615228470484799e-05, + "loss": 0.0368, + "step": 9923 + }, + { + "epoch": 4.035786905246035, + "grad_norm": 1.151041188950061, + "learning_rate": 1.6151485927505704e-05, + "loss": 0.0329, + "step": 9924 + }, + { + "epoch": 4.036193574623831, + "grad_norm": 3.049974452768576, + "learning_rate": 1.615068708701532e-05, + "loss": 0.1474, + "step": 9925 + }, + { + "epoch": 4.036600244001627, + "grad_norm": 7.468267559285856, + "learning_rate": 1.6149888183385035e-05, + "loss": 0.1859, + "step": 9926 + }, + { + "epoch": 4.037006913379423, + "grad_norm": 10.309875757654423, + "learning_rate": 1.6149089216623056e-05, + "loss": 0.1724, + "step": 9927 + }, + { + "epoch": 4.037413582757218, + "grad_norm": 4.020176349032536, + "learning_rate": 1.614829018673758e-05, + "loss": 0.0824, + "step": 9928 + }, + { + "epoch": 4.037820252135014, + "grad_norm": 8.941740708711963, + "learning_rate": 1.614749109373681e-05, + "loss": 0.2056, + "step": 9929 + }, + 
{ + "epoch": 4.03822692151281, + "grad_norm": 0.09020295745699622, + "learning_rate": 1.614669193762895e-05, + "loss": 0.0015, + "step": 9930 + }, + { + "epoch": 4.038633590890606, + "grad_norm": 0.34664631971642623, + "learning_rate": 1.6145892718422204e-05, + "loss": 0.0062, + "step": 9931 + }, + { + "epoch": 4.0390402602684015, + "grad_norm": 1.0719324714929213, + "learning_rate": 1.6145093436124776e-05, + "loss": 0.0143, + "step": 9932 + }, + { + "epoch": 4.039446929646197, + "grad_norm": 14.024924464540847, + "learning_rate": 1.6144294090744873e-05, + "loss": 0.4774, + "step": 9933 + }, + { + "epoch": 4.039853599023994, + "grad_norm": 5.9423334312473575, + "learning_rate": 1.6143494682290696e-05, + "loss": 0.2422, + "step": 9934 + }, + { + "epoch": 4.04026026840179, + "grad_norm": 5.676517701199852, + "learning_rate": 1.6142695210770454e-05, + "loss": 0.119, + "step": 9935 + }, + { + "epoch": 4.040666937779585, + "grad_norm": 0.31794853669240564, + "learning_rate": 1.614189567619236e-05, + "loss": 0.0028, + "step": 9936 + }, + { + "epoch": 4.041073607157381, + "grad_norm": 2.924844780075578, + "learning_rate": 1.614109607856461e-05, + "loss": 0.0414, + "step": 9937 + }, + { + "epoch": 4.041480276535177, + "grad_norm": 6.031469012904676, + "learning_rate": 1.614029641789542e-05, + "loss": 0.1759, + "step": 9938 + }, + { + "epoch": 4.041886945912973, + "grad_norm": 9.744634168588766, + "learning_rate": 1.6139496694192993e-05, + "loss": 0.4025, + "step": 9939 + }, + { + "epoch": 4.0422936152907685, + "grad_norm": 11.755315862508066, + "learning_rate": 1.6138696907465544e-05, + "loss": 0.2249, + "step": 9940 + }, + { + "epoch": 4.042700284668564, + "grad_norm": 4.718315915326741, + "learning_rate": 1.6137897057721286e-05, + "loss": 0.1237, + "step": 9941 + }, + { + "epoch": 4.04310695404636, + "grad_norm": 12.875600119466739, + "learning_rate": 1.6137097144968422e-05, + "loss": 0.7425, + "step": 9942 + }, + { + "epoch": 4.043513623424156, + "grad_norm": 6.803760202500455, + "learning_rate": 1.613629716921517e-05, + "loss": 0.1603, + "step": 9943 + }, + { + "epoch": 4.043920292801952, + "grad_norm": 10.147616525275353, + "learning_rate": 1.6135497130469735e-05, + "loss": 0.375, + "step": 9944 + }, + { + "epoch": 4.044326962179748, + "grad_norm": 4.182768131124719, + "learning_rate": 1.6134697028740334e-05, + "loss": 0.2695, + "step": 9945 + }, + { + "epoch": 4.044733631557544, + "grad_norm": 9.119518087747968, + "learning_rate": 1.6133896864035186e-05, + "loss": 0.258, + "step": 9946 + }, + { + "epoch": 4.04514030093534, + "grad_norm": 0.14505540689657415, + "learning_rate": 1.6133096636362496e-05, + "loss": 0.0019, + "step": 9947 + }, + { + "epoch": 4.0455469703131355, + "grad_norm": 8.398188889117288, + "learning_rate": 1.613229634573048e-05, + "loss": 0.4236, + "step": 9948 + }, + { + "epoch": 4.045953639690931, + "grad_norm": 0.1768273316219848, + "learning_rate": 1.6131495992147363e-05, + "loss": 0.0025, + "step": 9949 + }, + { + "epoch": 4.046360309068727, + "grad_norm": 13.512071967574244, + "learning_rate": 1.6130695575621345e-05, + "loss": 0.466, + "step": 9950 + }, + { + "epoch": 4.046766978446523, + "grad_norm": 2.0697998822231023, + "learning_rate": 1.6129895096160658e-05, + "loss": 0.0317, + "step": 9951 + }, + { + "epoch": 4.0471736478243185, + "grad_norm": 2.027683478945861, + "learning_rate": 1.6129094553773507e-05, + "loss": 0.0334, + "step": 9952 + }, + { + "epoch": 4.047580317202114, + "grad_norm": 0.6471458580713175, + "learning_rate": 1.6128293948468118e-05, + "loss": 
0.0079, + "step": 9953 + }, + { + "epoch": 4.047986986579911, + "grad_norm": 5.773724624585569, + "learning_rate": 1.612749328025271e-05, + "loss": 0.084, + "step": 9954 + }, + { + "epoch": 4.048393655957707, + "grad_norm": 0.7935345950001645, + "learning_rate": 1.61266925491355e-05, + "loss": 0.0145, + "step": 9955 + }, + { + "epoch": 4.0488003253355025, + "grad_norm": 9.648750509221564, + "learning_rate": 1.6125891755124702e-05, + "loss": 0.2679, + "step": 9956 + }, + { + "epoch": 4.049206994713298, + "grad_norm": 0.13556800116131268, + "learning_rate": 1.6125090898228545e-05, + "loss": 0.0017, + "step": 9957 + }, + { + "epoch": 4.049613664091094, + "grad_norm": 2.6267468604160853, + "learning_rate": 1.612428997845525e-05, + "loss": 0.0453, + "step": 9958 + }, + { + "epoch": 4.05002033346889, + "grad_norm": 3.7480512421247947, + "learning_rate": 1.6123488995813034e-05, + "loss": 0.068, + "step": 9959 + }, + { + "epoch": 4.0504270028466856, + "grad_norm": 9.917102127930816, + "learning_rate": 1.612268795031012e-05, + "loss": 0.1002, + "step": 9960 + }, + { + "epoch": 4.050833672224481, + "grad_norm": 0.18489573230666842, + "learning_rate": 1.6121886841954735e-05, + "loss": 0.0023, + "step": 9961 + }, + { + "epoch": 4.051240341602277, + "grad_norm": 0.9895058797441443, + "learning_rate": 1.6121085670755097e-05, + "loss": 0.0143, + "step": 9962 + }, + { + "epoch": 4.051647010980073, + "grad_norm": 6.49011293096057, + "learning_rate": 1.612028443671944e-05, + "loss": 0.3373, + "step": 9963 + }, + { + "epoch": 4.052053680357869, + "grad_norm": 9.662295460186213, + "learning_rate": 1.6119483139855975e-05, + "loss": 0.1849, + "step": 9964 + }, + { + "epoch": 4.052460349735665, + "grad_norm": 1.5589256704110157, + "learning_rate": 1.6118681780172944e-05, + "loss": 0.0273, + "step": 9965 + }, + { + "epoch": 4.052867019113461, + "grad_norm": 2.0879148929820843, + "learning_rate": 1.6117880357678556e-05, + "loss": 0.0371, + "step": 9966 + }, + { + "epoch": 4.053273688491257, + "grad_norm": 10.588580670720196, + "learning_rate": 1.6117078872381054e-05, + "loss": 0.5049, + "step": 9967 + }, + { + "epoch": 4.053680357869053, + "grad_norm": 3.0659159997361747, + "learning_rate": 1.6116277324288655e-05, + "loss": 0.0498, + "step": 9968 + }, + { + "epoch": 4.054087027246848, + "grad_norm": 4.25695094340254, + "learning_rate": 1.6115475713409593e-05, + "loss": 0.0717, + "step": 9969 + }, + { + "epoch": 4.054493696624644, + "grad_norm": 3.114170530788146, + "learning_rate": 1.6114674039752095e-05, + "loss": 0.0256, + "step": 9970 + }, + { + "epoch": 4.05490036600244, + "grad_norm": 14.584889389848517, + "learning_rate": 1.611387230332439e-05, + "loss": 0.4124, + "step": 9971 + }, + { + "epoch": 4.055307035380236, + "grad_norm": 10.972196043432804, + "learning_rate": 1.611307050413471e-05, + "loss": 0.6382, + "step": 9972 + }, + { + "epoch": 4.055713704758031, + "grad_norm": 0.9659654239925253, + "learning_rate": 1.6112268642191284e-05, + "loss": 0.0112, + "step": 9973 + }, + { + "epoch": 4.056120374135827, + "grad_norm": 15.518476278661375, + "learning_rate": 1.6111466717502345e-05, + "loss": 0.6643, + "step": 9974 + }, + { + "epoch": 4.056527043513624, + "grad_norm": 3.9775605692259783, + "learning_rate": 1.611066473007612e-05, + "loss": 0.0991, + "step": 9975 + }, + { + "epoch": 4.05693371289142, + "grad_norm": 2.991085350937087, + "learning_rate": 1.610986267992085e-05, + "loss": 0.0531, + "step": 9976 + }, + { + "epoch": 4.057340382269215, + "grad_norm": 1.3912303084680124, + "learning_rate": 
1.6109060567044762e-05, + "loss": 0.0217, + "step": 9977 + }, + { + "epoch": 4.057747051647011, + "grad_norm": 6.314703714058454, + "learning_rate": 1.61082583914561e-05, + "loss": 0.2274, + "step": 9978 + }, + { + "epoch": 4.058153721024807, + "grad_norm": 4.200442685805309, + "learning_rate": 1.6107456153163084e-05, + "loss": 0.1177, + "step": 9979 + }, + { + "epoch": 4.058560390402603, + "grad_norm": 5.085441156741829, + "learning_rate": 1.6106653852173957e-05, + "loss": 0.1224, + "step": 9980 + }, + { + "epoch": 4.058967059780398, + "grad_norm": 4.3035949465154575, + "learning_rate": 1.610585148849696e-05, + "loss": 0.1762, + "step": 9981 + }, + { + "epoch": 4.059373729158194, + "grad_norm": 8.764759332372376, + "learning_rate": 1.6105049062140316e-05, + "loss": 0.2629, + "step": 9982 + }, + { + "epoch": 4.05978039853599, + "grad_norm": 12.554990637774605, + "learning_rate": 1.6104246573112277e-05, + "loss": 0.9223, + "step": 9983 + }, + { + "epoch": 4.060187067913786, + "grad_norm": 3.8303523991287127, + "learning_rate": 1.6103444021421073e-05, + "loss": 0.0525, + "step": 9984 + }, + { + "epoch": 4.060593737291582, + "grad_norm": 1.9252018803431687, + "learning_rate": 1.6102641407074944e-05, + "loss": 0.0424, + "step": 9985 + }, + { + "epoch": 4.061000406669378, + "grad_norm": 0.054245116954477514, + "learning_rate": 1.6101838730082132e-05, + "loss": 0.0007, + "step": 9986 + }, + { + "epoch": 4.061407076047174, + "grad_norm": 6.297302274710911, + "learning_rate": 1.610103599045087e-05, + "loss": 0.1219, + "step": 9987 + }, + { + "epoch": 4.06181374542497, + "grad_norm": 0.6352918003418351, + "learning_rate": 1.610023318818941e-05, + "loss": 0.0219, + "step": 9988 + }, + { + "epoch": 4.062220414802765, + "grad_norm": 0.24449364308066668, + "learning_rate": 1.6099430323305977e-05, + "loss": 0.0034, + "step": 9989 + }, + { + "epoch": 4.062627084180561, + "grad_norm": 8.046175821536417, + "learning_rate": 1.609862739580883e-05, + "loss": 0.3044, + "step": 9990 + }, + { + "epoch": 4.063033753558357, + "grad_norm": 16.996407950699783, + "learning_rate": 1.6097824405706196e-05, + "loss": 0.318, + "step": 9991 + }, + { + "epoch": 4.063440422936153, + "grad_norm": 11.58196665953305, + "learning_rate": 1.609702135300633e-05, + "loss": 0.536, + "step": 9992 + }, + { + "epoch": 4.0638470923139485, + "grad_norm": 2.8161611063425886, + "learning_rate": 1.6096218237717472e-05, + "loss": 0.0444, + "step": 9993 + }, + { + "epoch": 4.064253761691744, + "grad_norm": 14.264935511440369, + "learning_rate": 1.6095415059847864e-05, + "loss": 0.8444, + "step": 9994 + }, + { + "epoch": 4.064660431069541, + "grad_norm": 6.0937168513979785, + "learning_rate": 1.609461181940575e-05, + "loss": 0.1771, + "step": 9995 + }, + { + "epoch": 4.065067100447337, + "grad_norm": 10.124994893315582, + "learning_rate": 1.6093808516399386e-05, + "loss": 0.4013, + "step": 9996 + }, + { + "epoch": 4.065473769825132, + "grad_norm": 4.010305668496696, + "learning_rate": 1.6093005150837005e-05, + "loss": 0.0447, + "step": 9997 + }, + { + "epoch": 4.065880439202928, + "grad_norm": 3.5864317850110705, + "learning_rate": 1.6092201722726862e-05, + "loss": 0.0671, + "step": 9998 + }, + { + "epoch": 4.066287108580724, + "grad_norm": 3.342665774907072, + "learning_rate": 1.60913982320772e-05, + "loss": 0.0501, + "step": 9999 + }, + { + "epoch": 4.06669377795852, + "grad_norm": 11.413887668568984, + "learning_rate": 1.609059467889627e-05, + "loss": 0.128, + "step": 10000 + }, + { + "epoch": 4.0671004473363155, + "grad_norm": 
6.507432517882747, + "learning_rate": 1.608979106319232e-05, + "loss": 0.1833, + "step": 10001 + }, + { + "epoch": 4.067507116714111, + "grad_norm": 9.979684674151509, + "learning_rate": 1.60889873849736e-05, + "loss": 0.4862, + "step": 10002 + }, + { + "epoch": 4.067913786091907, + "grad_norm": 4.215956075473628, + "learning_rate": 1.608818364424836e-05, + "loss": 0.0884, + "step": 10003 + }, + { + "epoch": 4.068320455469703, + "grad_norm": 0.7311275026042822, + "learning_rate": 1.608737984102485e-05, + "loss": 0.0148, + "step": 10004 + }, + { + "epoch": 4.0687271248474985, + "grad_norm": 3.5046347319455586, + "learning_rate": 1.6086575975311324e-05, + "loss": 0.049, + "step": 10005 + }, + { + "epoch": 4.069133794225295, + "grad_norm": 10.784071705300102, + "learning_rate": 1.6085772047116033e-05, + "loss": 0.3782, + "step": 10006 + }, + { + "epoch": 4.069540463603091, + "grad_norm": 6.018236064028299, + "learning_rate": 1.6084968056447226e-05, + "loss": 0.3498, + "step": 10007 + }, + { + "epoch": 4.069947132980887, + "grad_norm": 0.18414095284690363, + "learning_rate": 1.6084164003313162e-05, + "loss": 0.0021, + "step": 10008 + }, + { + "epoch": 4.0703538023586825, + "grad_norm": 2.2520500941896024, + "learning_rate": 1.6083359887722093e-05, + "loss": 0.037, + "step": 10009 + }, + { + "epoch": 4.070760471736478, + "grad_norm": 0.15273181438755035, + "learning_rate": 1.6082555709682273e-05, + "loss": 0.0022, + "step": 10010 + }, + { + "epoch": 4.071167141114274, + "grad_norm": 7.842573708408878, + "learning_rate": 1.6081751469201953e-05, + "loss": 0.2306, + "step": 10011 + }, + { + "epoch": 4.07157381049207, + "grad_norm": 8.633374618228334, + "learning_rate": 1.60809471662894e-05, + "loss": 0.1475, + "step": 10012 + }, + { + "epoch": 4.0719804798698656, + "grad_norm": 8.823011316552137, + "learning_rate": 1.6080142800952854e-05, + "loss": 0.3889, + "step": 10013 + }, + { + "epoch": 4.072387149247661, + "grad_norm": 9.006918409916695, + "learning_rate": 1.607933837320059e-05, + "loss": 0.4328, + "step": 10014 + }, + { + "epoch": 4.072793818625457, + "grad_norm": 3.465071427558746, + "learning_rate": 1.6078533883040855e-05, + "loss": 0.056, + "step": 10015 + }, + { + "epoch": 4.073200488003254, + "grad_norm": 8.612211211256643, + "learning_rate": 1.6077729330481915e-05, + "loss": 0.2317, + "step": 10016 + }, + { + "epoch": 4.0736071573810495, + "grad_norm": 1.672607931668203, + "learning_rate": 1.6076924715532018e-05, + "loss": 0.0267, + "step": 10017 + }, + { + "epoch": 4.074013826758845, + "grad_norm": 17.89710243791465, + "learning_rate": 1.6076120038199436e-05, + "loss": 0.2192, + "step": 10018 + }, + { + "epoch": 4.074420496136641, + "grad_norm": 1.9891757731932163, + "learning_rate": 1.607531529849242e-05, + "loss": 0.0338, + "step": 10019 + }, + { + "epoch": 4.074827165514437, + "grad_norm": 0.08145492304664988, + "learning_rate": 1.6074510496419235e-05, + "loss": 0.0012, + "step": 10020 + }, + { + "epoch": 4.075233834892233, + "grad_norm": 8.239326398545257, + "learning_rate": 1.6073705631988143e-05, + "loss": 0.2838, + "step": 10021 + }, + { + "epoch": 4.075640504270028, + "grad_norm": 2.126808981963455, + "learning_rate": 1.6072900705207407e-05, + "loss": 0.0343, + "step": 10022 + }, + { + "epoch": 4.076047173647824, + "grad_norm": 0.40191186539805374, + "learning_rate": 1.6072095716085287e-05, + "loss": 0.0046, + "step": 10023 + }, + { + "epoch": 4.07645384302562, + "grad_norm": 5.995538457745625, + "learning_rate": 1.607129066463005e-05, + "loss": 0.1399, + "step": 10024 + }, + 
{ + "epoch": 4.076860512403416, + "grad_norm": 5.03201902824199, + "learning_rate": 1.6070485550849958e-05, + "loss": 0.0578, + "step": 10025 + }, + { + "epoch": 4.077267181781212, + "grad_norm": 8.128281363996537, + "learning_rate": 1.606968037475328e-05, + "loss": 0.2491, + "step": 10026 + }, + { + "epoch": 4.077673851159008, + "grad_norm": 2.5241367531336913, + "learning_rate": 1.6068875136348272e-05, + "loss": 0.0347, + "step": 10027 + }, + { + "epoch": 4.078080520536804, + "grad_norm": 5.247168665254705, + "learning_rate": 1.606806983564321e-05, + "loss": 0.0288, + "step": 10028 + }, + { + "epoch": 4.0784871899146, + "grad_norm": 7.808571846946999, + "learning_rate": 1.606726447264636e-05, + "loss": 0.5992, + "step": 10029 + }, + { + "epoch": 4.078893859292395, + "grad_norm": 0.9779944566504906, + "learning_rate": 1.6066459047365978e-05, + "loss": 0.0156, + "step": 10030 + }, + { + "epoch": 4.079300528670191, + "grad_norm": 6.8422473829388215, + "learning_rate": 1.606565355981035e-05, + "loss": 0.2243, + "step": 10031 + }, + { + "epoch": 4.079707198047987, + "grad_norm": 0.8781740995481385, + "learning_rate": 1.606484800998773e-05, + "loss": 0.0106, + "step": 10032 + }, + { + "epoch": 4.080113867425783, + "grad_norm": 2.2158155044042847, + "learning_rate": 1.606404239790639e-05, + "loss": 0.044, + "step": 10033 + }, + { + "epoch": 4.080520536803578, + "grad_norm": 3.207585933893421, + "learning_rate": 1.606323672357461e-05, + "loss": 0.1094, + "step": 10034 + }, + { + "epoch": 4.080927206181374, + "grad_norm": 9.111555848643857, + "learning_rate": 1.606243098700065e-05, + "loss": 0.2335, + "step": 10035 + }, + { + "epoch": 4.081333875559171, + "grad_norm": 5.94396390231687, + "learning_rate": 1.6061625188192785e-05, + "loss": 0.0522, + "step": 10036 + }, + { + "epoch": 4.081740544936967, + "grad_norm": 5.434248498255015, + "learning_rate": 1.606081932715929e-05, + "loss": 0.1568, + "step": 10037 + }, + { + "epoch": 4.082147214314762, + "grad_norm": 1.9098349812511606, + "learning_rate": 1.6060013403908428e-05, + "loss": 0.0229, + "step": 10038 + }, + { + "epoch": 4.082553883692558, + "grad_norm": 9.308958460650196, + "learning_rate": 1.6059207418448482e-05, + "loss": 0.1826, + "step": 10039 + }, + { + "epoch": 4.082960553070354, + "grad_norm": 11.902447033857486, + "learning_rate": 1.6058401370787722e-05, + "loss": 0.4941, + "step": 10040 + }, + { + "epoch": 4.08336722244815, + "grad_norm": 15.785139804832466, + "learning_rate": 1.6057595260934422e-05, + "loss": 0.7483, + "step": 10041 + }, + { + "epoch": 4.083773891825945, + "grad_norm": 6.139813072896935, + "learning_rate": 1.6056789088896855e-05, + "loss": 0.2521, + "step": 10042 + }, + { + "epoch": 4.084180561203741, + "grad_norm": 16.207103310519294, + "learning_rate": 1.6055982854683306e-05, + "loss": 0.5515, + "step": 10043 + }, + { + "epoch": 4.084587230581537, + "grad_norm": 9.714396300401413, + "learning_rate": 1.6055176558302044e-05, + "loss": 0.4172, + "step": 10044 + }, + { + "epoch": 4.084993899959333, + "grad_norm": 1.2533709936700723, + "learning_rate": 1.6054370199761345e-05, + "loss": 0.0161, + "step": 10045 + }, + { + "epoch": 4.0854005693371285, + "grad_norm": 1.7535954420510385, + "learning_rate": 1.6053563779069484e-05, + "loss": 0.0216, + "step": 10046 + }, + { + "epoch": 4.085807238714925, + "grad_norm": 0.12392157003174833, + "learning_rate": 1.6052757296234753e-05, + "loss": 0.0012, + "step": 10047 + }, + { + "epoch": 4.086213908092721, + "grad_norm": 1.0591639617413382, + "learning_rate": 
1.6051950751265418e-05, + "loss": 0.0183, + "step": 10048 + }, + { + "epoch": 4.086620577470517, + "grad_norm": 8.312587952238163, + "learning_rate": 1.6051144144169764e-05, + "loss": 0.5371, + "step": 10049 + }, + { + "epoch": 4.087027246848312, + "grad_norm": 8.413943801797963, + "learning_rate": 1.605033747495607e-05, + "loss": 0.184, + "step": 10050 + }, + { + "epoch": 4.087433916226108, + "grad_norm": 8.754713521491846, + "learning_rate": 1.6049530743632615e-05, + "loss": 0.3327, + "step": 10051 + }, + { + "epoch": 4.087840585603904, + "grad_norm": 6.312054084945649, + "learning_rate": 1.604872395020768e-05, + "loss": 0.1614, + "step": 10052 + }, + { + "epoch": 4.0882472549817, + "grad_norm": 5.864299053687139, + "learning_rate": 1.6047917094689555e-05, + "loss": 0.5043, + "step": 10053 + }, + { + "epoch": 4.0886539243594955, + "grad_norm": 5.05977892334017, + "learning_rate": 1.6047110177086513e-05, + "loss": 0.0919, + "step": 10054 + }, + { + "epoch": 4.089060593737291, + "grad_norm": 0.2769113879228333, + "learning_rate": 1.6046303197406843e-05, + "loss": 0.0023, + "step": 10055 + }, + { + "epoch": 4.089467263115087, + "grad_norm": 11.046557943675204, + "learning_rate": 1.6045496155658828e-05, + "loss": 1.0333, + "step": 10056 + }, + { + "epoch": 4.089873932492884, + "grad_norm": 1.620568720282964, + "learning_rate": 1.604468905185075e-05, + "loss": 0.0187, + "step": 10057 + }, + { + "epoch": 4.090280601870679, + "grad_norm": 2.1180226628121375, + "learning_rate": 1.60438818859909e-05, + "loss": 0.0256, + "step": 10058 + }, + { + "epoch": 4.090687271248475, + "grad_norm": 7.6252490987954875, + "learning_rate": 1.6043074658087562e-05, + "loss": 0.3559, + "step": 10059 + }, + { + "epoch": 4.091093940626271, + "grad_norm": 13.297512437555895, + "learning_rate": 1.6042267368149017e-05, + "loss": 0.3418, + "step": 10060 + }, + { + "epoch": 4.091500610004067, + "grad_norm": 6.706785236394624, + "learning_rate": 1.604146001618356e-05, + "loss": 0.0221, + "step": 10061 + }, + { + "epoch": 4.0919072793818625, + "grad_norm": 3.7102152769100814, + "learning_rate": 1.6040652602199472e-05, + "loss": 0.0688, + "step": 10062 + }, + { + "epoch": 4.092313948759658, + "grad_norm": 1.9124053509658674, + "learning_rate": 1.6039845126205044e-05, + "loss": 0.0315, + "step": 10063 + }, + { + "epoch": 4.092720618137454, + "grad_norm": 9.760180299719815, + "learning_rate": 1.603903758820857e-05, + "loss": 0.2802, + "step": 10064 + }, + { + "epoch": 4.09312728751525, + "grad_norm": 0.10512964991342477, + "learning_rate": 1.6038229988218335e-05, + "loss": 0.0019, + "step": 10065 + }, + { + "epoch": 4.0935339568930456, + "grad_norm": 7.897925837362782, + "learning_rate": 1.6037422326242627e-05, + "loss": 0.3691, + "step": 10066 + }, + { + "epoch": 4.093940626270842, + "grad_norm": 1.6202307391623532, + "learning_rate": 1.6036614602289743e-05, + "loss": 0.0283, + "step": 10067 + }, + { + "epoch": 4.094347295648638, + "grad_norm": 3.0490866071302833, + "learning_rate": 1.6035806816367972e-05, + "loss": 0.0443, + "step": 10068 + }, + { + "epoch": 4.094753965026434, + "grad_norm": 6.559676853965635, + "learning_rate": 1.6034998968485606e-05, + "loss": 0.1849, + "step": 10069 + }, + { + "epoch": 4.0951606344042295, + "grad_norm": 5.390221526102683, + "learning_rate": 1.6034191058650935e-05, + "loss": 0.128, + "step": 10070 + }, + { + "epoch": 4.095567303782025, + "grad_norm": 1.6798051382198769, + "learning_rate": 1.603338308687226e-05, + "loss": 0.0433, + "step": 10071 + }, + { + "epoch": 4.095973973159821, + 
"grad_norm": 10.99554264354388, + "learning_rate": 1.6032575053157868e-05, + "loss": 0.4024, + "step": 10072 + }, + { + "epoch": 4.096380642537617, + "grad_norm": 1.9709645629602461, + "learning_rate": 1.603176695751606e-05, + "loss": 0.0422, + "step": 10073 + }, + { + "epoch": 4.096787311915413, + "grad_norm": 3.056007204755352, + "learning_rate": 1.603095879995513e-05, + "loss": 0.0457, + "step": 10074 + }, + { + "epoch": 4.097193981293208, + "grad_norm": 0.7928027541698827, + "learning_rate": 1.6030150580483365e-05, + "loss": 0.0123, + "step": 10075 + }, + { + "epoch": 4.097600650671004, + "grad_norm": 7.018632321892383, + "learning_rate": 1.6029342299109076e-05, + "loss": 0.2515, + "step": 10076 + }, + { + "epoch": 4.098007320048801, + "grad_norm": 2.810499735834892, + "learning_rate": 1.6028533955840554e-05, + "loss": 0.1825, + "step": 10077 + }, + { + "epoch": 4.0984139894265965, + "grad_norm": 11.306604784646611, + "learning_rate": 1.6027725550686095e-05, + "loss": 0.5239, + "step": 10078 + }, + { + "epoch": 4.098820658804392, + "grad_norm": 14.365593027270563, + "learning_rate": 1.6026917083653998e-05, + "loss": 0.4114, + "step": 10079 + }, + { + "epoch": 4.099227328182188, + "grad_norm": 9.897953422706957, + "learning_rate": 1.6026108554752564e-05, + "loss": 0.3337, + "step": 10080 + }, + { + "epoch": 4.099633997559984, + "grad_norm": 5.280020970512689, + "learning_rate": 1.6025299963990096e-05, + "loss": 0.1524, + "step": 10081 + }, + { + "epoch": 4.10004066693778, + "grad_norm": 3.6296779655843667, + "learning_rate": 1.602449131137489e-05, + "loss": 0.1653, + "step": 10082 + }, + { + "epoch": 4.100447336315575, + "grad_norm": 11.493044820989116, + "learning_rate": 1.6023682596915246e-05, + "loss": 0.2876, + "step": 10083 + }, + { + "epoch": 4.100854005693371, + "grad_norm": 7.9549392619704955, + "learning_rate": 1.602287382061947e-05, + "loss": 0.1696, + "step": 10084 + }, + { + "epoch": 4.101260675071167, + "grad_norm": 4.739328047941151, + "learning_rate": 1.6022064982495865e-05, + "loss": 0.2093, + "step": 10085 + }, + { + "epoch": 4.101667344448963, + "grad_norm": 0.42299027891830493, + "learning_rate": 1.602125608255273e-05, + "loss": 0.0039, + "step": 10086 + }, + { + "epoch": 4.102074013826758, + "grad_norm": 0.3094900292053531, + "learning_rate": 1.602044712079837e-05, + "loss": 0.0052, + "step": 10087 + }, + { + "epoch": 4.102480683204555, + "grad_norm": 8.101120094157121, + "learning_rate": 1.6019638097241092e-05, + "loss": 0.3849, + "step": 10088 + }, + { + "epoch": 4.102887352582351, + "grad_norm": 6.708925160170298, + "learning_rate": 1.6018829011889198e-05, + "loss": 0.218, + "step": 10089 + }, + { + "epoch": 4.103294021960147, + "grad_norm": 1.763381479903865, + "learning_rate": 1.6018019864750996e-05, + "loss": 0.0298, + "step": 10090 + }, + { + "epoch": 4.103700691337942, + "grad_norm": 8.38790039627216, + "learning_rate": 1.6017210655834792e-05, + "loss": 0.289, + "step": 10091 + }, + { + "epoch": 4.104107360715738, + "grad_norm": 12.21929277863292, + "learning_rate": 1.601640138514889e-05, + "loss": 0.9652, + "step": 10092 + }, + { + "epoch": 4.104514030093534, + "grad_norm": 12.21794498196028, + "learning_rate": 1.60155920527016e-05, + "loss": 0.1307, + "step": 10093 + }, + { + "epoch": 4.10492069947133, + "grad_norm": 6.487665539613569, + "learning_rate": 1.6014782658501237e-05, + "loss": 0.3746, + "step": 10094 + }, + { + "epoch": 4.105327368849125, + "grad_norm": 5.904370349107612, + "learning_rate": 1.6013973202556094e-05, + "loss": 0.1644, + "step": 
10095 + }, + { + "epoch": 4.105734038226921, + "grad_norm": 11.528852952142595, + "learning_rate": 1.6013163684874493e-05, + "loss": 0.508, + "step": 10096 + }, + { + "epoch": 4.106140707604717, + "grad_norm": 8.705542164306163, + "learning_rate": 1.6012354105464737e-05, + "loss": 0.2253, + "step": 10097 + }, + { + "epoch": 4.106547376982514, + "grad_norm": 0.7394558202589527, + "learning_rate": 1.6011544464335145e-05, + "loss": 0.0109, + "step": 10098 + }, + { + "epoch": 4.106954046360309, + "grad_norm": 4.728232969701458, + "learning_rate": 1.6010734761494023e-05, + "loss": 0.1098, + "step": 10099 + }, + { + "epoch": 4.107360715738105, + "grad_norm": 7.372328484254652, + "learning_rate": 1.600992499694968e-05, + "loss": 0.1929, + "step": 10100 + }, + { + "epoch": 4.107767385115901, + "grad_norm": 7.53976584665662, + "learning_rate": 1.6009115170710434e-05, + "loss": 0.1918, + "step": 10101 + }, + { + "epoch": 4.108174054493697, + "grad_norm": 4.81625080005225, + "learning_rate": 1.6008305282784596e-05, + "loss": 0.2335, + "step": 10102 + }, + { + "epoch": 4.108580723871492, + "grad_norm": 7.1083997717792045, + "learning_rate": 1.6007495333180483e-05, + "loss": 0.202, + "step": 10103 + }, + { + "epoch": 4.108987393249288, + "grad_norm": 7.36609265865158, + "learning_rate": 1.6006685321906404e-05, + "loss": 0.2339, + "step": 10104 + }, + { + "epoch": 4.109394062627084, + "grad_norm": 3.5768530823879825, + "learning_rate": 1.6005875248970678e-05, + "loss": 0.0532, + "step": 10105 + }, + { + "epoch": 4.10980073200488, + "grad_norm": 2.1578264841196906, + "learning_rate": 1.600506511438162e-05, + "loss": 0.034, + "step": 10106 + }, + { + "epoch": 4.1102074013826755, + "grad_norm": 12.96346186677751, + "learning_rate": 1.6004254918147546e-05, + "loss": 0.4798, + "step": 10107 + }, + { + "epoch": 4.110614070760472, + "grad_norm": 1.6860082134924474, + "learning_rate": 1.6003444660276775e-05, + "loss": 0.02, + "step": 10108 + }, + { + "epoch": 4.111020740138268, + "grad_norm": 4.358358255964677, + "learning_rate": 1.6002634340777622e-05, + "loss": 0.0973, + "step": 10109 + }, + { + "epoch": 4.111427409516064, + "grad_norm": 0.11042008860260866, + "learning_rate": 1.6001823959658405e-05, + "loss": 0.002, + "step": 10110 + }, + { + "epoch": 4.111834078893859, + "grad_norm": 5.3274889068387195, + "learning_rate": 1.6001013516927447e-05, + "loss": 0.1949, + "step": 10111 + }, + { + "epoch": 4.112240748271655, + "grad_norm": 6.737002928180371, + "learning_rate": 1.6000203012593064e-05, + "loss": 0.1409, + "step": 10112 + }, + { + "epoch": 4.112647417649451, + "grad_norm": 2.06324653648308, + "learning_rate": 1.5999392446663576e-05, + "loss": 0.0376, + "step": 10113 + }, + { + "epoch": 4.113054087027247, + "grad_norm": 11.882691695936654, + "learning_rate": 1.5998581819147306e-05, + "loss": 0.9539, + "step": 10114 + }, + { + "epoch": 4.1134607564050425, + "grad_norm": 2.463663818063875, + "learning_rate": 1.5997771130052573e-05, + "loss": 0.0794, + "step": 10115 + }, + { + "epoch": 4.113867425782838, + "grad_norm": 4.376724526733151, + "learning_rate": 1.5996960379387704e-05, + "loss": 0.0857, + "step": 10116 + }, + { + "epoch": 4.114274095160634, + "grad_norm": 5.459569847216566, + "learning_rate": 1.5996149567161015e-05, + "loss": 0.1271, + "step": 10117 + }, + { + "epoch": 4.114680764538431, + "grad_norm": 7.612478991859463, + "learning_rate": 1.5995338693380835e-05, + "loss": 0.2286, + "step": 10118 + }, + { + "epoch": 4.1150874339162264, + "grad_norm": 7.488946849273051, + "learning_rate": 
1.5994527758055485e-05, + "loss": 0.1998, + "step": 10119 + }, + { + "epoch": 4.115494103294022, + "grad_norm": 2.000803687381153, + "learning_rate": 1.5993716761193293e-05, + "loss": 0.0314, + "step": 10120 + }, + { + "epoch": 4.115900772671818, + "grad_norm": 1.439977554547966, + "learning_rate": 1.5992905702802577e-05, + "loss": 0.0466, + "step": 10121 + }, + { + "epoch": 4.116307442049614, + "grad_norm": 7.265742130450584, + "learning_rate": 1.599209458289167e-05, + "loss": 0.1307, + "step": 10122 + }, + { + "epoch": 4.1167141114274095, + "grad_norm": 1.670597977105642, + "learning_rate": 1.5991283401468898e-05, + "loss": 0.029, + "step": 10123 + }, + { + "epoch": 4.117120780805205, + "grad_norm": 1.0076312293373848, + "learning_rate": 1.5990472158542588e-05, + "loss": 0.0327, + "step": 10124 + }, + { + "epoch": 4.117527450183001, + "grad_norm": 3.4475183928494153, + "learning_rate": 1.5989660854121062e-05, + "loss": 0.0623, + "step": 10125 + }, + { + "epoch": 4.117934119560797, + "grad_norm": 0.8066100665323345, + "learning_rate": 1.5988849488212656e-05, + "loss": 0.012, + "step": 10126 + }, + { + "epoch": 4.118340788938593, + "grad_norm": 0.6795321516193221, + "learning_rate": 1.5988038060825694e-05, + "loss": 0.0094, + "step": 10127 + }, + { + "epoch": 4.118747458316388, + "grad_norm": 4.482821804104517, + "learning_rate": 1.5987226571968508e-05, + "loss": 0.1248, + "step": 10128 + }, + { + "epoch": 4.119154127694185, + "grad_norm": 6.714921483672232, + "learning_rate": 1.5986415021649428e-05, + "loss": 0.3179, + "step": 10129 + }, + { + "epoch": 4.119560797071981, + "grad_norm": 12.986168845872028, + "learning_rate": 1.5985603409876783e-05, + "loss": 0.6615, + "step": 10130 + }, + { + "epoch": 4.1199674664497765, + "grad_norm": 1.1524973546051054, + "learning_rate": 1.598479173665891e-05, + "loss": 0.0144, + "step": 10131 + }, + { + "epoch": 4.120374135827572, + "grad_norm": 2.7416735761008617, + "learning_rate": 1.5983980002004134e-05, + "loss": 0.0784, + "step": 10132 + }, + { + "epoch": 4.120780805205368, + "grad_norm": 0.25039703388118484, + "learning_rate": 1.5983168205920796e-05, + "loss": 0.0053, + "step": 10133 + }, + { + "epoch": 4.121187474583164, + "grad_norm": 3.8464417861880604, + "learning_rate": 1.5982356348417222e-05, + "loss": 0.0589, + "step": 10134 + }, + { + "epoch": 4.12159414396096, + "grad_norm": 3.8393307302800377, + "learning_rate": 1.598154442950175e-05, + "loss": 0.1393, + "step": 10135 + }, + { + "epoch": 4.122000813338755, + "grad_norm": 4.916172252854958, + "learning_rate": 1.5980732449182716e-05, + "loss": 0.079, + "step": 10136 + }, + { + "epoch": 4.122407482716551, + "grad_norm": 8.38710974208782, + "learning_rate": 1.5979920407468453e-05, + "loss": 0.2855, + "step": 10137 + }, + { + "epoch": 4.122814152094347, + "grad_norm": 7.349326605093842, + "learning_rate": 1.5979108304367296e-05, + "loss": 0.4611, + "step": 10138 + }, + { + "epoch": 4.1232208214721435, + "grad_norm": 0.037530391597671096, + "learning_rate": 1.597829613988758e-05, + "loss": 0.0006, + "step": 10139 + }, + { + "epoch": 4.123627490849939, + "grad_norm": 3.7506546032834818, + "learning_rate": 1.597748391403765e-05, + "loss": 0.076, + "step": 10140 + }, + { + "epoch": 4.124034160227735, + "grad_norm": 3.7441677673706195, + "learning_rate": 1.5976671626825838e-05, + "loss": 0.0866, + "step": 10141 + }, + { + "epoch": 4.124440829605531, + "grad_norm": 5.660848029828946, + "learning_rate": 1.5975859278260483e-05, + "loss": 0.1243, + "step": 10142 + }, + { + "epoch": 4.124847498983327, 
+ "grad_norm": 4.334525238665904, + "learning_rate": 1.5975046868349925e-05, + "loss": 0.0823, + "step": 10143 + }, + { + "epoch": 4.125254168361122, + "grad_norm": 7.783074667748837, + "learning_rate": 1.5974234397102503e-05, + "loss": 0.1731, + "step": 10144 + }, + { + "epoch": 4.125660837738918, + "grad_norm": 1.419627285824213, + "learning_rate": 1.597342186452656e-05, + "loss": 0.024, + "step": 10145 + }, + { + "epoch": 4.126067507116714, + "grad_norm": 2.1853961103909847, + "learning_rate": 1.5972609270630433e-05, + "loss": 0.0337, + "step": 10146 + }, + { + "epoch": 4.12647417649451, + "grad_norm": 9.680890155616009, + "learning_rate": 1.5971796615422468e-05, + "loss": 0.2151, + "step": 10147 + }, + { + "epoch": 4.126880845872305, + "grad_norm": 3.3888087317839206, + "learning_rate": 1.5970983898911005e-05, + "loss": 0.0545, + "step": 10148 + }, + { + "epoch": 4.127287515250102, + "grad_norm": 2.875691981729176, + "learning_rate": 1.5970171121104386e-05, + "loss": 0.0508, + "step": 10149 + }, + { + "epoch": 4.127694184627898, + "grad_norm": 4.0022658152298325, + "learning_rate": 1.5969358282010955e-05, + "loss": 0.0968, + "step": 10150 + }, + { + "epoch": 4.128100854005694, + "grad_norm": 0.19906590612814054, + "learning_rate": 1.596854538163906e-05, + "loss": 0.0037, + "step": 10151 + }, + { + "epoch": 4.128507523383489, + "grad_norm": 1.6561113314395388, + "learning_rate": 1.596773241999704e-05, + "loss": 0.0207, + "step": 10152 + }, + { + "epoch": 4.128914192761285, + "grad_norm": 8.888794677060432, + "learning_rate": 1.5966919397093243e-05, + "loss": 0.3856, + "step": 10153 + }, + { + "epoch": 4.129320862139081, + "grad_norm": 1.3893199590215983, + "learning_rate": 1.5966106312936017e-05, + "loss": 0.0249, + "step": 10154 + }, + { + "epoch": 4.129727531516877, + "grad_norm": 2.4660785982886675, + "learning_rate": 1.5965293167533707e-05, + "loss": 0.0523, + "step": 10155 + }, + { + "epoch": 4.130134200894672, + "grad_norm": 23.756159933482, + "learning_rate": 1.596447996089466e-05, + "loss": 1.5217, + "step": 10156 + }, + { + "epoch": 4.130540870272468, + "grad_norm": 8.348926307333116, + "learning_rate": 1.5963666693027224e-05, + "loss": 0.1485, + "step": 10157 + }, + { + "epoch": 4.130947539650264, + "grad_norm": 5.436370800870743, + "learning_rate": 1.596285336393975e-05, + "loss": 0.2401, + "step": 10158 + }, + { + "epoch": 4.131354209028061, + "grad_norm": 4.604594572072762, + "learning_rate": 1.5962039973640582e-05, + "loss": 0.147, + "step": 10159 + }, + { + "epoch": 4.131760878405856, + "grad_norm": 0.8309639015788401, + "learning_rate": 1.5961226522138075e-05, + "loss": 0.0122, + "step": 10160 + }, + { + "epoch": 4.132167547783652, + "grad_norm": 0.19400544794801286, + "learning_rate": 1.5960413009440575e-05, + "loss": 0.0043, + "step": 10161 + }, + { + "epoch": 4.132574217161448, + "grad_norm": 0.06780308158172753, + "learning_rate": 1.595959943555644e-05, + "loss": 0.001, + "step": 10162 + }, + { + "epoch": 4.132980886539244, + "grad_norm": 6.0566780961340125, + "learning_rate": 1.5958785800494012e-05, + "loss": 0.2275, + "step": 10163 + }, + { + "epoch": 4.133387555917039, + "grad_norm": 2.1261676984077256, + "learning_rate": 1.5957972104261654e-05, + "loss": 0.0294, + "step": 10164 + }, + { + "epoch": 4.133794225294835, + "grad_norm": 7.015477299861226, + "learning_rate": 1.595715834686771e-05, + "loss": 0.1401, + "step": 10165 + }, + { + "epoch": 4.134200894672631, + "grad_norm": 0.053189039253591024, + "learning_rate": 1.5956344528320537e-05, + "loss": 0.0009, + 
"step": 10166 + }, + { + "epoch": 4.134607564050427, + "grad_norm": 6.4167635066815665, + "learning_rate": 1.5955530648628492e-05, + "loss": 0.1905, + "step": 10167 + }, + { + "epoch": 4.1350142334282225, + "grad_norm": 2.477202396732873, + "learning_rate": 1.595471670779993e-05, + "loss": 0.0356, + "step": 10168 + }, + { + "epoch": 4.135420902806018, + "grad_norm": 1.814919362439912, + "learning_rate": 1.59539027058432e-05, + "loss": 0.0248, + "step": 10169 + }, + { + "epoch": 4.135827572183815, + "grad_norm": 6.564009423953356, + "learning_rate": 1.595308864276666e-05, + "loss": 0.2161, + "step": 10170 + }, + { + "epoch": 4.136234241561611, + "grad_norm": 2.657348160706653, + "learning_rate": 1.5952274518578675e-05, + "loss": 0.0452, + "step": 10171 + }, + { + "epoch": 4.1366409109394064, + "grad_norm": 0.9853562308204183, + "learning_rate": 1.5951460333287592e-05, + "loss": 0.0214, + "step": 10172 + }, + { + "epoch": 4.137047580317202, + "grad_norm": 18.706136880652735, + "learning_rate": 1.5950646086901774e-05, + "loss": 1.0199, + "step": 10173 + }, + { + "epoch": 4.137454249694998, + "grad_norm": 7.137116483876165, + "learning_rate": 1.5949831779429583e-05, + "loss": 0.1991, + "step": 10174 + }, + { + "epoch": 4.137860919072794, + "grad_norm": 7.941198406089034, + "learning_rate": 1.594901741087937e-05, + "loss": 0.3309, + "step": 10175 + }, + { + "epoch": 4.1382675884505895, + "grad_norm": 7.130932136439739, + "learning_rate": 1.59482029812595e-05, + "loss": 0.1804, + "step": 10176 + }, + { + "epoch": 4.138674257828385, + "grad_norm": 10.180168541117053, + "learning_rate": 1.594738849057833e-05, + "loss": 0.6242, + "step": 10177 + }, + { + "epoch": 4.139080927206181, + "grad_norm": 11.797214163395173, + "learning_rate": 1.594657393884423e-05, + "loss": 0.4389, + "step": 10178 + }, + { + "epoch": 4.139487596583977, + "grad_norm": 4.996161120223716, + "learning_rate": 1.5945759326065553e-05, + "loss": 0.1174, + "step": 10179 + }, + { + "epoch": 4.1398942659617735, + "grad_norm": 7.95810307028992, + "learning_rate": 1.594494465225066e-05, + "loss": 0.5532, + "step": 10180 + }, + { + "epoch": 4.140300935339569, + "grad_norm": 1.2403414855533936, + "learning_rate": 1.5944129917407926e-05, + "loss": 0.0201, + "step": 10181 + }, + { + "epoch": 4.140707604717365, + "grad_norm": 0.4305052269097065, + "learning_rate": 1.5943315121545702e-05, + "loss": 0.009, + "step": 10182 + }, + { + "epoch": 4.141114274095161, + "grad_norm": 0.454747903762896, + "learning_rate": 1.5942500264672356e-05, + "loss": 0.0065, + "step": 10183 + }, + { + "epoch": 4.1415209434729565, + "grad_norm": 4.273802710275314, + "learning_rate": 1.5941685346796257e-05, + "loss": 0.0576, + "step": 10184 + }, + { + "epoch": 4.141927612850752, + "grad_norm": 1.680258895316438, + "learning_rate": 1.5940870367925765e-05, + "loss": 0.0248, + "step": 10185 + }, + { + "epoch": 4.142334282228548, + "grad_norm": 8.734738552490803, + "learning_rate": 1.594005532806925e-05, + "loss": 0.4779, + "step": 10186 + }, + { + "epoch": 4.142740951606344, + "grad_norm": 0.985881562900716, + "learning_rate": 1.593924022723508e-05, + "loss": 0.0094, + "step": 10187 + }, + { + "epoch": 4.14314762098414, + "grad_norm": 3.3048681705839345, + "learning_rate": 1.5938425065431616e-05, + "loss": 0.0511, + "step": 10188 + }, + { + "epoch": 4.143554290361935, + "grad_norm": 6.215243168681938, + "learning_rate": 1.593760984266723e-05, + "loss": 0.365, + "step": 10189 + }, + { + "epoch": 4.143960959739732, + "grad_norm": 6.490956885103959, + "learning_rate": 
1.5936794558950293e-05, + "loss": 0.4493, + "step": 10190 + }, + { + "epoch": 4.144367629117528, + "grad_norm": 3.9146544834231687, + "learning_rate": 1.5935979214289172e-05, + "loss": 0.0854, + "step": 10191 + }, + { + "epoch": 4.1447742984953235, + "grad_norm": 10.984840214320984, + "learning_rate": 1.5935163808692236e-05, + "loss": 0.5248, + "step": 10192 + }, + { + "epoch": 4.145180967873119, + "grad_norm": 7.982219773056562, + "learning_rate": 1.5934348342167855e-05, + "loss": 0.3538, + "step": 10193 + }, + { + "epoch": 4.145587637250915, + "grad_norm": 0.4457857852762333, + "learning_rate": 1.59335328147244e-05, + "loss": 0.007, + "step": 10194 + }, + { + "epoch": 4.145994306628711, + "grad_norm": 8.948902196959335, + "learning_rate": 1.593271722637025e-05, + "loss": 0.4929, + "step": 10195 + }, + { + "epoch": 4.146400976006507, + "grad_norm": 8.56620235204261, + "learning_rate": 1.5931901577113768e-05, + "loss": 0.377, + "step": 10196 + }, + { + "epoch": 4.146807645384302, + "grad_norm": 9.551658441868913, + "learning_rate": 1.593108586696333e-05, + "loss": 0.2099, + "step": 10197 + }, + { + "epoch": 4.147214314762098, + "grad_norm": 1.0167645183579277, + "learning_rate": 1.5930270095927315e-05, + "loss": 0.0139, + "step": 10198 + }, + { + "epoch": 4.147620984139894, + "grad_norm": 5.71021364728332, + "learning_rate": 1.592945426401409e-05, + "loss": 0.0941, + "step": 10199 + }, + { + "epoch": 4.1480276535176905, + "grad_norm": 10.129985503571039, + "learning_rate": 1.5928638371232033e-05, + "loss": 1.0819, + "step": 10200 + }, + { + "epoch": 4.148434322895486, + "grad_norm": 11.893219952266168, + "learning_rate": 1.5927822417589517e-05, + "loss": 0.8549, + "step": 10201 + }, + { + "epoch": 4.148840992273282, + "grad_norm": 8.890233716646652, + "learning_rate": 1.5927006403094924e-05, + "loss": 0.3792, + "step": 10202 + }, + { + "epoch": 4.149247661651078, + "grad_norm": 3.8250124790009123, + "learning_rate": 1.5926190327756623e-05, + "loss": 0.0435, + "step": 10203 + }, + { + "epoch": 4.149654331028874, + "grad_norm": 2.088137025359697, + "learning_rate": 1.5925374191582997e-05, + "loss": 0.0237, + "step": 10204 + }, + { + "epoch": 4.150061000406669, + "grad_norm": 8.023604075551523, + "learning_rate": 1.5924557994582426e-05, + "loss": 0.1284, + "step": 10205 + }, + { + "epoch": 4.150467669784465, + "grad_norm": 0.10538783291279387, + "learning_rate": 1.5923741736763285e-05, + "loss": 0.0016, + "step": 10206 + }, + { + "epoch": 4.150874339162261, + "grad_norm": 7.071822927859189, + "learning_rate": 1.5922925418133953e-05, + "loss": 0.3039, + "step": 10207 + }, + { + "epoch": 4.151281008540057, + "grad_norm": 6.997733390200213, + "learning_rate": 1.592210903870281e-05, + "loss": 0.3118, + "step": 10208 + }, + { + "epoch": 4.151687677917852, + "grad_norm": 7.28263923002806, + "learning_rate": 1.5921292598478238e-05, + "loss": 0.1807, + "step": 10209 + }, + { + "epoch": 4.152094347295648, + "grad_norm": 4.502453640899566, + "learning_rate": 1.5920476097468615e-05, + "loss": 0.1346, + "step": 10210 + }, + { + "epoch": 4.152501016673445, + "grad_norm": 9.060773232792327, + "learning_rate": 1.591965953568233e-05, + "loss": 0.2621, + "step": 10211 + }, + { + "epoch": 4.152907686051241, + "grad_norm": 24.24166809506629, + "learning_rate": 1.5918842913127757e-05, + "loss": 0.5948, + "step": 10212 + }, + { + "epoch": 4.153314355429036, + "grad_norm": 5.1955562218884195, + "learning_rate": 1.591802622981328e-05, + "loss": 0.0984, + "step": 10213 + }, + { + "epoch": 4.153721024806832, + 
"grad_norm": 0.6793776832549195, + "learning_rate": 1.5917209485747288e-05, + "loss": 0.0127, + "step": 10214 + }, + { + "epoch": 4.154127694184628, + "grad_norm": 13.037767068225055, + "learning_rate": 1.5916392680938166e-05, + "loss": 0.1842, + "step": 10215 + }, + { + "epoch": 4.154534363562424, + "grad_norm": 14.796262840963701, + "learning_rate": 1.591557581539429e-05, + "loss": 0.6207, + "step": 10216 + }, + { + "epoch": 4.154941032940219, + "grad_norm": 3.231354760950123, + "learning_rate": 1.5914758889124055e-05, + "loss": 0.0504, + "step": 10217 + }, + { + "epoch": 4.155347702318015, + "grad_norm": 0.6003765337127331, + "learning_rate": 1.5913941902135838e-05, + "loss": 0.0121, + "step": 10218 + }, + { + "epoch": 4.155754371695811, + "grad_norm": 7.066905984162223, + "learning_rate": 1.5913124854438035e-05, + "loss": 0.2743, + "step": 10219 + }, + { + "epoch": 4.156161041073607, + "grad_norm": 5.7980859936208775, + "learning_rate": 1.591230774603903e-05, + "loss": 0.2058, + "step": 10220 + }, + { + "epoch": 4.156567710451403, + "grad_norm": 10.55696571177737, + "learning_rate": 1.591149057694721e-05, + "loss": 0.6176, + "step": 10221 + }, + { + "epoch": 4.156974379829199, + "grad_norm": 4.344900908007107, + "learning_rate": 1.591067334717096e-05, + "loss": 0.0912, + "step": 10222 + }, + { + "epoch": 4.157381049206995, + "grad_norm": 7.7016527759572515, + "learning_rate": 1.590985605671868e-05, + "loss": 0.0907, + "step": 10223 + }, + { + "epoch": 4.157787718584791, + "grad_norm": 9.220916466001347, + "learning_rate": 1.5909038705598747e-05, + "loss": 0.4151, + "step": 10224 + }, + { + "epoch": 4.1581943879625864, + "grad_norm": 0.2622047594513011, + "learning_rate": 1.590822129381956e-05, + "loss": 0.0055, + "step": 10225 + }, + { + "epoch": 4.158601057340382, + "grad_norm": 2.9348648986444874, + "learning_rate": 1.5907403821389507e-05, + "loss": 0.0491, + "step": 10226 + }, + { + "epoch": 4.159007726718178, + "grad_norm": 0.1423580513785935, + "learning_rate": 1.5906586288316984e-05, + "loss": 0.0015, + "step": 10227 + }, + { + "epoch": 4.159414396095974, + "grad_norm": 12.870004528683465, + "learning_rate": 1.5905768694610377e-05, + "loss": 0.6393, + "step": 10228 + }, + { + "epoch": 4.1598210654737695, + "grad_norm": 4.98864936744567, + "learning_rate": 1.5904951040278084e-05, + "loss": 0.1639, + "step": 10229 + }, + { + "epoch": 4.160227734851565, + "grad_norm": 17.104819706017082, + "learning_rate": 1.5904133325328495e-05, + "loss": 0.3145, + "step": 10230 + }, + { + "epoch": 4.160634404229362, + "grad_norm": 5.791069166489187, + "learning_rate": 1.5903315549770006e-05, + "loss": 0.1073, + "step": 10231 + }, + { + "epoch": 4.161041073607158, + "grad_norm": 0.605474993991465, + "learning_rate": 1.5902497713611015e-05, + "loss": 0.0095, + "step": 10232 + }, + { + "epoch": 4.1614477429849535, + "grad_norm": 3.8162557809952578, + "learning_rate": 1.590167981685991e-05, + "loss": 0.0627, + "step": 10233 + }, + { + "epoch": 4.161854412362749, + "grad_norm": 0.41137153282064787, + "learning_rate": 1.5900861859525093e-05, + "loss": 0.0041, + "step": 10234 + }, + { + "epoch": 4.162261081740545, + "grad_norm": 13.957915638668878, + "learning_rate": 1.590004384161496e-05, + "loss": 0.1823, + "step": 10235 + }, + { + "epoch": 4.162667751118341, + "grad_norm": 0.4929825257813838, + "learning_rate": 1.5899225763137905e-05, + "loss": 0.0076, + "step": 10236 + }, + { + "epoch": 4.1630744204961365, + "grad_norm": 0.2357087229835033, + "learning_rate": 1.589840762410233e-05, + "loss": 0.0055, 
+ "step": 10237 + }, + { + "epoch": 4.163481089873932, + "grad_norm": 5.122133726744756, + "learning_rate": 1.589758942451663e-05, + "loss": 0.1497, + "step": 10238 + }, + { + "epoch": 4.163887759251728, + "grad_norm": 11.091351614853329, + "learning_rate": 1.589677116438921e-05, + "loss": 0.5322, + "step": 10239 + }, + { + "epoch": 4.164294428629524, + "grad_norm": 4.350420249969282, + "learning_rate": 1.5895952843728465e-05, + "loss": 0.0692, + "step": 10240 + }, + { + "epoch": 4.1647010980073205, + "grad_norm": 14.102566483862377, + "learning_rate": 1.5895134462542798e-05, + "loss": 0.6776, + "step": 10241 + }, + { + "epoch": 4.165107767385116, + "grad_norm": 6.721806381308513, + "learning_rate": 1.5894316020840607e-05, + "loss": 0.2231, + "step": 10242 + }, + { + "epoch": 4.165514436762912, + "grad_norm": 7.156778274362446, + "learning_rate": 1.5893497518630296e-05, + "loss": 0.1689, + "step": 10243 + }, + { + "epoch": 4.165921106140708, + "grad_norm": 5.281221446807091, + "learning_rate": 1.5892678955920267e-05, + "loss": 0.132, + "step": 10244 + }, + { + "epoch": 4.1663277755185035, + "grad_norm": 4.057847837535631, + "learning_rate": 1.5891860332718923e-05, + "loss": 0.2743, + "step": 10245 + }, + { + "epoch": 4.166734444896299, + "grad_norm": 3.266975757202991, + "learning_rate": 1.5891041649034665e-05, + "loss": 0.1336, + "step": 10246 + }, + { + "epoch": 4.167141114274095, + "grad_norm": 8.48204872181571, + "learning_rate": 1.58902229048759e-05, + "loss": 0.6768, + "step": 10247 + }, + { + "epoch": 4.167547783651891, + "grad_norm": 1.8729760538880988, + "learning_rate": 1.5889404100251037e-05, + "loss": 0.0273, + "step": 10248 + }, + { + "epoch": 4.167954453029687, + "grad_norm": 3.140385411733788, + "learning_rate": 1.5888585235168474e-05, + "loss": 0.045, + "step": 10249 + }, + { + "epoch": 4.168361122407482, + "grad_norm": 6.673879788159423, + "learning_rate": 1.588776630963662e-05, + "loss": 0.3285, + "step": 10250 + }, + { + "epoch": 4.168767791785278, + "grad_norm": 2.414131665389106, + "learning_rate": 1.588694732366388e-05, + "loss": 0.0417, + "step": 10251 + }, + { + "epoch": 4.169174461163075, + "grad_norm": 11.32563589287079, + "learning_rate": 1.5886128277258665e-05, + "loss": 0.4774, + "step": 10252 + }, + { + "epoch": 4.1695811305408705, + "grad_norm": 7.895353999553494, + "learning_rate": 1.5885309170429378e-05, + "loss": 0.3903, + "step": 10253 + }, + { + "epoch": 4.169987799918666, + "grad_norm": 11.079932396033108, + "learning_rate": 1.5884490003184433e-05, + "loss": 0.4885, + "step": 10254 + }, + { + "epoch": 4.170394469296462, + "grad_norm": 4.4065665127835985, + "learning_rate": 1.5883670775532236e-05, + "loss": 0.1665, + "step": 10255 + }, + { + "epoch": 4.170801138674258, + "grad_norm": 5.364491689343461, + "learning_rate": 1.5882851487481196e-05, + "loss": 0.0938, + "step": 10256 + }, + { + "epoch": 4.171207808052054, + "grad_norm": 12.508655100955458, + "learning_rate": 1.5882032139039725e-05, + "loss": 0.4128, + "step": 10257 + }, + { + "epoch": 4.171614477429849, + "grad_norm": 8.030878390767425, + "learning_rate": 1.5881212730216237e-05, + "loss": 0.9706, + "step": 10258 + }, + { + "epoch": 4.172021146807645, + "grad_norm": 3.4640465937631575, + "learning_rate": 1.5880393261019137e-05, + "loss": 0.1481, + "step": 10259 + }, + { + "epoch": 4.172427816185441, + "grad_norm": 4.340671583771955, + "learning_rate": 1.5879573731456842e-05, + "loss": 0.1444, + "step": 10260 + }, + { + "epoch": 4.172834485563237, + "grad_norm": 4.323888629969375, + 
"learning_rate": 1.5878754141537764e-05, + "loss": 0.0858, + "step": 10261 + }, + { + "epoch": 4.173241154941033, + "grad_norm": 4.9007035426013985, + "learning_rate": 1.5877934491270316e-05, + "loss": 0.1054, + "step": 10262 + }, + { + "epoch": 4.173647824318829, + "grad_norm": 14.639743289076593, + "learning_rate": 1.587711478066291e-05, + "loss": 0.4355, + "step": 10263 + }, + { + "epoch": 4.174054493696625, + "grad_norm": 1.861412907979053, + "learning_rate": 1.5876295009723964e-05, + "loss": 0.0262, + "step": 10264 + }, + { + "epoch": 4.174461163074421, + "grad_norm": 5.724267133161028, + "learning_rate": 1.5875475178461892e-05, + "loss": 0.1913, + "step": 10265 + }, + { + "epoch": 4.174867832452216, + "grad_norm": 2.93201658012308, + "learning_rate": 1.587465528688511e-05, + "loss": 0.0557, + "step": 10266 + }, + { + "epoch": 4.175274501830012, + "grad_norm": 7.208131582497054, + "learning_rate": 1.5873835335002038e-05, + "loss": 0.2045, + "step": 10267 + }, + { + "epoch": 4.175681171207808, + "grad_norm": 7.101341910340732, + "learning_rate": 1.5873015322821085e-05, + "loss": 0.1724, + "step": 10268 + }, + { + "epoch": 4.176087840585604, + "grad_norm": 3.722430432846172, + "learning_rate": 1.5872195250350677e-05, + "loss": 0.1678, + "step": 10269 + }, + { + "epoch": 4.176494509963399, + "grad_norm": 5.6239517572984745, + "learning_rate": 1.587137511759923e-05, + "loss": 0.1447, + "step": 10270 + }, + { + "epoch": 4.176901179341195, + "grad_norm": 1.1874662122882143, + "learning_rate": 1.587055492457516e-05, + "loss": 0.0189, + "step": 10271 + }, + { + "epoch": 4.177307848718992, + "grad_norm": 13.29505037425251, + "learning_rate": 1.586973467128689e-05, + "loss": 1.1305, + "step": 10272 + }, + { + "epoch": 4.177714518096788, + "grad_norm": 6.578622883532241, + "learning_rate": 1.586891435774284e-05, + "loss": 0.1947, + "step": 10273 + }, + { + "epoch": 4.178121187474583, + "grad_norm": 0.1104393732903283, + "learning_rate": 1.5868093983951428e-05, + "loss": 0.0014, + "step": 10274 + }, + { + "epoch": 4.178527856852379, + "grad_norm": 4.630209653926905, + "learning_rate": 1.586727354992108e-05, + "loss": 0.1045, + "step": 10275 + }, + { + "epoch": 4.178934526230175, + "grad_norm": 0.23935446664184815, + "learning_rate": 1.5866453055660218e-05, + "loss": 0.0046, + "step": 10276 + }, + { + "epoch": 4.179341195607971, + "grad_norm": 7.574112197663587, + "learning_rate": 1.5865632501177262e-05, + "loss": 0.2852, + "step": 10277 + }, + { + "epoch": 4.1797478649857664, + "grad_norm": 4.787246765163566, + "learning_rate": 1.586481188648063e-05, + "loss": 0.2558, + "step": 10278 + }, + { + "epoch": 4.180154534363562, + "grad_norm": 3.536895614106999, + "learning_rate": 1.586399121157876e-05, + "loss": 0.0704, + "step": 10279 + }, + { + "epoch": 4.180561203741358, + "grad_norm": 6.895602449625473, + "learning_rate": 1.5863170476480067e-05, + "loss": 0.2059, + "step": 10280 + }, + { + "epoch": 4.180967873119154, + "grad_norm": 1.5252375107270097, + "learning_rate": 1.5862349681192976e-05, + "loss": 0.0268, + "step": 10281 + }, + { + "epoch": 4.18137454249695, + "grad_norm": 14.828809227446904, + "learning_rate": 1.586152882572592e-05, + "loss": 0.6981, + "step": 10282 + }, + { + "epoch": 4.181781211874746, + "grad_norm": 10.327092469516899, + "learning_rate": 1.5860707910087314e-05, + "loss": 0.3287, + "step": 10283 + }, + { + "epoch": 4.182187881252542, + "grad_norm": 0.5794652727052321, + "learning_rate": 1.5859886934285596e-05, + "loss": 0.014, + "step": 10284 + }, + { + "epoch": 
4.182594550630338, + "grad_norm": 11.540666080074292, + "learning_rate": 1.5859065898329188e-05, + "loss": 0.592, + "step": 10285 + }, + { + "epoch": 4.1830012200081335, + "grad_norm": 4.849702013896176, + "learning_rate": 1.5858244802226518e-05, + "loss": 0.1816, + "step": 10286 + }, + { + "epoch": 4.183407889385929, + "grad_norm": 5.372365378333699, + "learning_rate": 1.585742364598602e-05, + "loss": 0.207, + "step": 10287 + }, + { + "epoch": 4.183814558763725, + "grad_norm": 5.551540381989169, + "learning_rate": 1.5856602429616116e-05, + "loss": 0.1146, + "step": 10288 + }, + { + "epoch": 4.184221228141521, + "grad_norm": 16.940547491899757, + "learning_rate": 1.5855781153125244e-05, + "loss": 0.83, + "step": 10289 + }, + { + "epoch": 4.1846278975193165, + "grad_norm": 2.549355812435819, + "learning_rate": 1.5854959816521832e-05, + "loss": 0.0364, + "step": 10290 + }, + { + "epoch": 4.185034566897112, + "grad_norm": 7.04234890966012, + "learning_rate": 1.5854138419814308e-05, + "loss": 0.3533, + "step": 10291 + }, + { + "epoch": 4.185441236274908, + "grad_norm": 8.45712303637379, + "learning_rate": 1.585331696301111e-05, + "loss": 0.2703, + "step": 10292 + }, + { + "epoch": 4.185847905652705, + "grad_norm": 1.1292570645230464, + "learning_rate": 1.5852495446120667e-05, + "loss": 0.0339, + "step": 10293 + }, + { + "epoch": 4.1862545750305005, + "grad_norm": 1.3203879729407213, + "learning_rate": 1.5851673869151412e-05, + "loss": 0.0194, + "step": 10294 + }, + { + "epoch": 4.186661244408296, + "grad_norm": 0.545196308810692, + "learning_rate": 1.5850852232111778e-05, + "loss": 0.0072, + "step": 10295 + }, + { + "epoch": 4.187067913786092, + "grad_norm": 4.690717062832537, + "learning_rate": 1.5850030535010202e-05, + "loss": 0.153, + "step": 10296 + }, + { + "epoch": 4.187474583163888, + "grad_norm": 16.783324280821702, + "learning_rate": 1.584920877785512e-05, + "loss": 0.7784, + "step": 10297 + }, + { + "epoch": 4.1878812525416835, + "grad_norm": 8.134178447869274, + "learning_rate": 1.5848386960654964e-05, + "loss": 0.2631, + "step": 10298 + }, + { + "epoch": 4.188287921919479, + "grad_norm": 4.94300199872503, + "learning_rate": 1.584756508341817e-05, + "loss": 0.1754, + "step": 10299 + }, + { + "epoch": 4.188694591297275, + "grad_norm": 4.489953983079751, + "learning_rate": 1.5846743146153183e-05, + "loss": 0.2825, + "step": 10300 + }, + { + "epoch": 4.189101260675071, + "grad_norm": 0.05734765950424864, + "learning_rate": 1.584592114886843e-05, + "loss": 0.0009, + "step": 10301 + }, + { + "epoch": 4.189507930052867, + "grad_norm": 1.1139853829006063, + "learning_rate": 1.5845099091572358e-05, + "loss": 0.0133, + "step": 10302 + }, + { + "epoch": 4.189914599430663, + "grad_norm": 6.5589897070033745, + "learning_rate": 1.5844276974273404e-05, + "loss": 0.3432, + "step": 10303 + }, + { + "epoch": 4.190321268808459, + "grad_norm": 5.866673160245872, + "learning_rate": 1.584345479698e-05, + "loss": 0.2108, + "step": 10304 + }, + { + "epoch": 4.190727938186255, + "grad_norm": 0.051353802345705565, + "learning_rate": 1.5842632559700595e-05, + "loss": 0.0009, + "step": 10305 + }, + { + "epoch": 4.1911346075640505, + "grad_norm": 2.8481003090425334, + "learning_rate": 1.5841810262443623e-05, + "loss": 0.0521, + "step": 10306 + }, + { + "epoch": 4.191541276941846, + "grad_norm": 0.3109018630865751, + "learning_rate": 1.5840987905217532e-05, + "loss": 0.0058, + "step": 10307 + }, + { + "epoch": 4.191947946319642, + "grad_norm": 10.126893066239194, + "learning_rate": 1.5840165488030757e-05, + 
"loss": 0.2748, + "step": 10308 + }, + { + "epoch": 4.192354615697438, + "grad_norm": 4.32070273265905, + "learning_rate": 1.5839343010891746e-05, + "loss": 0.1377, + "step": 10309 + }, + { + "epoch": 4.192761285075234, + "grad_norm": 7.5758885042714486, + "learning_rate": 1.5838520473808942e-05, + "loss": 0.3281, + "step": 10310 + }, + { + "epoch": 4.193167954453029, + "grad_norm": 13.052952855776233, + "learning_rate": 1.5837697876790784e-05, + "loss": 0.8403, + "step": 10311 + }, + { + "epoch": 4.193574623830825, + "grad_norm": 5.019971304002212, + "learning_rate": 1.583687521984572e-05, + "loss": 0.3362, + "step": 10312 + }, + { + "epoch": 4.193981293208622, + "grad_norm": 2.3063179345765263, + "learning_rate": 1.5836052502982195e-05, + "loss": 0.0391, + "step": 10313 + }, + { + "epoch": 4.1943879625864176, + "grad_norm": 0.3412394503854309, + "learning_rate": 1.5835229726208654e-05, + "loss": 0.0049, + "step": 10314 + }, + { + "epoch": 4.194794631964213, + "grad_norm": 5.699829465279366, + "learning_rate": 1.583440688953354e-05, + "loss": 0.1609, + "step": 10315 + }, + { + "epoch": 4.195201301342009, + "grad_norm": 1.8456209125954532, + "learning_rate": 1.5833583992965307e-05, + "loss": 0.0652, + "step": 10316 + }, + { + "epoch": 4.195607970719805, + "grad_norm": 9.392379454482613, + "learning_rate": 1.5832761036512397e-05, + "loss": 0.6031, + "step": 10317 + }, + { + "epoch": 4.196014640097601, + "grad_norm": 0.9629340653179062, + "learning_rate": 1.5831938020183258e-05, + "loss": 0.0232, + "step": 10318 + }, + { + "epoch": 4.196421309475396, + "grad_norm": 10.533047021168521, + "learning_rate": 1.583111494398634e-05, + "loss": 0.3145, + "step": 10319 + }, + { + "epoch": 4.196827978853192, + "grad_norm": 0.019470409323970983, + "learning_rate": 1.5830291807930093e-05, + "loss": 0.0006, + "step": 10320 + }, + { + "epoch": 4.197234648230988, + "grad_norm": 0.941818356322765, + "learning_rate": 1.582946861202297e-05, + "loss": 0.0156, + "step": 10321 + }, + { + "epoch": 4.197641317608784, + "grad_norm": 1.5682670457243966, + "learning_rate": 1.582864535627341e-05, + "loss": 0.0325, + "step": 10322 + }, + { + "epoch": 4.19804798698658, + "grad_norm": 9.223258020572278, + "learning_rate": 1.582782204068988e-05, + "loss": 0.2442, + "step": 10323 + }, + { + "epoch": 4.198454656364376, + "grad_norm": 5.8677089355641865, + "learning_rate": 1.5826998665280816e-05, + "loss": 0.1454, + "step": 10324 + }, + { + "epoch": 4.198861325742172, + "grad_norm": 9.171530038510637, + "learning_rate": 1.5826175230054683e-05, + "loss": 0.3088, + "step": 10325 + }, + { + "epoch": 4.199267995119968, + "grad_norm": 0.2833654928650886, + "learning_rate": 1.582535173501993e-05, + "loss": 0.003, + "step": 10326 + }, + { + "epoch": 4.199674664497763, + "grad_norm": 15.317839713710626, + "learning_rate": 1.5824528180185006e-05, + "loss": 0.9679, + "step": 10327 + }, + { + "epoch": 4.200081333875559, + "grad_norm": 4.472233034539501, + "learning_rate": 1.582370456555837e-05, + "loss": 0.2395, + "step": 10328 + }, + { + "epoch": 4.200488003253355, + "grad_norm": 5.585222225122869, + "learning_rate": 1.582288089114848e-05, + "loss": 0.1781, + "step": 10329 + }, + { + "epoch": 4.200894672631151, + "grad_norm": 10.625862923985512, + "learning_rate": 1.582205715696378e-05, + "loss": 0.6267, + "step": 10330 + }, + { + "epoch": 4.2013013420089464, + "grad_norm": 4.837828571618871, + "learning_rate": 1.5821233363012738e-05, + "loss": 0.0795, + "step": 10331 + }, + { + "epoch": 4.201708011386742, + "grad_norm": 
9.983560213608346, + "learning_rate": 1.5820409509303804e-05, + "loss": 0.2159, + "step": 10332 + }, + { + "epoch": 4.202114680764538, + "grad_norm": 4.855244117794876, + "learning_rate": 1.5819585595845434e-05, + "loss": 0.2034, + "step": 10333 + }, + { + "epoch": 4.202521350142335, + "grad_norm": 1.7526586262582555, + "learning_rate": 1.581876162264609e-05, + "loss": 0.0385, + "step": 10334 + }, + { + "epoch": 4.20292801952013, + "grad_norm": 2.361202042033954, + "learning_rate": 1.5817937589714236e-05, + "loss": 0.0508, + "step": 10335 + }, + { + "epoch": 4.203334688897926, + "grad_norm": 1.1208425451831727, + "learning_rate": 1.5817113497058317e-05, + "loss": 0.0144, + "step": 10336 + }, + { + "epoch": 4.203741358275722, + "grad_norm": 7.676441981394531, + "learning_rate": 1.5816289344686803e-05, + "loss": 0.2447, + "step": 10337 + }, + { + "epoch": 4.204148027653518, + "grad_norm": 2.3850210860998304, + "learning_rate": 1.5815465132608153e-05, + "loss": 0.0676, + "step": 10338 + }, + { + "epoch": 4.2045546970313135, + "grad_norm": 2.059247414296063, + "learning_rate": 1.5814640860830825e-05, + "loss": 0.0401, + "step": 10339 + }, + { + "epoch": 4.204961366409109, + "grad_norm": 4.027106444025499, + "learning_rate": 1.5813816529363284e-05, + "loss": 0.0664, + "step": 10340 + }, + { + "epoch": 4.205368035786905, + "grad_norm": 0.5297662715276318, + "learning_rate": 1.581299213821399e-05, + "loss": 0.0094, + "step": 10341 + }, + { + "epoch": 4.205774705164701, + "grad_norm": 6.077039230388544, + "learning_rate": 1.58121676873914e-05, + "loss": 0.1005, + "step": 10342 + }, + { + "epoch": 4.2061813745424965, + "grad_norm": 5.450807560390357, + "learning_rate": 1.581134317690399e-05, + "loss": 0.1592, + "step": 10343 + }, + { + "epoch": 4.206588043920293, + "grad_norm": 4.499958359133264, + "learning_rate": 1.5810518606760217e-05, + "loss": 0.1157, + "step": 10344 + }, + { + "epoch": 4.206994713298089, + "grad_norm": 4.540547657946913, + "learning_rate": 1.5809693976968546e-05, + "loss": 0.1455, + "step": 10345 + }, + { + "epoch": 4.207401382675885, + "grad_norm": 8.010963278196611, + "learning_rate": 1.580886928753744e-05, + "loss": 0.2083, + "step": 10346 + }, + { + "epoch": 4.2078080520536805, + "grad_norm": 6.255374055566489, + "learning_rate": 1.5808044538475367e-05, + "loss": 0.1313, + "step": 10347 + }, + { + "epoch": 4.208214721431476, + "grad_norm": 14.035749750046511, + "learning_rate": 1.5807219729790796e-05, + "loss": 0.3131, + "step": 10348 + }, + { + "epoch": 4.208621390809272, + "grad_norm": 2.593389913938225, + "learning_rate": 1.580639486149219e-05, + "loss": 0.0198, + "step": 10349 + }, + { + "epoch": 4.209028060187068, + "grad_norm": 0.0863063213253828, + "learning_rate": 1.580556993358802e-05, + "loss": 0.0014, + "step": 10350 + }, + { + "epoch": 4.2094347295648635, + "grad_norm": 9.685907285934407, + "learning_rate": 1.580474494608675e-05, + "loss": 0.4344, + "step": 10351 + }, + { + "epoch": 4.209841398942659, + "grad_norm": 10.663807549415797, + "learning_rate": 1.5803919898996855e-05, + "loss": 0.3657, + "step": 10352 + }, + { + "epoch": 4.210248068320455, + "grad_norm": 3.9777078108063018, + "learning_rate": 1.58030947923268e-05, + "loss": 0.0414, + "step": 10353 + }, + { + "epoch": 4.210654737698252, + "grad_norm": 0.18287030027652043, + "learning_rate": 1.5802269626085055e-05, + "loss": 0.0027, + "step": 10354 + }, + { + "epoch": 4.2110614070760475, + "grad_norm": 1.5503797937845258, + "learning_rate": 1.5801444400280094e-05, + "loss": 0.0197, + "step": 10355 + }, 
+ { + "epoch": 4.211468076453843, + "grad_norm": 4.34718692677501, + "learning_rate": 1.5800619114920386e-05, + "loss": 0.0669, + "step": 10356 + }, + { + "epoch": 4.211874745831639, + "grad_norm": 7.940792843119552, + "learning_rate": 1.5799793770014408e-05, + "loss": 0.3664, + "step": 10357 + }, + { + "epoch": 4.212281415209435, + "grad_norm": 0.8513689040747776, + "learning_rate": 1.5798968365570623e-05, + "loss": 0.0132, + "step": 10358 + }, + { + "epoch": 4.2126880845872305, + "grad_norm": 2.293028418499348, + "learning_rate": 1.5798142901597507e-05, + "loss": 0.0161, + "step": 10359 + }, + { + "epoch": 4.213094753965026, + "grad_norm": 4.214119518964465, + "learning_rate": 1.579731737810354e-05, + "loss": 0.0603, + "step": 10360 + }, + { + "epoch": 4.213501423342822, + "grad_norm": 3.328302882125485, + "learning_rate": 1.5796491795097192e-05, + "loss": 0.0675, + "step": 10361 + }, + { + "epoch": 4.213908092720618, + "grad_norm": 0.534406407641644, + "learning_rate": 1.5795666152586937e-05, + "loss": 0.0068, + "step": 10362 + }, + { + "epoch": 4.214314762098414, + "grad_norm": 4.629835515161879, + "learning_rate": 1.579484045058125e-05, + "loss": 0.2644, + "step": 10363 + }, + { + "epoch": 4.21472143147621, + "grad_norm": 0.8495152323834411, + "learning_rate": 1.579401468908861e-05, + "loss": 0.0098, + "step": 10364 + }, + { + "epoch": 4.215128100854006, + "grad_norm": 11.060168501821627, + "learning_rate": 1.57931888681175e-05, + "loss": 0.2959, + "step": 10365 + }, + { + "epoch": 4.215534770231802, + "grad_norm": 8.854855219741658, + "learning_rate": 1.5792362987676382e-05, + "loss": 0.8279, + "step": 10366 + }, + { + "epoch": 4.2159414396095976, + "grad_norm": 9.045064840816858, + "learning_rate": 1.5791537047773747e-05, + "loss": 0.4145, + "step": 10367 + }, + { + "epoch": 4.216348108987393, + "grad_norm": 0.292820790996213, + "learning_rate": 1.5790711048418065e-05, + "loss": 0.004, + "step": 10368 + }, + { + "epoch": 4.216754778365189, + "grad_norm": 6.1488774309155945, + "learning_rate": 1.5789884989617824e-05, + "loss": 0.2689, + "step": 10369 + }, + { + "epoch": 4.217161447742985, + "grad_norm": 3.067474151377539, + "learning_rate": 1.5789058871381498e-05, + "loss": 0.0438, + "step": 10370 + }, + { + "epoch": 4.217568117120781, + "grad_norm": 0.9148702698946974, + "learning_rate": 1.5788232693717568e-05, + "loss": 0.0114, + "step": 10371 + }, + { + "epoch": 4.217974786498576, + "grad_norm": 8.247630014280265, + "learning_rate": 1.578740645663452e-05, + "loss": 0.2648, + "step": 10372 + }, + { + "epoch": 4.218381455876372, + "grad_norm": 0.23304840488398543, + "learning_rate": 1.578658016014083e-05, + "loss": 0.0026, + "step": 10373 + }, + { + "epoch": 4.218788125254168, + "grad_norm": 1.3125744853722503, + "learning_rate": 1.578575380424498e-05, + "loss": 0.0227, + "step": 10374 + }, + { + "epoch": 4.219194794631965, + "grad_norm": 0.12184973057553074, + "learning_rate": 1.578492738895546e-05, + "loss": 0.0013, + "step": 10375 + }, + { + "epoch": 4.21960146400976, + "grad_norm": 6.529088759186514, + "learning_rate": 1.5784100914280746e-05, + "loss": 0.4228, + "step": 10376 + }, + { + "epoch": 4.220008133387556, + "grad_norm": 5.516887611459233, + "learning_rate": 1.5783274380229327e-05, + "loss": 0.1273, + "step": 10377 + }, + { + "epoch": 4.220414802765352, + "grad_norm": 0.07073357996432991, + "learning_rate": 1.5782447786809682e-05, + "loss": 0.0013, + "step": 10378 + }, + { + "epoch": 4.220821472143148, + "grad_norm": 17.127864494528083, + "learning_rate": 
1.57816211340303e-05, + "loss": 0.8753, + "step": 10379 + }, + { + "epoch": 4.221228141520943, + "grad_norm": 5.496574856258764, + "learning_rate": 1.578079442189967e-05, + "loss": 0.138, + "step": 10380 + }, + { + "epoch": 4.221634810898739, + "grad_norm": 3.5412945627894814, + "learning_rate": 1.5779967650426278e-05, + "loss": 0.0707, + "step": 10381 + }, + { + "epoch": 4.222041480276535, + "grad_norm": 2.7617886928939575, + "learning_rate": 1.577914081961861e-05, + "loss": 0.0829, + "step": 10382 + }, + { + "epoch": 4.222448149654331, + "grad_norm": 0.07732311308881161, + "learning_rate": 1.5778313929485147e-05, + "loss": 0.0013, + "step": 10383 + }, + { + "epoch": 4.222854819032127, + "grad_norm": 1.2710428896789892, + "learning_rate": 1.5777486980034384e-05, + "loss": 0.0232, + "step": 10384 + }, + { + "epoch": 4.223261488409923, + "grad_norm": 0.7159622801975577, + "learning_rate": 1.5776659971274814e-05, + "loss": 0.0132, + "step": 10385 + }, + { + "epoch": 4.223668157787719, + "grad_norm": 2.997230890418626, + "learning_rate": 1.5775832903214918e-05, + "loss": 0.0662, + "step": 10386 + }, + { + "epoch": 4.224074827165515, + "grad_norm": 0.4028347844242888, + "learning_rate": 1.577500577586319e-05, + "loss": 0.0142, + "step": 10387 + }, + { + "epoch": 4.22448149654331, + "grad_norm": 6.399319968658966, + "learning_rate": 1.5774178589228128e-05, + "loss": 0.1668, + "step": 10388 + }, + { + "epoch": 4.224888165921106, + "grad_norm": 9.689261944934353, + "learning_rate": 1.5773351343318208e-05, + "loss": 0.3238, + "step": 10389 + }, + { + "epoch": 4.225294835298902, + "grad_norm": 9.482086512344543, + "learning_rate": 1.5772524038141935e-05, + "loss": 0.5933, + "step": 10390 + }, + { + "epoch": 4.225701504676698, + "grad_norm": 14.442378776230397, + "learning_rate": 1.5771696673707796e-05, + "loss": 0.7504, + "step": 10391 + }, + { + "epoch": 4.2261081740544935, + "grad_norm": 6.919229474420857, + "learning_rate": 1.5770869250024285e-05, + "loss": 0.2403, + "step": 10392 + }, + { + "epoch": 4.226514843432289, + "grad_norm": 6.666446199379, + "learning_rate": 1.57700417670999e-05, + "loss": 0.1729, + "step": 10393 + }, + { + "epoch": 4.226921512810085, + "grad_norm": 5.106769754269291, + "learning_rate": 1.576921422494313e-05, + "loss": 0.1803, + "step": 10394 + }, + { + "epoch": 4.227328182187882, + "grad_norm": 0.5702333291643601, + "learning_rate": 1.5768386623562472e-05, + "loss": 0.0081, + "step": 10395 + }, + { + "epoch": 4.227734851565677, + "grad_norm": 10.304049851020448, + "learning_rate": 1.5767558962966424e-05, + "loss": 0.3659, + "step": 10396 + }, + { + "epoch": 4.228141520943473, + "grad_norm": 7.217585273078888, + "learning_rate": 1.5766731243163482e-05, + "loss": 0.4108, + "step": 10397 + }, + { + "epoch": 4.228548190321269, + "grad_norm": 1.330527976819756, + "learning_rate": 1.5765903464162138e-05, + "loss": 0.0208, + "step": 10398 + }, + { + "epoch": 4.228954859699065, + "grad_norm": 6.2007795987370935, + "learning_rate": 1.5765075625970895e-05, + "loss": 0.1431, + "step": 10399 + }, + { + "epoch": 4.2293615290768605, + "grad_norm": 0.6312654003367438, + "learning_rate": 1.5764247728598245e-05, + "loss": 0.0064, + "step": 10400 + }, + { + "epoch": 4.229768198454656, + "grad_norm": 4.5440467420253965, + "learning_rate": 1.5763419772052697e-05, + "loss": 0.2125, + "step": 10401 + }, + { + "epoch": 4.230174867832452, + "grad_norm": 0.9366659237305706, + "learning_rate": 1.576259175634274e-05, + "loss": 0.0087, + "step": 10402 + }, + { + "epoch": 4.230581537210248, + 
"grad_norm": 5.806460779584179, + "learning_rate": 1.5761763681476882e-05, + "loss": 0.1933, + "step": 10403 + }, + { + "epoch": 4.2309882065880435, + "grad_norm": 9.865381017054617, + "learning_rate": 1.5760935547463617e-05, + "loss": 0.3463, + "step": 10404 + }, + { + "epoch": 4.23139487596584, + "grad_norm": 9.562763769900043, + "learning_rate": 1.576010735431145e-05, + "loss": 0.3038, + "step": 10405 + }, + { + "epoch": 4.231801545343636, + "grad_norm": 0.01914243688207192, + "learning_rate": 1.5759279102028885e-05, + "loss": 0.0004, + "step": 10406 + }, + { + "epoch": 4.232208214721432, + "grad_norm": 8.211028157086695, + "learning_rate": 1.5758450790624417e-05, + "loss": 0.3527, + "step": 10407 + }, + { + "epoch": 4.2326148840992275, + "grad_norm": 1.0705157840353139, + "learning_rate": 1.5757622420106556e-05, + "loss": 0.0135, + "step": 10408 + }, + { + "epoch": 4.233021553477023, + "grad_norm": 14.334638184854306, + "learning_rate": 1.5756793990483803e-05, + "loss": 0.4032, + "step": 10409 + }, + { + "epoch": 4.233428222854819, + "grad_norm": 4.805597741028479, + "learning_rate": 1.575596550176466e-05, + "loss": 0.2022, + "step": 10410 + }, + { + "epoch": 4.233834892232615, + "grad_norm": 4.464585400494239, + "learning_rate": 1.5755136953957636e-05, + "loss": 0.0938, + "step": 10411 + }, + { + "epoch": 4.2342415616104105, + "grad_norm": 10.020874129294175, + "learning_rate": 1.5754308347071233e-05, + "loss": 0.4356, + "step": 10412 + }, + { + "epoch": 4.234648230988206, + "grad_norm": 3.01538824267839, + "learning_rate": 1.5753479681113962e-05, + "loss": 0.1771, + "step": 10413 + }, + { + "epoch": 4.235054900366002, + "grad_norm": 1.6022787925753235, + "learning_rate": 1.5752650956094323e-05, + "loss": 0.0253, + "step": 10414 + }, + { + "epoch": 4.235461569743798, + "grad_norm": 2.815342659982956, + "learning_rate": 1.575182217202083e-05, + "loss": 0.0565, + "step": 10415 + }, + { + "epoch": 4.2358682391215945, + "grad_norm": 4.513368092658365, + "learning_rate": 1.5750993328901985e-05, + "loss": 0.0859, + "step": 10416 + }, + { + "epoch": 4.23627490849939, + "grad_norm": 2.38187644673508, + "learning_rate": 1.57501644267463e-05, + "loss": 0.1051, + "step": 10417 + }, + { + "epoch": 4.236681577877186, + "grad_norm": 3.584899171869904, + "learning_rate": 1.5749335465562285e-05, + "loss": 0.1709, + "step": 10418 + }, + { + "epoch": 4.237088247254982, + "grad_norm": 0.9536641541206603, + "learning_rate": 1.5748506445358444e-05, + "loss": 0.013, + "step": 10419 + }, + { + "epoch": 4.2374949166327776, + "grad_norm": 5.891621140553107, + "learning_rate": 1.5747677366143295e-05, + "loss": 0.1206, + "step": 10420 + }, + { + "epoch": 4.237901586010573, + "grad_norm": 6.964760607810459, + "learning_rate": 1.5746848227925342e-05, + "loss": 0.3492, + "step": 10421 + }, + { + "epoch": 4.238308255388369, + "grad_norm": 4.148138775908956, + "learning_rate": 1.5746019030713103e-05, + "loss": 0.1145, + "step": 10422 + }, + { + "epoch": 4.238714924766165, + "grad_norm": 0.40377098031982106, + "learning_rate": 1.574518977451508e-05, + "loss": 0.0032, + "step": 10423 + }, + { + "epoch": 4.239121594143961, + "grad_norm": 7.294387862500528, + "learning_rate": 1.57443604593398e-05, + "loss": 0.2021, + "step": 10424 + }, + { + "epoch": 4.239528263521757, + "grad_norm": 3.9518933855755587, + "learning_rate": 1.5743531085195767e-05, + "loss": 0.16, + "step": 10425 + }, + { + "epoch": 4.239934932899553, + "grad_norm": 13.003933387148207, + "learning_rate": 1.5742701652091493e-05, + "loss": 0.4182, + "step": 
10426 + }, + { + "epoch": 4.240341602277349, + "grad_norm": 0.7921512225423422, + "learning_rate": 1.57418721600355e-05, + "loss": 0.0118, + "step": 10427 + }, + { + "epoch": 4.240748271655145, + "grad_norm": 7.38004550947579, + "learning_rate": 1.57410426090363e-05, + "loss": 0.2647, + "step": 10428 + }, + { + "epoch": 4.24115494103294, + "grad_norm": 0.2216718803602644, + "learning_rate": 1.5740212999102404e-05, + "loss": 0.005, + "step": 10429 + }, + { + "epoch": 4.241561610410736, + "grad_norm": 11.347268431794255, + "learning_rate": 1.5739383330242338e-05, + "loss": 0.4221, + "step": 10430 + }, + { + "epoch": 4.241968279788532, + "grad_norm": 5.334342713508321, + "learning_rate": 1.5738553602464608e-05, + "loss": 0.1084, + "step": 10431 + }, + { + "epoch": 4.242374949166328, + "grad_norm": 3.569617724262522, + "learning_rate": 1.5737723815777743e-05, + "loss": 0.06, + "step": 10432 + }, + { + "epoch": 4.242781618544123, + "grad_norm": 1.502549044702233, + "learning_rate": 1.573689397019025e-05, + "loss": 0.0158, + "step": 10433 + }, + { + "epoch": 4.243188287921919, + "grad_norm": 6.162044999248074, + "learning_rate": 1.5736064065710656e-05, + "loss": 0.1142, + "step": 10434 + }, + { + "epoch": 4.243594957299715, + "grad_norm": 6.963398765899166, + "learning_rate": 1.5735234102347477e-05, + "loss": 0.1676, + "step": 10435 + }, + { + "epoch": 4.244001626677512, + "grad_norm": 5.895899704057396, + "learning_rate": 1.5734404080109233e-05, + "loss": 0.1139, + "step": 10436 + }, + { + "epoch": 4.244408296055307, + "grad_norm": 4.052242446960732, + "learning_rate": 1.5733573999004445e-05, + "loss": 0.0627, + "step": 10437 + }, + { + "epoch": 4.244814965433103, + "grad_norm": 8.451305025685844, + "learning_rate": 1.5732743859041635e-05, + "loss": 0.2362, + "step": 10438 + }, + { + "epoch": 4.245221634810899, + "grad_norm": 9.268248128162057, + "learning_rate": 1.573191366022932e-05, + "loss": 0.3537, + "step": 10439 + }, + { + "epoch": 4.245628304188695, + "grad_norm": 15.350426767163366, + "learning_rate": 1.573108340257603e-05, + "loss": 0.9619, + "step": 10440 + }, + { + "epoch": 4.24603497356649, + "grad_norm": 13.877933296308907, + "learning_rate": 1.5730253086090286e-05, + "loss": 0.6555, + "step": 10441 + }, + { + "epoch": 4.246441642944286, + "grad_norm": 8.468866451224228, + "learning_rate": 1.572942271078061e-05, + "loss": 0.3847, + "step": 10442 + }, + { + "epoch": 4.246848312322082, + "grad_norm": 5.809286656238291, + "learning_rate": 1.5728592276655525e-05, + "loss": 0.1549, + "step": 10443 + }, + { + "epoch": 4.247254981699878, + "grad_norm": 2.680718037523531, + "learning_rate": 1.5727761783723555e-05, + "loss": 0.046, + "step": 10444 + }, + { + "epoch": 4.2476616510776735, + "grad_norm": 15.97865959734904, + "learning_rate": 1.572693123199323e-05, + "loss": 0.4847, + "step": 10445 + }, + { + "epoch": 4.24806832045547, + "grad_norm": 1.5030113381458141, + "learning_rate": 1.5726100621473076e-05, + "loss": 0.0195, + "step": 10446 + }, + { + "epoch": 4.248474989833266, + "grad_norm": 0.49794021612930756, + "learning_rate": 1.5725269952171612e-05, + "loss": 0.0102, + "step": 10447 + }, + { + "epoch": 4.248881659211062, + "grad_norm": 4.076147273429927, + "learning_rate": 1.5724439224097374e-05, + "loss": 0.0997, + "step": 10448 + }, + { + "epoch": 4.249288328588857, + "grad_norm": 1.5477743134792041, + "learning_rate": 1.5723608437258883e-05, + "loss": 0.0239, + "step": 10449 + }, + { + "epoch": 4.249694997966653, + "grad_norm": 3.8321017674788043, + "learning_rate": 
1.5722777591664672e-05, + "loss": 0.0948, + "step": 10450 + }, + { + "epoch": 4.250101667344449, + "grad_norm": 13.347887897700161, + "learning_rate": 1.572194668732327e-05, + "loss": 0.4178, + "step": 10451 + }, + { + "epoch": 4.250508336722245, + "grad_norm": 3.1508820866037133, + "learning_rate": 1.5721115724243204e-05, + "loss": 0.0482, + "step": 10452 + }, + { + "epoch": 4.2509150061000405, + "grad_norm": 5.1509540393202435, + "learning_rate": 1.5720284702433007e-05, + "loss": 0.1632, + "step": 10453 + }, + { + "epoch": 4.251321675477836, + "grad_norm": 3.8435511051827014, + "learning_rate": 1.571945362190121e-05, + "loss": 0.0185, + "step": 10454 + }, + { + "epoch": 4.251728344855632, + "grad_norm": 4.422498448145672, + "learning_rate": 1.5718622482656344e-05, + "loss": 0.1706, + "step": 10455 + }, + { + "epoch": 4.252135014233428, + "grad_norm": 5.526989280763533, + "learning_rate": 1.5717791284706936e-05, + "loss": 0.1229, + "step": 10456 + }, + { + "epoch": 4.252541683611224, + "grad_norm": 10.965023433152796, + "learning_rate": 1.5716960028061527e-05, + "loss": 0.4376, + "step": 10457 + }, + { + "epoch": 4.25294835298902, + "grad_norm": 11.760054744026313, + "learning_rate": 1.5716128712728643e-05, + "loss": 0.1603, + "step": 10458 + }, + { + "epoch": 4.253355022366816, + "grad_norm": 9.133662952501583, + "learning_rate": 1.571529733871682e-05, + "loss": 0.1991, + "step": 10459 + }, + { + "epoch": 4.253761691744612, + "grad_norm": 12.300837147483643, + "learning_rate": 1.5714465906034594e-05, + "loss": 0.5259, + "step": 10460 + }, + { + "epoch": 4.2541683611224075, + "grad_norm": 0.6511128047352391, + "learning_rate": 1.5713634414690503e-05, + "loss": 0.0106, + "step": 10461 + }, + { + "epoch": 4.254575030500203, + "grad_norm": 9.763114453905834, + "learning_rate": 1.5712802864693074e-05, + "loss": 0.5295, + "step": 10462 + }, + { + "epoch": 4.254981699877999, + "grad_norm": 8.717998391420645, + "learning_rate": 1.5711971256050852e-05, + "loss": 0.2151, + "step": 10463 + }, + { + "epoch": 4.255388369255795, + "grad_norm": 3.205018719939733, + "learning_rate": 1.5711139588772367e-05, + "loss": 0.0649, + "step": 10464 + }, + { + "epoch": 4.2557950386335905, + "grad_norm": 0.7365956815509311, + "learning_rate": 1.571030786286616e-05, + "loss": 0.0116, + "step": 10465 + }, + { + "epoch": 4.256201708011387, + "grad_norm": 6.721060229864988, + "learning_rate": 1.5709476078340773e-05, + "loss": 0.1385, + "step": 10466 + }, + { + "epoch": 4.256608377389183, + "grad_norm": 9.216699285600873, + "learning_rate": 1.5708644235204737e-05, + "loss": 0.381, + "step": 10467 + }, + { + "epoch": 4.257015046766979, + "grad_norm": 9.950576475223805, + "learning_rate": 1.5707812333466597e-05, + "loss": 0.4096, + "step": 10468 + }, + { + "epoch": 4.2574217161447745, + "grad_norm": 3.7695360183222415, + "learning_rate": 1.570698037313489e-05, + "loss": 0.0861, + "step": 10469 + }, + { + "epoch": 4.25782838552257, + "grad_norm": 0.2861714009430026, + "learning_rate": 1.5706148354218157e-05, + "loss": 0.0064, + "step": 10470 + }, + { + "epoch": 4.258235054900366, + "grad_norm": 9.996400714005594, + "learning_rate": 1.570531627672494e-05, + "loss": 0.3858, + "step": 10471 + }, + { + "epoch": 4.258641724278162, + "grad_norm": 0.10619540760900263, + "learning_rate": 1.5704484140663775e-05, + "loss": 0.0016, + "step": 10472 + }, + { + "epoch": 4.2590483936559576, + "grad_norm": 11.933145175547635, + "learning_rate": 1.5703651946043214e-05, + "loss": 0.6675, + "step": 10473 + }, + { + "epoch": 
4.259455063033753, + "grad_norm": 6.005219823737406, + "learning_rate": 1.5702819692871796e-05, + "loss": 0.1336, + "step": 10474 + }, + { + "epoch": 4.259861732411549, + "grad_norm": 10.530077325458837, + "learning_rate": 1.5701987381158064e-05, + "loss": 0.382, + "step": 10475 + }, + { + "epoch": 4.260268401789345, + "grad_norm": 3.0522571385631854, + "learning_rate": 1.570115501091056e-05, + "loss": 0.0809, + "step": 10476 + }, + { + "epoch": 4.2606750711671415, + "grad_norm": 11.188533221948141, + "learning_rate": 1.570032258213783e-05, + "loss": 0.5375, + "step": 10477 + }, + { + "epoch": 4.261081740544937, + "grad_norm": 0.22851280910894617, + "learning_rate": 1.569949009484842e-05, + "loss": 0.0069, + "step": 10478 + }, + { + "epoch": 4.261488409922733, + "grad_norm": 8.734168762815218, + "learning_rate": 1.5698657549050875e-05, + "loss": 0.2176, + "step": 10479 + }, + { + "epoch": 4.261895079300529, + "grad_norm": 1.1166295584767514, + "learning_rate": 1.5697824944753742e-05, + "loss": 0.0124, + "step": 10480 + }, + { + "epoch": 4.262301748678325, + "grad_norm": 5.188909887247351, + "learning_rate": 1.569699228196557e-05, + "loss": 0.1596, + "step": 10481 + }, + { + "epoch": 4.26270841805612, + "grad_norm": 2.556274096026828, + "learning_rate": 1.5696159560694905e-05, + "loss": 0.0522, + "step": 10482 + }, + { + "epoch": 4.263115087433916, + "grad_norm": 13.448092818183792, + "learning_rate": 1.5695326780950296e-05, + "loss": 0.6119, + "step": 10483 + }, + { + "epoch": 4.263521756811712, + "grad_norm": 6.196177276207187, + "learning_rate": 1.569449394274029e-05, + "loss": 0.111, + "step": 10484 + }, + { + "epoch": 4.263928426189508, + "grad_norm": 6.900657874668741, + "learning_rate": 1.5693661046073437e-05, + "loss": 0.1963, + "step": 10485 + }, + { + "epoch": 4.264335095567303, + "grad_norm": 5.027993228984884, + "learning_rate": 1.5692828090958287e-05, + "loss": 0.119, + "step": 10486 + }, + { + "epoch": 4.2647417649451, + "grad_norm": 1.0521081548657134, + "learning_rate": 1.5691995077403393e-05, + "loss": 0.0199, + "step": 10487 + }, + { + "epoch": 4.265148434322896, + "grad_norm": 5.24024612783201, + "learning_rate": 1.5691162005417307e-05, + "loss": 0.112, + "step": 10488 + }, + { + "epoch": 4.265555103700692, + "grad_norm": 6.732432198509644, + "learning_rate": 1.5690328875008573e-05, + "loss": 0.2358, + "step": 10489 + }, + { + "epoch": 4.265961773078487, + "grad_norm": 0.9358452042416473, + "learning_rate": 1.5689495686185754e-05, + "loss": 0.0114, + "step": 10490 + }, + { + "epoch": 4.266368442456283, + "grad_norm": 8.181937651138464, + "learning_rate": 1.5688662438957397e-05, + "loss": 0.3269, + "step": 10491 + }, + { + "epoch": 4.266775111834079, + "grad_norm": 3.192688731873292, + "learning_rate": 1.5687829133332058e-05, + "loss": 0.0543, + "step": 10492 + }, + { + "epoch": 4.267181781211875, + "grad_norm": 1.963057643423699, + "learning_rate": 1.5686995769318288e-05, + "loss": 0.0321, + "step": 10493 + }, + { + "epoch": 4.26758845058967, + "grad_norm": 9.436144677201435, + "learning_rate": 1.5686162346924646e-05, + "loss": 0.7005, + "step": 10494 + }, + { + "epoch": 4.267995119967466, + "grad_norm": 8.799466473226019, + "learning_rate": 1.5685328866159687e-05, + "loss": 0.4946, + "step": 10495 + }, + { + "epoch": 4.268401789345262, + "grad_norm": 6.940319905460152, + "learning_rate": 1.5684495327031962e-05, + "loss": 0.3963, + "step": 10496 + }, + { + "epoch": 4.268808458723058, + "grad_norm": 8.750720330799298, + "learning_rate": 1.5683661729550033e-05, + "loss": 
0.2812, + "step": 10497 + }, + { + "epoch": 4.269215128100854, + "grad_norm": 11.748426891896806, + "learning_rate": 1.5682828073722456e-05, + "loss": 0.4593, + "step": 10498 + }, + { + "epoch": 4.26962179747865, + "grad_norm": 6.170428636388446, + "learning_rate": 1.5681994359557792e-05, + "loss": 0.1489, + "step": 10499 + }, + { + "epoch": 4.270028466856446, + "grad_norm": 7.786884441470151, + "learning_rate": 1.5681160587064592e-05, + "loss": 0.3262, + "step": 10500 + }, + { + "epoch": 4.270435136234242, + "grad_norm": 5.302725982132482, + "learning_rate": 1.568032675625142e-05, + "loss": 0.0649, + "step": 10501 + }, + { + "epoch": 4.270841805612037, + "grad_norm": 5.964775426916128, + "learning_rate": 1.5679492867126835e-05, + "loss": 0.2399, + "step": 10502 + }, + { + "epoch": 4.271248474989833, + "grad_norm": 3.2000647358396583, + "learning_rate": 1.56786589196994e-05, + "loss": 0.1761, + "step": 10503 + }, + { + "epoch": 4.271655144367629, + "grad_norm": 2.836635279814822, + "learning_rate": 1.567782491397767e-05, + "loss": 0.0414, + "step": 10504 + }, + { + "epoch": 4.272061813745425, + "grad_norm": 35.53556499977856, + "learning_rate": 1.5676990849970212e-05, + "loss": 0.6049, + "step": 10505 + }, + { + "epoch": 4.2724684831232205, + "grad_norm": 10.561072731189322, + "learning_rate": 1.5676156727685582e-05, + "loss": 0.3718, + "step": 10506 + }, + { + "epoch": 4.272875152501017, + "grad_norm": 6.0006406181565906, + "learning_rate": 1.5675322547132353e-05, + "loss": 0.1508, + "step": 10507 + }, + { + "epoch": 4.273281821878813, + "grad_norm": 4.035527672566018, + "learning_rate": 1.5674488308319076e-05, + "loss": 0.0744, + "step": 10508 + }, + { + "epoch": 4.273688491256609, + "grad_norm": 8.268719561018752, + "learning_rate": 1.5673654011254323e-05, + "loss": 0.3239, + "step": 10509 + }, + { + "epoch": 4.274095160634404, + "grad_norm": 5.9361586541027025, + "learning_rate": 1.5672819655946657e-05, + "loss": 0.143, + "step": 10510 + }, + { + "epoch": 4.2745018300122, + "grad_norm": 11.273926328531578, + "learning_rate": 1.567198524240464e-05, + "loss": 0.5195, + "step": 10511 + }, + { + "epoch": 4.274908499389996, + "grad_norm": 1.749428902465726, + "learning_rate": 1.5671150770636837e-05, + "loss": 0.0241, + "step": 10512 + }, + { + "epoch": 4.275315168767792, + "grad_norm": 5.999289610122962, + "learning_rate": 1.5670316240651823e-05, + "loss": 0.0534, + "step": 10513 + }, + { + "epoch": 4.2757218381455875, + "grad_norm": 0.8107713228439916, + "learning_rate": 1.5669481652458157e-05, + "loss": 0.0131, + "step": 10514 + }, + { + "epoch": 4.276128507523383, + "grad_norm": 6.099410380645411, + "learning_rate": 1.5668647006064407e-05, + "loss": 0.237, + "step": 10515 + }, + { + "epoch": 4.276535176901179, + "grad_norm": 5.987274556445031, + "learning_rate": 1.5667812301479143e-05, + "loss": 0.2579, + "step": 10516 + }, + { + "epoch": 4.276941846278975, + "grad_norm": 7.040470648711305, + "learning_rate": 1.5666977538710933e-05, + "loss": 0.2331, + "step": 10517 + }, + { + "epoch": 4.277348515656771, + "grad_norm": 13.211058011794004, + "learning_rate": 1.5666142717768347e-05, + "loss": 0.5625, + "step": 10518 + }, + { + "epoch": 4.277755185034567, + "grad_norm": 6.324621032183765, + "learning_rate": 1.5665307838659955e-05, + "loss": 0.1933, + "step": 10519 + }, + { + "epoch": 4.278161854412363, + "grad_norm": 3.898977678164958, + "learning_rate": 1.5664472901394328e-05, + "loss": 0.1688, + "step": 10520 + }, + { + "epoch": 4.278568523790159, + "grad_norm": 1.9694333178008983, + 
"learning_rate": 1.566363790598003e-05, + "loss": 0.037, + "step": 10521 + }, + { + "epoch": 4.2789751931679545, + "grad_norm": 5.669352912254823, + "learning_rate": 1.5662802852425644e-05, + "loss": 0.1886, + "step": 10522 + }, + { + "epoch": 4.27938186254575, + "grad_norm": 1.329740660366631, + "learning_rate": 1.5661967740739738e-05, + "loss": 0.0314, + "step": 10523 + }, + { + "epoch": 4.279788531923546, + "grad_norm": 5.176309266108585, + "learning_rate": 1.5661132570930878e-05, + "loss": 0.1684, + "step": 10524 + }, + { + "epoch": 4.280195201301342, + "grad_norm": 9.007524073107149, + "learning_rate": 1.5660297343007647e-05, + "loss": 0.4868, + "step": 10525 + }, + { + "epoch": 4.2806018706791376, + "grad_norm": 8.75327849448568, + "learning_rate": 1.5659462056978613e-05, + "loss": 0.2571, + "step": 10526 + }, + { + "epoch": 4.281008540056933, + "grad_norm": 1.1083956721101726, + "learning_rate": 1.565862671285235e-05, + "loss": 0.0158, + "step": 10527 + }, + { + "epoch": 4.28141520943473, + "grad_norm": 11.634439377919767, + "learning_rate": 1.565779131063744e-05, + "loss": 0.6469, + "step": 10528 + }, + { + "epoch": 4.281821878812526, + "grad_norm": 13.31906608604933, + "learning_rate": 1.5656955850342454e-05, + "loss": 0.7058, + "step": 10529 + }, + { + "epoch": 4.2822285481903215, + "grad_norm": 5.674958641860205, + "learning_rate": 1.5656120331975972e-05, + "loss": 0.1173, + "step": 10530 + }, + { + "epoch": 4.282635217568117, + "grad_norm": 2.6542154359113295, + "learning_rate": 1.5655284755546564e-05, + "loss": 0.028, + "step": 10531 + }, + { + "epoch": 4.283041886945913, + "grad_norm": 0.25399266164745665, + "learning_rate": 1.5654449121062812e-05, + "loss": 0.0037, + "step": 10532 + }, + { + "epoch": 4.283448556323709, + "grad_norm": 0.5077684144803456, + "learning_rate": 1.5653613428533295e-05, + "loss": 0.0072, + "step": 10533 + }, + { + "epoch": 4.283855225701505, + "grad_norm": 5.618537080144728, + "learning_rate": 1.5652777677966588e-05, + "loss": 0.1819, + "step": 10534 + }, + { + "epoch": 4.2842618950793, + "grad_norm": 1.15865592765585, + "learning_rate": 1.5651941869371277e-05, + "loss": 0.0201, + "step": 10535 + }, + { + "epoch": 4.284668564457096, + "grad_norm": 0.5562899243168266, + "learning_rate": 1.5651106002755937e-05, + "loss": 0.0096, + "step": 10536 + }, + { + "epoch": 4.285075233834892, + "grad_norm": 12.270125746632543, + "learning_rate": 1.565027007812915e-05, + "loss": 0.6271, + "step": 10537 + }, + { + "epoch": 4.285481903212688, + "grad_norm": 1.4558778909092265, + "learning_rate": 1.5649434095499495e-05, + "loss": 0.0222, + "step": 10538 + }, + { + "epoch": 4.285888572590484, + "grad_norm": 6.667576556943351, + "learning_rate": 1.5648598054875557e-05, + "loss": 0.1535, + "step": 10539 + }, + { + "epoch": 4.28629524196828, + "grad_norm": 3.856651632873685, + "learning_rate": 1.5647761956265916e-05, + "loss": 0.0629, + "step": 10540 + }, + { + "epoch": 4.286701911346076, + "grad_norm": 8.912718827028907, + "learning_rate": 1.564692579967916e-05, + "loss": 0.493, + "step": 10541 + }, + { + "epoch": 4.287108580723872, + "grad_norm": 1.3023785991073666, + "learning_rate": 1.5646089585123864e-05, + "loss": 0.0201, + "step": 10542 + }, + { + "epoch": 4.287515250101667, + "grad_norm": 4.5313955405797355, + "learning_rate": 1.5645253312608618e-05, + "loss": 0.1007, + "step": 10543 + }, + { + "epoch": 4.287921919479463, + "grad_norm": 12.552824488367275, + "learning_rate": 1.5644416982142006e-05, + "loss": 0.5973, + "step": 10544 + }, + { + "epoch": 
4.288328588857259, + "grad_norm": 6.6741632540536795, + "learning_rate": 1.5643580593732616e-05, + "loss": 0.4356, + "step": 10545 + }, + { + "epoch": 4.288735258235055, + "grad_norm": 0.08671344208185174, + "learning_rate": 1.5642744147389027e-05, + "loss": 0.0018, + "step": 10546 + }, + { + "epoch": 4.28914192761285, + "grad_norm": 0.4761487973161973, + "learning_rate": 1.564190764311983e-05, + "loss": 0.0102, + "step": 10547 + }, + { + "epoch": 4.289548596990647, + "grad_norm": 7.6057759128748055, + "learning_rate": 1.564107108093361e-05, + "loss": 0.376, + "step": 10548 + }, + { + "epoch": 4.289955266368443, + "grad_norm": 9.768542583012463, + "learning_rate": 1.564023446083896e-05, + "loss": 0.488, + "step": 10549 + }, + { + "epoch": 4.290361935746239, + "grad_norm": 3.593634347505954, + "learning_rate": 1.5639397782844462e-05, + "loss": 0.0719, + "step": 10550 + }, + { + "epoch": 4.290768605124034, + "grad_norm": 2.073176275576244, + "learning_rate": 1.563856104695871e-05, + "loss": 0.0321, + "step": 10551 + }, + { + "epoch": 4.29117527450183, + "grad_norm": 4.8060357075443685, + "learning_rate": 1.5637724253190288e-05, + "loss": 0.1054, + "step": 10552 + }, + { + "epoch": 4.291581943879626, + "grad_norm": 5.891181942940385, + "learning_rate": 1.563688740154779e-05, + "loss": 0.258, + "step": 10553 + }, + { + "epoch": 4.291988613257422, + "grad_norm": 10.62449769873338, + "learning_rate": 1.5636050492039807e-05, + "loss": 0.4369, + "step": 10554 + }, + { + "epoch": 4.292395282635217, + "grad_norm": 3.5292928720032233, + "learning_rate": 1.563521352467493e-05, + "loss": 0.1097, + "step": 10555 + }, + { + "epoch": 4.292801952013013, + "grad_norm": 7.60608271855695, + "learning_rate": 1.5634376499461747e-05, + "loss": 0.2552, + "step": 10556 + }, + { + "epoch": 4.293208621390809, + "grad_norm": 3.786130004969526, + "learning_rate": 1.5633539416408856e-05, + "loss": 0.0579, + "step": 10557 + }, + { + "epoch": 4.293615290768605, + "grad_norm": 10.94636801332917, + "learning_rate": 1.5632702275524848e-05, + "loss": 0.3335, + "step": 10558 + }, + { + "epoch": 4.294021960146401, + "grad_norm": 3.4340142648547296, + "learning_rate": 1.5631865076818312e-05, + "loss": 0.0564, + "step": 10559 + }, + { + "epoch": 4.294428629524197, + "grad_norm": 5.398250980129194, + "learning_rate": 1.563102782029785e-05, + "loss": 0.0959, + "step": 10560 + }, + { + "epoch": 4.294835298901993, + "grad_norm": 4.587177441658308, + "learning_rate": 1.5630190505972052e-05, + "loss": 0.185, + "step": 10561 + }, + { + "epoch": 4.295241968279789, + "grad_norm": 7.877049563040066, + "learning_rate": 1.5629353133849516e-05, + "loss": 0.2126, + "step": 10562 + }, + { + "epoch": 4.295648637657584, + "grad_norm": 5.369100701816614, + "learning_rate": 1.5628515703938835e-05, + "loss": 0.1649, + "step": 10563 + }, + { + "epoch": 4.29605530703538, + "grad_norm": 10.983397588037063, + "learning_rate": 1.5627678216248612e-05, + "loss": 0.3141, + "step": 10564 + }, + { + "epoch": 4.296461976413176, + "grad_norm": 2.744277300349731, + "learning_rate": 1.5626840670787434e-05, + "loss": 0.033, + "step": 10565 + }, + { + "epoch": 4.296868645790972, + "grad_norm": 0.2296923335765791, + "learning_rate": 1.562600306756391e-05, + "loss": 0.0049, + "step": 10566 + }, + { + "epoch": 4.2972753151687675, + "grad_norm": 10.578224108413421, + "learning_rate": 1.5625165406586628e-05, + "loss": 0.0929, + "step": 10567 + }, + { + "epoch": 4.297681984546563, + "grad_norm": 6.2358895699366075, + "learning_rate": 1.5624327687864197e-05, + "loss": 
0.1619, + "step": 10568 + }, + { + "epoch": 4.29808865392436, + "grad_norm": 0.15760271639399287, + "learning_rate": 1.562348991140521e-05, + "loss": 0.0032, + "step": 10569 + }, + { + "epoch": 4.298495323302156, + "grad_norm": 8.789397973757756, + "learning_rate": 1.5622652077218265e-05, + "loss": 0.3422, + "step": 10570 + }, + { + "epoch": 4.298901992679951, + "grad_norm": 9.090689624884178, + "learning_rate": 1.562181418531197e-05, + "loss": 0.3261, + "step": 10571 + }, + { + "epoch": 4.299308662057747, + "grad_norm": 12.313559664926036, + "learning_rate": 1.5620976235694923e-05, + "loss": 0.5068, + "step": 10572 + }, + { + "epoch": 4.299715331435543, + "grad_norm": 8.579361718547862, + "learning_rate": 1.5620138228375723e-05, + "loss": 0.4011, + "step": 10573 + }, + { + "epoch": 4.300122000813339, + "grad_norm": 5.474164379853153, + "learning_rate": 1.5619300163362977e-05, + "loss": 0.146, + "step": 10574 + }, + { + "epoch": 4.3005286701911345, + "grad_norm": 5.0898021734176115, + "learning_rate": 1.561846204066529e-05, + "loss": 0.1121, + "step": 10575 + }, + { + "epoch": 4.30093533956893, + "grad_norm": 1.3456222840967398, + "learning_rate": 1.561762386029126e-05, + "loss": 0.0232, + "step": 10576 + }, + { + "epoch": 4.301342008946726, + "grad_norm": 11.708066047500083, + "learning_rate": 1.5616785622249493e-05, + "loss": 0.4469, + "step": 10577 + }, + { + "epoch": 4.301748678324522, + "grad_norm": 7.910114814037538, + "learning_rate": 1.5615947326548595e-05, + "loss": 0.2297, + "step": 10578 + }, + { + "epoch": 4.302155347702318, + "grad_norm": 0.4521614567897959, + "learning_rate": 1.561510897319717e-05, + "loss": 0.006, + "step": 10579 + }, + { + "epoch": 4.302562017080114, + "grad_norm": 8.277156044041018, + "learning_rate": 1.561427056220383e-05, + "loss": 0.2855, + "step": 10580 + }, + { + "epoch": 4.30296868645791, + "grad_norm": 9.989654185581175, + "learning_rate": 1.5613432093577174e-05, + "loss": 0.0805, + "step": 10581 + }, + { + "epoch": 4.303375355835706, + "grad_norm": 11.61638889080228, + "learning_rate": 1.561259356732581e-05, + "loss": 0.3437, + "step": 10582 + }, + { + "epoch": 4.3037820252135015, + "grad_norm": 8.240045452303063, + "learning_rate": 1.561175498345835e-05, + "loss": 0.2987, + "step": 10583 + }, + { + "epoch": 4.304188694591297, + "grad_norm": 10.456812507527477, + "learning_rate": 1.5610916341983403e-05, + "loss": 0.6339, + "step": 10584 + }, + { + "epoch": 4.304595363969093, + "grad_norm": 0.7239612399657116, + "learning_rate": 1.5610077642909573e-05, + "loss": 0.0119, + "step": 10585 + }, + { + "epoch": 4.305002033346889, + "grad_norm": 0.10668983141125571, + "learning_rate": 1.5609238886245477e-05, + "loss": 0.0014, + "step": 10586 + }, + { + "epoch": 4.305408702724685, + "grad_norm": 8.542521300958262, + "learning_rate": 1.5608400071999716e-05, + "loss": 0.3949, + "step": 10587 + }, + { + "epoch": 4.30581537210248, + "grad_norm": 2.526109738258173, + "learning_rate": 1.560756120018091e-05, + "loss": 0.0516, + "step": 10588 + }, + { + "epoch": 4.306222041480277, + "grad_norm": 10.493700854069644, + "learning_rate": 1.560672227079766e-05, + "loss": 0.2373, + "step": 10589 + }, + { + "epoch": 4.306628710858073, + "grad_norm": 5.988195569557027, + "learning_rate": 1.560588328385859e-05, + "loss": 0.3848, + "step": 10590 + }, + { + "epoch": 4.3070353802358685, + "grad_norm": 2.580174432661679, + "learning_rate": 1.56050442393723e-05, + "loss": 0.067, + "step": 10591 + }, + { + "epoch": 4.307442049613664, + "grad_norm": 4.691855675485521, + 
"learning_rate": 1.5604205137347418e-05, + "loss": 0.112, + "step": 10592 + }, + { + "epoch": 4.30784871899146, + "grad_norm": 13.01121067635133, + "learning_rate": 1.5603365977792545e-05, + "loss": 0.3927, + "step": 10593 + }, + { + "epoch": 4.308255388369256, + "grad_norm": 0.29908440395581326, + "learning_rate": 1.56025267607163e-05, + "loss": 0.004, + "step": 10594 + }, + { + "epoch": 4.308662057747052, + "grad_norm": 0.43314845321734785, + "learning_rate": 1.5601687486127297e-05, + "loss": 0.007, + "step": 10595 + }, + { + "epoch": 4.309068727124847, + "grad_norm": 4.005904649136195, + "learning_rate": 1.5600848154034157e-05, + "loss": 0.0661, + "step": 10596 + }, + { + "epoch": 4.309475396502643, + "grad_norm": 4.2398733041615895, + "learning_rate": 1.5600008764445488e-05, + "loss": 0.163, + "step": 10597 + }, + { + "epoch": 4.309882065880439, + "grad_norm": 8.078930953055098, + "learning_rate": 1.559916931736991e-05, + "loss": 0.2184, + "step": 10598 + }, + { + "epoch": 4.310288735258235, + "grad_norm": 0.11867489675614304, + "learning_rate": 1.5598329812816047e-05, + "loss": 0.0025, + "step": 10599 + }, + { + "epoch": 4.310695404636031, + "grad_norm": 9.996703564897238, + "learning_rate": 1.5597490250792504e-05, + "loss": 0.3582, + "step": 10600 + }, + { + "epoch": 4.311102074013827, + "grad_norm": 7.338513349301227, + "learning_rate": 1.559665063130791e-05, + "loss": 0.1815, + "step": 10601 + }, + { + "epoch": 4.311508743391623, + "grad_norm": 3.219397802886854, + "learning_rate": 1.559581095437088e-05, + "loss": 0.093, + "step": 10602 + }, + { + "epoch": 4.311915412769419, + "grad_norm": 5.906281796154085, + "learning_rate": 1.5594971219990035e-05, + "loss": 0.1255, + "step": 10603 + }, + { + "epoch": 4.312322082147214, + "grad_norm": 5.491216060507818, + "learning_rate": 1.5594131428173995e-05, + "loss": 0.218, + "step": 10604 + }, + { + "epoch": 4.31272875152501, + "grad_norm": 1.4888851988849365, + "learning_rate": 1.5593291578931377e-05, + "loss": 0.0328, + "step": 10605 + }, + { + "epoch": 4.313135420902806, + "grad_norm": 7.419857582286687, + "learning_rate": 1.559245167227081e-05, + "loss": 0.2404, + "step": 10606 + }, + { + "epoch": 4.313542090280602, + "grad_norm": 4.17249467262273, + "learning_rate": 1.559161170820091e-05, + "loss": 0.108, + "step": 10607 + }, + { + "epoch": 4.313948759658397, + "grad_norm": 4.51300654135736, + "learning_rate": 1.5590771686730302e-05, + "loss": 0.2097, + "step": 10608 + }, + { + "epoch": 4.314355429036193, + "grad_norm": 5.158228374377515, + "learning_rate": 1.5589931607867607e-05, + "loss": 0.1535, + "step": 10609 + }, + { + "epoch": 4.31476209841399, + "grad_norm": 2.3809259656345367, + "learning_rate": 1.558909147162145e-05, + "loss": 0.0429, + "step": 10610 + }, + { + "epoch": 4.315168767791786, + "grad_norm": 7.073016336488562, + "learning_rate": 1.5588251278000462e-05, + "loss": 0.1816, + "step": 10611 + }, + { + "epoch": 4.315575437169581, + "grad_norm": 10.00079531328908, + "learning_rate": 1.558741102701326e-05, + "loss": 0.2684, + "step": 10612 + }, + { + "epoch": 4.315982106547377, + "grad_norm": 0.9384139555320912, + "learning_rate": 1.5586570718668467e-05, + "loss": 0.0099, + "step": 10613 + }, + { + "epoch": 4.316388775925173, + "grad_norm": 1.7380869028458152, + "learning_rate": 1.5585730352974717e-05, + "loss": 0.0193, + "step": 10614 + }, + { + "epoch": 4.316795445302969, + "grad_norm": 16.90042529890056, + "learning_rate": 1.5584889929940634e-05, + "loss": 0.8573, + "step": 10615 + }, + { + "epoch": 4.317202114680764, 
+ "grad_norm": 7.046741650923045, + "learning_rate": 1.5584049449574845e-05, + "loss": 0.1648, + "step": 10616 + }, + { + "epoch": 4.31760878405856, + "grad_norm": 1.5843429816336185, + "learning_rate": 1.558320891188598e-05, + "loss": 0.0501, + "step": 10617 + }, + { + "epoch": 4.318015453436356, + "grad_norm": 0.8768246158685804, + "learning_rate": 1.5582368316882663e-05, + "loss": 0.0114, + "step": 10618 + }, + { + "epoch": 4.318422122814152, + "grad_norm": 4.285356149427166, + "learning_rate": 1.5581527664573526e-05, + "loss": 0.0714, + "step": 10619 + }, + { + "epoch": 4.3188287921919475, + "grad_norm": 8.052851302995553, + "learning_rate": 1.5580686954967198e-05, + "loss": 0.2356, + "step": 10620 + }, + { + "epoch": 4.319235461569744, + "grad_norm": 2.429982448020078, + "learning_rate": 1.5579846188072308e-05, + "loss": 0.0343, + "step": 10621 + }, + { + "epoch": 4.31964213094754, + "grad_norm": 3.0113855552108224, + "learning_rate": 1.5579005363897492e-05, + "loss": 0.0559, + "step": 10622 + }, + { + "epoch": 4.320048800325336, + "grad_norm": 2.5908418812934166, + "learning_rate": 1.5578164482451377e-05, + "loss": 0.0352, + "step": 10623 + }, + { + "epoch": 4.320455469703131, + "grad_norm": 13.895436151019332, + "learning_rate": 1.5577323543742596e-05, + "loss": 0.7464, + "step": 10624 + }, + { + "epoch": 4.320862139080927, + "grad_norm": 8.690218292016024, + "learning_rate": 1.557648254777978e-05, + "loss": 0.3817, + "step": 10625 + }, + { + "epoch": 4.321268808458723, + "grad_norm": 3.433949100920572, + "learning_rate": 1.5575641494571566e-05, + "loss": 0.0628, + "step": 10626 + }, + { + "epoch": 4.321675477836519, + "grad_norm": 1.1504664007119922, + "learning_rate": 1.5574800384126584e-05, + "loss": 0.0204, + "step": 10627 + }, + { + "epoch": 4.3220821472143145, + "grad_norm": 1.3738593689592988, + "learning_rate": 1.5573959216453474e-05, + "loss": 0.0206, + "step": 10628 + }, + { + "epoch": 4.32248881659211, + "grad_norm": 8.970016502003002, + "learning_rate": 1.5573117991560866e-05, + "loss": 0.3329, + "step": 10629 + }, + { + "epoch": 4.322895485969907, + "grad_norm": 0.5227335039966103, + "learning_rate": 1.5572276709457396e-05, + "loss": 0.0125, + "step": 10630 + }, + { + "epoch": 4.323302155347703, + "grad_norm": 5.6293379741869725, + "learning_rate": 1.55714353701517e-05, + "loss": 0.2565, + "step": 10631 + }, + { + "epoch": 4.3237088247254984, + "grad_norm": 11.830238516912, + "learning_rate": 1.557059397365242e-05, + "loss": 0.5513, + "step": 10632 + }, + { + "epoch": 4.324115494103294, + "grad_norm": 4.331231301479416, + "learning_rate": 1.5569752519968183e-05, + "loss": 0.0939, + "step": 10633 + }, + { + "epoch": 4.32452216348109, + "grad_norm": 2.2979616000517757, + "learning_rate": 1.556891100910764e-05, + "loss": 0.03, + "step": 10634 + }, + { + "epoch": 4.324928832858886, + "grad_norm": 3.024633356521989, + "learning_rate": 1.5568069441079417e-05, + "loss": 0.0672, + "step": 10635 + }, + { + "epoch": 4.3253355022366815, + "grad_norm": 12.804668875553302, + "learning_rate": 1.5567227815892163e-05, + "loss": 0.7202, + "step": 10636 + }, + { + "epoch": 4.325742171614477, + "grad_norm": 9.026328637586497, + "learning_rate": 1.556638613355451e-05, + "loss": 0.4079, + "step": 10637 + }, + { + "epoch": 4.326148840992273, + "grad_norm": 0.10740634472113303, + "learning_rate": 1.55655443940751e-05, + "loss": 0.0023, + "step": 10638 + }, + { + "epoch": 4.326555510370069, + "grad_norm": 8.182214888317239, + "learning_rate": 1.5564702597462583e-05, + "loss": 0.2615, + "step": 
10639 + }, + { + "epoch": 4.326962179747865, + "grad_norm": 7.043759444932721, + "learning_rate": 1.556386074372559e-05, + "loss": 0.4144, + "step": 10640 + }, + { + "epoch": 4.327368849125661, + "grad_norm": 0.91532349553048, + "learning_rate": 1.5563018832872765e-05, + "loss": 0.0155, + "step": 10641 + }, + { + "epoch": 4.327775518503457, + "grad_norm": 4.255741509190193, + "learning_rate": 1.556217686491275e-05, + "loss": 0.0597, + "step": 10642 + }, + { + "epoch": 4.328182187881253, + "grad_norm": 5.293532437092783, + "learning_rate": 1.5561334839854197e-05, + "loss": 0.097, + "step": 10643 + }, + { + "epoch": 4.3285888572590485, + "grad_norm": 5.4573347926112055, + "learning_rate": 1.556049275770574e-05, + "loss": 0.0803, + "step": 10644 + }, + { + "epoch": 4.328995526636844, + "grad_norm": 0.47447023635586055, + "learning_rate": 1.5559650618476024e-05, + "loss": 0.008, + "step": 10645 + }, + { + "epoch": 4.32940219601464, + "grad_norm": 8.934216674720577, + "learning_rate": 1.5558808422173695e-05, + "loss": 0.1731, + "step": 10646 + }, + { + "epoch": 4.329808865392436, + "grad_norm": 2.6525592179313326, + "learning_rate": 1.5557966168807405e-05, + "loss": 0.0376, + "step": 10647 + }, + { + "epoch": 4.330215534770232, + "grad_norm": 6.5587574111123015, + "learning_rate": 1.555712385838579e-05, + "loss": 0.1916, + "step": 10648 + }, + { + "epoch": 4.330622204148027, + "grad_norm": 3.832053580957883, + "learning_rate": 1.5556281490917504e-05, + "loss": 0.1591, + "step": 10649 + }, + { + "epoch": 4.331028873525823, + "grad_norm": 15.84929439022502, + "learning_rate": 1.555543906641119e-05, + "loss": 0.4865, + "step": 10650 + }, + { + "epoch": 4.33143554290362, + "grad_norm": 1.7347308473906273, + "learning_rate": 1.5554596584875502e-05, + "loss": 0.0116, + "step": 10651 + }, + { + "epoch": 4.3318422122814155, + "grad_norm": 22.19734390050309, + "learning_rate": 1.5553754046319083e-05, + "loss": 0.4206, + "step": 10652 + }, + { + "epoch": 4.332248881659211, + "grad_norm": 1.2596061857465783, + "learning_rate": 1.5552911450750583e-05, + "loss": 0.0123, + "step": 10653 + }, + { + "epoch": 4.332655551037007, + "grad_norm": 4.12839062172412, + "learning_rate": 1.555206879817865e-05, + "loss": 0.0731, + "step": 10654 + }, + { + "epoch": 4.333062220414803, + "grad_norm": 7.660021166568436, + "learning_rate": 1.555122608861194e-05, + "loss": 0.2498, + "step": 10655 + }, + { + "epoch": 4.333468889792599, + "grad_norm": 14.80357564837528, + "learning_rate": 1.55503833220591e-05, + "loss": 1.2101, + "step": 10656 + }, + { + "epoch": 4.333875559170394, + "grad_norm": 11.673184029392177, + "learning_rate": 1.554954049852878e-05, + "loss": 0.457, + "step": 10657 + }, + { + "epoch": 4.33428222854819, + "grad_norm": 20.36771293996881, + "learning_rate": 1.5548697618029635e-05, + "loss": 0.535, + "step": 10658 + }, + { + "epoch": 4.334688897925986, + "grad_norm": 1.1170778891317008, + "learning_rate": 1.5547854680570317e-05, + "loss": 0.0158, + "step": 10659 + }, + { + "epoch": 4.335095567303782, + "grad_norm": 0.44601260714286334, + "learning_rate": 1.554701168615948e-05, + "loss": 0.0069, + "step": 10660 + }, + { + "epoch": 4.335502236681577, + "grad_norm": 5.76614480789718, + "learning_rate": 1.5546168634805772e-05, + "loss": 0.1087, + "step": 10661 + }, + { + "epoch": 4.335908906059374, + "grad_norm": 8.346714279238807, + "learning_rate": 1.5545325526517852e-05, + "loss": 0.4287, + "step": 10662 + }, + { + "epoch": 4.33631557543717, + "grad_norm": 7.338379471702892, + "learning_rate": 
1.5544482361304374e-05, + "loss": 0.2568, + "step": 10663 + }, + { + "epoch": 4.336722244814966, + "grad_norm": 6.09215727914029, + "learning_rate": 1.5543639139173996e-05, + "loss": 0.1464, + "step": 10664 + }, + { + "epoch": 4.337128914192761, + "grad_norm": 8.855900293151533, + "learning_rate": 1.5542795860135374e-05, + "loss": 0.2301, + "step": 10665 + }, + { + "epoch": 4.337535583570557, + "grad_norm": 1.2649397794291468, + "learning_rate": 1.5541952524197158e-05, + "loss": 0.0263, + "step": 10666 + }, + { + "epoch": 4.337942252948353, + "grad_norm": 9.228833336583687, + "learning_rate": 1.5541109131368014e-05, + "loss": 0.2885, + "step": 10667 + }, + { + "epoch": 4.338348922326149, + "grad_norm": 7.677135571241849, + "learning_rate": 1.5540265681656597e-05, + "loss": 0.4942, + "step": 10668 + }, + { + "epoch": 4.338755591703944, + "grad_norm": 0.6467731729518758, + "learning_rate": 1.5539422175071562e-05, + "loss": 0.0115, + "step": 10669 + }, + { + "epoch": 4.33916226108174, + "grad_norm": 1.2059316037572763, + "learning_rate": 1.553857861162157e-05, + "loss": 0.0183, + "step": 10670 + }, + { + "epoch": 4.339568930459537, + "grad_norm": 30.962250032212875, + "learning_rate": 1.5537734991315284e-05, + "loss": 0.501, + "step": 10671 + }, + { + "epoch": 4.339975599837333, + "grad_norm": 6.129673490099876, + "learning_rate": 1.5536891314161358e-05, + "loss": 0.2263, + "step": 10672 + }, + { + "epoch": 4.340382269215128, + "grad_norm": 5.653670640552692, + "learning_rate": 1.5536047580168458e-05, + "loss": 0.1292, + "step": 10673 + }, + { + "epoch": 4.340788938592924, + "grad_norm": 4.122031430530673, + "learning_rate": 1.553520378934524e-05, + "loss": 0.1568, + "step": 10674 + }, + { + "epoch": 4.34119560797072, + "grad_norm": 0.060241794814643104, + "learning_rate": 1.5534359941700374e-05, + "loss": 0.0009, + "step": 10675 + }, + { + "epoch": 4.341602277348516, + "grad_norm": 3.013640106483989, + "learning_rate": 1.5533516037242514e-05, + "loss": 0.0575, + "step": 10676 + }, + { + "epoch": 4.342008946726311, + "grad_norm": 10.301550310148613, + "learning_rate": 1.553267207598033e-05, + "loss": 0.3782, + "step": 10677 + }, + { + "epoch": 4.342415616104107, + "grad_norm": 0.7897362437221533, + "learning_rate": 1.5531828057922484e-05, + "loss": 0.0105, + "step": 10678 + }, + { + "epoch": 4.342822285481903, + "grad_norm": 11.011253689813381, + "learning_rate": 1.553098398307763e-05, + "loss": 0.5249, + "step": 10679 + }, + { + "epoch": 4.343228954859699, + "grad_norm": 5.069795103172297, + "learning_rate": 1.553013985145445e-05, + "loss": 0.2306, + "step": 10680 + }, + { + "epoch": 4.3436356242374945, + "grad_norm": 10.281804753498212, + "learning_rate": 1.55292956630616e-05, + "loss": 0.4766, + "step": 10681 + }, + { + "epoch": 4.344042293615291, + "grad_norm": 0.49292073878491116, + "learning_rate": 1.5528451417907745e-05, + "loss": 0.0067, + "step": 10682 + }, + { + "epoch": 4.344448962993087, + "grad_norm": 3.637702789554976, + "learning_rate": 1.5527607116001554e-05, + "loss": 0.1213, + "step": 10683 + }, + { + "epoch": 4.344855632370883, + "grad_norm": 0.5718438226312664, + "learning_rate": 1.55267627573517e-05, + "loss": 0.0086, + "step": 10684 + }, + { + "epoch": 4.3452623017486784, + "grad_norm": 15.04131604365083, + "learning_rate": 1.5525918341966838e-05, + "loss": 1.0753, + "step": 10685 + }, + { + "epoch": 4.345668971126474, + "grad_norm": 3.001985917070488, + "learning_rate": 1.5525073869855645e-05, + "loss": 0.0788, + "step": 10686 + }, + { + "epoch": 4.34607564050427, + 
"grad_norm": 0.8944940690122025, + "learning_rate": 1.5524229341026787e-05, + "loss": 0.0127, + "step": 10687 + }, + { + "epoch": 4.346482309882066, + "grad_norm": 0.9127560283137456, + "learning_rate": 1.5523384755488934e-05, + "loss": 0.0127, + "step": 10688 + }, + { + "epoch": 4.3468889792598615, + "grad_norm": 0.9455509041260803, + "learning_rate": 1.552254011325076e-05, + "loss": 0.0161, + "step": 10689 + }, + { + "epoch": 4.347295648637657, + "grad_norm": 9.905541323559843, + "learning_rate": 1.552169541432093e-05, + "loss": 0.2362, + "step": 10690 + }, + { + "epoch": 4.347702318015453, + "grad_norm": 9.33138659519222, + "learning_rate": 1.5520850658708116e-05, + "loss": 0.2309, + "step": 10691 + }, + { + "epoch": 4.34810898739325, + "grad_norm": 0.464445636726869, + "learning_rate": 1.5520005846420994e-05, + "loss": 0.0066, + "step": 10692 + }, + { + "epoch": 4.3485156567710455, + "grad_norm": 3.572911017087498, + "learning_rate": 1.551916097746823e-05, + "loss": 0.0566, + "step": 10693 + }, + { + "epoch": 4.348922326148841, + "grad_norm": 7.2919525314249976, + "learning_rate": 1.5518316051858505e-05, + "loss": 0.3849, + "step": 10694 + }, + { + "epoch": 4.349328995526637, + "grad_norm": 7.523911884477482, + "learning_rate": 1.5517471069600486e-05, + "loss": 0.2454, + "step": 10695 + }, + { + "epoch": 4.349735664904433, + "grad_norm": 12.423699121663036, + "learning_rate": 1.551662603070285e-05, + "loss": 0.7431, + "step": 10696 + }, + { + "epoch": 4.3501423342822285, + "grad_norm": 11.239914044572483, + "learning_rate": 1.5515780935174278e-05, + "loss": 0.4839, + "step": 10697 + }, + { + "epoch": 4.350549003660024, + "grad_norm": 7.611107526680053, + "learning_rate": 1.551493578302343e-05, + "loss": 0.3478, + "step": 10698 + }, + { + "epoch": 4.35095567303782, + "grad_norm": 3.456652688491591, + "learning_rate": 1.5514090574258994e-05, + "loss": 0.0621, + "step": 10699 + }, + { + "epoch": 4.351362342415616, + "grad_norm": 5.5967098155283646, + "learning_rate": 1.551324530888964e-05, + "loss": 0.1602, + "step": 10700 + }, + { + "epoch": 4.351769011793412, + "grad_norm": 10.579781060698785, + "learning_rate": 1.551239998692405e-05, + "loss": 0.3351, + "step": 10701 + }, + { + "epoch": 4.352175681171207, + "grad_norm": 10.667061812220654, + "learning_rate": 1.5511554608370904e-05, + "loss": 0.2825, + "step": 10702 + }, + { + "epoch": 4.352582350549004, + "grad_norm": 13.52509492027581, + "learning_rate": 1.5510709173238873e-05, + "loss": 1.305, + "step": 10703 + }, + { + "epoch": 4.3529890199268, + "grad_norm": 5.725524322455407, + "learning_rate": 1.5509863681536635e-05, + "loss": 0.0798, + "step": 10704 + }, + { + "epoch": 4.3533956893045955, + "grad_norm": 6.256997926742733, + "learning_rate": 1.5509018133272878e-05, + "loss": 0.1263, + "step": 10705 + }, + { + "epoch": 4.353802358682391, + "grad_norm": 8.65201068984148, + "learning_rate": 1.550817252845628e-05, + "loss": 0.215, + "step": 10706 + }, + { + "epoch": 4.354209028060187, + "grad_norm": 1.5432266533666332, + "learning_rate": 1.550732686709551e-05, + "loss": 0.0279, + "step": 10707 + }, + { + "epoch": 4.354615697437983, + "grad_norm": 0.5732887027923349, + "learning_rate": 1.5506481149199263e-05, + "loss": 0.0092, + "step": 10708 + }, + { + "epoch": 4.355022366815779, + "grad_norm": 1.4956148046413245, + "learning_rate": 1.5505635374776214e-05, + "loss": 0.0175, + "step": 10709 + }, + { + "epoch": 4.355429036193574, + "grad_norm": 14.470688152858976, + "learning_rate": 1.5504789543835052e-05, + "loss": 0.268, + "step": 
10710 + }, + { + "epoch": 4.35583570557137, + "grad_norm": 1.3130947617219053, + "learning_rate": 1.550394365638445e-05, + "loss": 0.0185, + "step": 10711 + }, + { + "epoch": 4.356242374949167, + "grad_norm": 11.517797219603986, + "learning_rate": 1.5503097712433096e-05, + "loss": 0.5898, + "step": 10712 + }, + { + "epoch": 4.3566490443269625, + "grad_norm": 10.958247579310461, + "learning_rate": 1.5502251711989676e-05, + "loss": 0.5672, + "step": 10713 + }, + { + "epoch": 4.357055713704758, + "grad_norm": 13.086268105102663, + "learning_rate": 1.5501405655062875e-05, + "loss": 0.7418, + "step": 10714 + }, + { + "epoch": 4.357462383082554, + "grad_norm": 5.660142282216324, + "learning_rate": 1.5500559541661374e-05, + "loss": 0.2877, + "step": 10715 + }, + { + "epoch": 4.35786905246035, + "grad_norm": 3.158729437132516, + "learning_rate": 1.549971337179386e-05, + "loss": 0.0928, + "step": 10716 + }, + { + "epoch": 4.358275721838146, + "grad_norm": 3.408427561683937, + "learning_rate": 1.549886714546902e-05, + "loss": 0.0844, + "step": 10717 + }, + { + "epoch": 4.358682391215941, + "grad_norm": 12.54988721987072, + "learning_rate": 1.549802086269554e-05, + "loss": 0.5124, + "step": 10718 + }, + { + "epoch": 4.359089060593737, + "grad_norm": 16.166739041700616, + "learning_rate": 1.5497174523482113e-05, + "loss": 0.4132, + "step": 10719 + }, + { + "epoch": 4.359495729971533, + "grad_norm": 10.375373989565485, + "learning_rate": 1.549632812783742e-05, + "loss": 0.2724, + "step": 10720 + }, + { + "epoch": 4.359902399349329, + "grad_norm": 4.19695649037906, + "learning_rate": 1.5495481675770155e-05, + "loss": 0.0693, + "step": 10721 + }, + { + "epoch": 4.360309068727124, + "grad_norm": 3.902574490776838, + "learning_rate": 1.5494635167289002e-05, + "loss": 0.09, + "step": 10722 + }, + { + "epoch": 4.360715738104921, + "grad_norm": 2.6950828768681223, + "learning_rate": 1.5493788602402654e-05, + "loss": 0.0437, + "step": 10723 + }, + { + "epoch": 4.361122407482717, + "grad_norm": 5.565130265077009, + "learning_rate": 1.5492941981119802e-05, + "loss": 0.1331, + "step": 10724 + }, + { + "epoch": 4.361529076860513, + "grad_norm": 11.450540607020477, + "learning_rate": 1.5492095303449137e-05, + "loss": 0.3644, + "step": 10725 + }, + { + "epoch": 4.361935746238308, + "grad_norm": 13.887544770025375, + "learning_rate": 1.5491248569399345e-05, + "loss": 0.5711, + "step": 10726 + }, + { + "epoch": 4.362342415616104, + "grad_norm": 11.407826958946552, + "learning_rate": 1.5490401778979128e-05, + "loss": 0.3586, + "step": 10727 + }, + { + "epoch": 4.3627490849939, + "grad_norm": 11.459102463414476, + "learning_rate": 1.548955493219717e-05, + "loss": 0.3666, + "step": 10728 + }, + { + "epoch": 4.363155754371696, + "grad_norm": 1.9790368565351182, + "learning_rate": 1.5488708029062173e-05, + "loss": 0.0371, + "step": 10729 + }, + { + "epoch": 4.363562423749491, + "grad_norm": 13.864603945118793, + "learning_rate": 1.548786106958282e-05, + "loss": 0.706, + "step": 10730 + }, + { + "epoch": 4.363969093127287, + "grad_norm": 11.176271635091426, + "learning_rate": 1.548701405376781e-05, + "loss": 0.3686, + "step": 10731 + }, + { + "epoch": 4.364375762505083, + "grad_norm": 2.045604711213438, + "learning_rate": 1.5486166981625847e-05, + "loss": 0.0376, + "step": 10732 + }, + { + "epoch": 4.36478243188288, + "grad_norm": 0.3101822937473752, + "learning_rate": 1.548531985316561e-05, + "loss": 0.0053, + "step": 10733 + }, + { + "epoch": 4.365189101260675, + "grad_norm": 8.016942278614898, + "learning_rate": 
1.548447266839581e-05, + "loss": 0.1722, + "step": 10734 + }, + { + "epoch": 4.365595770638471, + "grad_norm": 1.0176687907370845, + "learning_rate": 1.5483625427325137e-05, + "loss": 0.0194, + "step": 10735 + }, + { + "epoch": 4.366002440016267, + "grad_norm": 5.9586051808860665, + "learning_rate": 1.5482778129962284e-05, + "loss": 0.1623, + "step": 10736 + }, + { + "epoch": 4.366409109394063, + "grad_norm": 1.9364260245886749, + "learning_rate": 1.5481930776315963e-05, + "loss": 0.0463, + "step": 10737 + }, + { + "epoch": 4.3668157787718584, + "grad_norm": 3.7318769837410257, + "learning_rate": 1.548108336639486e-05, + "loss": 0.0612, + "step": 10738 + }, + { + "epoch": 4.367222448149654, + "grad_norm": 9.056611352683602, + "learning_rate": 1.548023590020767e-05, + "loss": 0.3387, + "step": 10739 + }, + { + "epoch": 4.36762911752745, + "grad_norm": 3.4341906507129867, + "learning_rate": 1.5479388377763108e-05, + "loss": 0.0631, + "step": 10740 + }, + { + "epoch": 4.368035786905246, + "grad_norm": 0.9388741632560103, + "learning_rate": 1.5478540799069866e-05, + "loss": 0.0157, + "step": 10741 + }, + { + "epoch": 4.3684424562830415, + "grad_norm": 7.844184398020832, + "learning_rate": 1.5477693164136645e-05, + "loss": 0.431, + "step": 10742 + }, + { + "epoch": 4.368849125660837, + "grad_norm": 4.996882185470819, + "learning_rate": 1.5476845472972146e-05, + "loss": 0.0784, + "step": 10743 + }, + { + "epoch": 4.369255795038634, + "grad_norm": 4.470493347142566, + "learning_rate": 1.5475997725585073e-05, + "loss": 0.0654, + "step": 10744 + }, + { + "epoch": 4.36966246441643, + "grad_norm": 1.3638863744497276, + "learning_rate": 1.5475149921984126e-05, + "loss": 0.0272, + "step": 10745 + }, + { + "epoch": 4.3700691337942255, + "grad_norm": 7.906426309869138, + "learning_rate": 1.547430206217801e-05, + "loss": 0.3253, + "step": 10746 + }, + { + "epoch": 4.370475803172021, + "grad_norm": 4.31956395968918, + "learning_rate": 1.547345414617543e-05, + "loss": 0.1001, + "step": 10747 + }, + { + "epoch": 4.370882472549817, + "grad_norm": 4.882429790519022, + "learning_rate": 1.5472606173985085e-05, + "loss": 0.0877, + "step": 10748 + }, + { + "epoch": 4.371289141927613, + "grad_norm": 2.8167928089830685, + "learning_rate": 1.5471758145615685e-05, + "loss": 0.0407, + "step": 10749 + }, + { + "epoch": 4.3716958113054085, + "grad_norm": 9.950805973854267, + "learning_rate": 1.5470910061075936e-05, + "loss": 0.4263, + "step": 10750 + }, + { + "epoch": 4.372102480683204, + "grad_norm": 14.956118566791988, + "learning_rate": 1.5470061920374538e-05, + "loss": 0.4551, + "step": 10751 + }, + { + "epoch": 4.372509150061, + "grad_norm": 9.777230163700084, + "learning_rate": 1.5469213723520202e-05, + "loss": 0.2942, + "step": 10752 + }, + { + "epoch": 4.372915819438797, + "grad_norm": 8.141606851003859, + "learning_rate": 1.5468365470521635e-05, + "loss": 0.2939, + "step": 10753 + }, + { + "epoch": 4.3733224888165925, + "grad_norm": 4.8391673438616385, + "learning_rate": 1.5467517161387546e-05, + "loss": 0.1005, + "step": 10754 + }, + { + "epoch": 4.373729158194388, + "grad_norm": 3.6110684466587055, + "learning_rate": 1.546666879612664e-05, + "loss": 0.0628, + "step": 10755 + }, + { + "epoch": 4.374135827572184, + "grad_norm": 6.710202191945386, + "learning_rate": 1.546582037474763e-05, + "loss": 0.2209, + "step": 10756 + }, + { + "epoch": 4.37454249694998, + "grad_norm": 0.25841226125318967, + "learning_rate": 1.546497189725922e-05, + "loss": 0.0039, + "step": 10757 + }, + { + "epoch": 4.3749491663277755, + 
"grad_norm": 13.596312236471595, + "learning_rate": 1.5464123363670123e-05, + "loss": 0.4657, + "step": 10758 + }, + { + "epoch": 4.375355835705571, + "grad_norm": 3.860564187717979, + "learning_rate": 1.546327477398905e-05, + "loss": 0.0815, + "step": 10759 + }, + { + "epoch": 4.375762505083367, + "grad_norm": 8.99279830897508, + "learning_rate": 1.5462426128224716e-05, + "loss": 0.5514, + "step": 10760 + }, + { + "epoch": 4.376169174461163, + "grad_norm": 11.629387410863991, + "learning_rate": 1.5461577426385823e-05, + "loss": 0.4261, + "step": 10761 + }, + { + "epoch": 4.376575843838959, + "grad_norm": 3.450348748466543, + "learning_rate": 1.546072866848109e-05, + "loss": 0.0504, + "step": 10762 + }, + { + "epoch": 4.376982513216754, + "grad_norm": 3.4373311008742533, + "learning_rate": 1.545987985451923e-05, + "loss": 0.0618, + "step": 10763 + }, + { + "epoch": 4.377389182594551, + "grad_norm": 0.34959717193268053, + "learning_rate": 1.5459030984508957e-05, + "loss": 0.006, + "step": 10764 + }, + { + "epoch": 4.377795851972347, + "grad_norm": 8.163289869745482, + "learning_rate": 1.545818205845898e-05, + "loss": 0.2172, + "step": 10765 + }, + { + "epoch": 4.3782025213501425, + "grad_norm": 0.4449772241427393, + "learning_rate": 1.5457333076378017e-05, + "loss": 0.0081, + "step": 10766 + }, + { + "epoch": 4.378609190727938, + "grad_norm": 7.387915129977797, + "learning_rate": 1.5456484038274786e-05, + "loss": 0.2746, + "step": 10767 + }, + { + "epoch": 4.379015860105734, + "grad_norm": 1.5557367370114987, + "learning_rate": 1.5455634944158002e-05, + "loss": 0.0365, + "step": 10768 + }, + { + "epoch": 4.37942252948353, + "grad_norm": 1.9314589547887138, + "learning_rate": 1.5454785794036375e-05, + "loss": 0.0347, + "step": 10769 + }, + { + "epoch": 4.379829198861326, + "grad_norm": 1.9348915400025306, + "learning_rate": 1.5453936587918626e-05, + "loss": 0.0386, + "step": 10770 + }, + { + "epoch": 4.380235868239121, + "grad_norm": 5.710178151247208, + "learning_rate": 1.5453087325813477e-05, + "loss": 0.2521, + "step": 10771 + }, + { + "epoch": 4.380642537616917, + "grad_norm": 10.943057113086988, + "learning_rate": 1.545223800772964e-05, + "loss": 0.5215, + "step": 10772 + }, + { + "epoch": 4.381049206994713, + "grad_norm": 8.554480434986136, + "learning_rate": 1.5451388633675834e-05, + "loss": 0.285, + "step": 10773 + }, + { + "epoch": 4.3814558763725096, + "grad_norm": 0.09184529425025846, + "learning_rate": 1.5450539203660785e-05, + "loss": 0.0011, + "step": 10774 + }, + { + "epoch": 4.381862545750305, + "grad_norm": 10.200551020152568, + "learning_rate": 1.5449689717693204e-05, + "loss": 0.2342, + "step": 10775 + }, + { + "epoch": 4.382269215128101, + "grad_norm": 11.189879438595973, + "learning_rate": 1.5448840175781816e-05, + "loss": 0.5032, + "step": 10776 + }, + { + "epoch": 4.382675884505897, + "grad_norm": 2.9240055369039166, + "learning_rate": 1.5447990577935345e-05, + "loss": 0.0532, + "step": 10777 + }, + { + "epoch": 4.383082553883693, + "grad_norm": 10.206055778873477, + "learning_rate": 1.5447140924162503e-05, + "loss": 0.3378, + "step": 10778 + }, + { + "epoch": 4.383489223261488, + "grad_norm": 8.342658045067948, + "learning_rate": 1.544629121447202e-05, + "loss": 0.2029, + "step": 10779 + }, + { + "epoch": 4.383895892639284, + "grad_norm": 12.307001457470093, + "learning_rate": 1.5445441448872617e-05, + "loss": 0.2951, + "step": 10780 + }, + { + "epoch": 4.38430256201708, + "grad_norm": 0.8374406724998338, + "learning_rate": 1.5444591627373014e-05, + "loss": 0.0105, + 
"step": 10781 + }, + { + "epoch": 4.384709231394876, + "grad_norm": 8.70678753696808, + "learning_rate": 1.5443741749981943e-05, + "loss": 0.2403, + "step": 10782 + }, + { + "epoch": 4.385115900772671, + "grad_norm": 7.586857049403484, + "learning_rate": 1.544289181670812e-05, + "loss": 0.339, + "step": 10783 + }, + { + "epoch": 4.385522570150467, + "grad_norm": 0.08240863996712841, + "learning_rate": 1.5442041827560274e-05, + "loss": 0.0013, + "step": 10784 + }, + { + "epoch": 4.385929239528264, + "grad_norm": 0.14718952302811494, + "learning_rate": 1.5441191782547132e-05, + "loss": 0.0021, + "step": 10785 + }, + { + "epoch": 4.38633590890606, + "grad_norm": 7.13421001999627, + "learning_rate": 1.5440341681677412e-05, + "loss": 0.4576, + "step": 10786 + }, + { + "epoch": 4.386742578283855, + "grad_norm": 9.360392604421007, + "learning_rate": 1.5439491524959852e-05, + "loss": 0.3231, + "step": 10787 + }, + { + "epoch": 4.387149247661651, + "grad_norm": 4.924581749354827, + "learning_rate": 1.543864131240317e-05, + "loss": 0.0819, + "step": 10788 + }, + { + "epoch": 4.387555917039447, + "grad_norm": 10.992791026593254, + "learning_rate": 1.54377910440161e-05, + "loss": 0.4606, + "step": 10789 + }, + { + "epoch": 4.387962586417243, + "grad_norm": 5.066024506333386, + "learning_rate": 1.543694071980737e-05, + "loss": 0.0917, + "step": 10790 + }, + { + "epoch": 4.3883692557950384, + "grad_norm": 5.69277155934991, + "learning_rate": 1.5436090339785704e-05, + "loss": 0.1787, + "step": 10791 + }, + { + "epoch": 4.388775925172834, + "grad_norm": 6.067563145557994, + "learning_rate": 1.5435239903959837e-05, + "loss": 0.1647, + "step": 10792 + }, + { + "epoch": 4.38918259455063, + "grad_norm": 5.6546050747860175, + "learning_rate": 1.5434389412338494e-05, + "loss": 0.1853, + "step": 10793 + }, + { + "epoch": 4.389589263928427, + "grad_norm": 10.292221681170581, + "learning_rate": 1.5433538864930412e-05, + "loss": 0.4101, + "step": 10794 + }, + { + "epoch": 4.389995933306222, + "grad_norm": 12.10642576604559, + "learning_rate": 1.5432688261744317e-05, + "loss": 0.5653, + "step": 10795 + }, + { + "epoch": 4.390402602684018, + "grad_norm": 3.409050970316565, + "learning_rate": 1.5431837602788942e-05, + "loss": 0.0764, + "step": 10796 + }, + { + "epoch": 4.390809272061814, + "grad_norm": 2.284179491994718, + "learning_rate": 1.5430986888073022e-05, + "loss": 0.0645, + "step": 10797 + }, + { + "epoch": 4.39121594143961, + "grad_norm": 4.616992870596057, + "learning_rate": 1.5430136117605285e-05, + "loss": 0.0964, + "step": 10798 + }, + { + "epoch": 4.3916226108174055, + "grad_norm": 9.657639476135463, + "learning_rate": 1.542928529139447e-05, + "loss": 0.3374, + "step": 10799 + }, + { + "epoch": 4.392029280195201, + "grad_norm": 1.6115301665881174, + "learning_rate": 1.542843440944931e-05, + "loss": 0.0192, + "step": 10800 + }, + { + "epoch": 4.392435949572997, + "grad_norm": 12.785340128132722, + "learning_rate": 1.542758347177854e-05, + "loss": 0.4928, + "step": 10801 + }, + { + "epoch": 4.392842618950793, + "grad_norm": 8.089149623657928, + "learning_rate": 1.542673247839089e-05, + "loss": 0.2105, + "step": 10802 + }, + { + "epoch": 4.3932492883285885, + "grad_norm": 11.756318632542287, + "learning_rate": 1.54258814292951e-05, + "loss": 0.3471, + "step": 10803 + }, + { + "epoch": 4.393655957706384, + "grad_norm": 14.175363824626604, + "learning_rate": 1.542503032449991e-05, + "loss": 0.6051, + "step": 10804 + }, + { + "epoch": 4.394062627084181, + "grad_norm": 8.33102685550096, + "learning_rate": 
1.542417916401405e-05, + "loss": 0.3889, + "step": 10805 + }, + { + "epoch": 4.394469296461977, + "grad_norm": 0.012758565221959851, + "learning_rate": 1.5423327947846266e-05, + "loss": 0.0002, + "step": 10806 + }, + { + "epoch": 4.3948759658397725, + "grad_norm": 0.7823817157901387, + "learning_rate": 1.5422476676005284e-05, + "loss": 0.0125, + "step": 10807 + }, + { + "epoch": 4.395282635217568, + "grad_norm": 6.818807185152295, + "learning_rate": 1.5421625348499854e-05, + "loss": 0.2492, + "step": 10808 + }, + { + "epoch": 4.395689304595364, + "grad_norm": 5.077659357148607, + "learning_rate": 1.5420773965338713e-05, + "loss": 0.0972, + "step": 10809 + }, + { + "epoch": 4.39609597397316, + "grad_norm": 0.19635783067858292, + "learning_rate": 1.5419922526530596e-05, + "loss": 0.0039, + "step": 10810 + }, + { + "epoch": 4.3965026433509555, + "grad_norm": 6.667680521826918, + "learning_rate": 1.5419071032084246e-05, + "loss": 0.2244, + "step": 10811 + }, + { + "epoch": 4.396909312728751, + "grad_norm": 7.4173256779648025, + "learning_rate": 1.5418219482008404e-05, + "loss": 0.2735, + "step": 10812 + }, + { + "epoch": 4.397315982106547, + "grad_norm": 1.8716510808026097, + "learning_rate": 1.5417367876311816e-05, + "loss": 0.0526, + "step": 10813 + }, + { + "epoch": 4.397722651484343, + "grad_norm": 13.453285984744207, + "learning_rate": 1.5416516215003218e-05, + "loss": 0.6926, + "step": 10814 + }, + { + "epoch": 4.3981293208621395, + "grad_norm": 5.510361171890213, + "learning_rate": 1.5415664498091355e-05, + "loss": 0.1035, + "step": 10815 + }, + { + "epoch": 4.398535990239935, + "grad_norm": 6.737631429526746, + "learning_rate": 1.541481272558497e-05, + "loss": 0.2332, + "step": 10816 + }, + { + "epoch": 4.398942659617731, + "grad_norm": 2.3487836717941097, + "learning_rate": 1.5413960897492806e-05, + "loss": 0.0363, + "step": 10817 + }, + { + "epoch": 4.399349328995527, + "grad_norm": 4.988367608086181, + "learning_rate": 1.541310901382361e-05, + "loss": 0.2607, + "step": 10818 + }, + { + "epoch": 4.3997559983733225, + "grad_norm": 4.736669308685488, + "learning_rate": 1.5412257074586127e-05, + "loss": 0.0877, + "step": 10819 + }, + { + "epoch": 4.400162667751118, + "grad_norm": 1.6634251256844415, + "learning_rate": 1.5411405079789097e-05, + "loss": 0.0323, + "step": 10820 + }, + { + "epoch": 4.400569337128914, + "grad_norm": 12.932867836145387, + "learning_rate": 1.5410553029441272e-05, + "loss": 0.8061, + "step": 10821 + }, + { + "epoch": 4.40097600650671, + "grad_norm": 9.091177769042401, + "learning_rate": 1.54097009235514e-05, + "loss": 0.3871, + "step": 10822 + }, + { + "epoch": 4.401382675884506, + "grad_norm": 5.932552882108954, + "learning_rate": 1.540884876212822e-05, + "loss": 0.1071, + "step": 10823 + }, + { + "epoch": 4.401789345262301, + "grad_norm": 0.15107555779015164, + "learning_rate": 1.540799654518049e-05, + "loss": 0.0018, + "step": 10824 + }, + { + "epoch": 4.402196014640097, + "grad_norm": 8.733066258196407, + "learning_rate": 1.5407144272716952e-05, + "loss": 0.3265, + "step": 10825 + }, + { + "epoch": 4.402602684017894, + "grad_norm": 11.910185330775494, + "learning_rate": 1.540629194474636e-05, + "loss": 0.5678, + "step": 10826 + }, + { + "epoch": 4.4030093533956896, + "grad_norm": 11.114453095229727, + "learning_rate": 1.5405439561277455e-05, + "loss": 0.7805, + "step": 10827 + }, + { + "epoch": 4.403416022773485, + "grad_norm": 4.113765855415765, + "learning_rate": 1.5404587122318994e-05, + "loss": 0.0857, + "step": 10828 + }, + { + "epoch": 
4.403822692151281, + "grad_norm": 2.0162116921760456, + "learning_rate": 1.5403734627879726e-05, + "loss": 0.0307, + "step": 10829 + }, + { + "epoch": 4.404229361529077, + "grad_norm": 8.44588692703814, + "learning_rate": 1.5402882077968406e-05, + "loss": 0.2672, + "step": 10830 + }, + { + "epoch": 4.404636030906873, + "grad_norm": 14.293605009931547, + "learning_rate": 1.540202947259378e-05, + "loss": 0.693, + "step": 10831 + }, + { + "epoch": 4.405042700284668, + "grad_norm": 0.840967552346489, + "learning_rate": 1.54011768117646e-05, + "loss": 0.0129, + "step": 10832 + }, + { + "epoch": 4.405449369662464, + "grad_norm": 10.032238674425724, + "learning_rate": 1.5400324095489624e-05, + "loss": 0.4724, + "step": 10833 + }, + { + "epoch": 4.40585603904026, + "grad_norm": 1.952376102815897, + "learning_rate": 1.5399471323777603e-05, + "loss": 0.0249, + "step": 10834 + }, + { + "epoch": 4.406262708418057, + "grad_norm": 4.851578460449132, + "learning_rate": 1.5398618496637292e-05, + "loss": 0.337, + "step": 10835 + }, + { + "epoch": 4.406669377795852, + "grad_norm": 0.884016764686853, + "learning_rate": 1.5397765614077448e-05, + "loss": 0.0127, + "step": 10836 + }, + { + "epoch": 4.407076047173648, + "grad_norm": 11.37763350797589, + "learning_rate": 1.5396912676106816e-05, + "loss": 0.6962, + "step": 10837 + }, + { + "epoch": 4.407482716551444, + "grad_norm": 5.761976625199769, + "learning_rate": 1.5396059682734164e-05, + "loss": 0.0983, + "step": 10838 + }, + { + "epoch": 4.40788938592924, + "grad_norm": 7.545454905925337, + "learning_rate": 1.539520663396824e-05, + "loss": 0.3719, + "step": 10839 + }, + { + "epoch": 4.408296055307035, + "grad_norm": 2.8515670493004706, + "learning_rate": 1.539435352981781e-05, + "loss": 0.0717, + "step": 10840 + }, + { + "epoch": 4.408702724684831, + "grad_norm": 10.146787880111447, + "learning_rate": 1.5393500370291624e-05, + "loss": 0.3488, + "step": 10841 + }, + { + "epoch": 4.409109394062627, + "grad_norm": 2.789580539656985, + "learning_rate": 1.539264715539844e-05, + "loss": 0.0606, + "step": 10842 + }, + { + "epoch": 4.409516063440423, + "grad_norm": 6.5351195408462175, + "learning_rate": 1.539179388514702e-05, + "loss": 0.1264, + "step": 10843 + }, + { + "epoch": 4.4099227328182184, + "grad_norm": 1.814726462668831, + "learning_rate": 1.539094055954612e-05, + "loss": 0.029, + "step": 10844 + }, + { + "epoch": 4.410329402196014, + "grad_norm": 14.56942561086395, + "learning_rate": 1.5390087178604508e-05, + "loss": 1.2089, + "step": 10845 + }, + { + "epoch": 4.410736071573811, + "grad_norm": 4.924839928410343, + "learning_rate": 1.5389233742330936e-05, + "loss": 0.1572, + "step": 10846 + }, + { + "epoch": 4.411142740951607, + "grad_norm": 2.8326796186370493, + "learning_rate": 1.538838025073416e-05, + "loss": 0.0446, + "step": 10847 + }, + { + "epoch": 4.411549410329402, + "grad_norm": 0.9890425420543347, + "learning_rate": 1.5387526703822958e-05, + "loss": 0.0228, + "step": 10848 + }, + { + "epoch": 4.411956079707198, + "grad_norm": 10.203335302918067, + "learning_rate": 1.5386673101606084e-05, + "loss": 0.4156, + "step": 10849 + }, + { + "epoch": 4.412362749084994, + "grad_norm": 3.4164316336633096, + "learning_rate": 1.538581944409229e-05, + "loss": 0.0675, + "step": 10850 + }, + { + "epoch": 4.41276941846279, + "grad_norm": 11.688522326029009, + "learning_rate": 1.5384965731290354e-05, + "loss": 0.6951, + "step": 10851 + }, + { + "epoch": 4.4131760878405855, + "grad_norm": 0.952604475407234, + "learning_rate": 1.5384111963209036e-05, + "loss": 
0.0179, + "step": 10852 + }, + { + "epoch": 4.413582757218381, + "grad_norm": 3.193605681932547, + "learning_rate": 1.5383258139857097e-05, + "loss": 0.1433, + "step": 10853 + }, + { + "epoch": 4.413989426596177, + "grad_norm": 1.5457288444357722, + "learning_rate": 1.5382404261243306e-05, + "loss": 0.0338, + "step": 10854 + }, + { + "epoch": 4.414396095973973, + "grad_norm": 10.637521098554224, + "learning_rate": 1.5381550327376423e-05, + "loss": 0.6273, + "step": 10855 + }, + { + "epoch": 4.414802765351769, + "grad_norm": 0.5000410566304815, + "learning_rate": 1.5380696338265222e-05, + "loss": 0.0065, + "step": 10856 + }, + { + "epoch": 4.415209434729565, + "grad_norm": 0.9309867937918338, + "learning_rate": 1.537984229391846e-05, + "loss": 0.0234, + "step": 10857 + }, + { + "epoch": 4.415616104107361, + "grad_norm": 12.55312667608784, + "learning_rate": 1.5378988194344913e-05, + "loss": 0.6459, + "step": 10858 + }, + { + "epoch": 4.416022773485157, + "grad_norm": 2.7964012699211125, + "learning_rate": 1.5378134039553343e-05, + "loss": 0.0722, + "step": 10859 + }, + { + "epoch": 4.4164294428629525, + "grad_norm": 9.411605593499111, + "learning_rate": 1.537727982955252e-05, + "loss": 0.434, + "step": 10860 + }, + { + "epoch": 4.416836112240748, + "grad_norm": 8.84979552685897, + "learning_rate": 1.5376425564351218e-05, + "loss": 0.4902, + "step": 10861 + }, + { + "epoch": 4.417242781618544, + "grad_norm": 6.817226854057361, + "learning_rate": 1.5375571243958194e-05, + "loss": 0.2646, + "step": 10862 + }, + { + "epoch": 4.41764945099634, + "grad_norm": 1.5712077174929937, + "learning_rate": 1.537471686838223e-05, + "loss": 0.0385, + "step": 10863 + }, + { + "epoch": 4.4180561203741355, + "grad_norm": 9.191729943409104, + "learning_rate": 1.5373862437632093e-05, + "loss": 0.1943, + "step": 10864 + }, + { + "epoch": 4.418462789751931, + "grad_norm": 5.220356008409333, + "learning_rate": 1.537300795171655e-05, + "loss": 0.1427, + "step": 10865 + }, + { + "epoch": 4.418869459129727, + "grad_norm": 5.202868204592343, + "learning_rate": 1.5372153410644378e-05, + "loss": 0.2201, + "step": 10866 + }, + { + "epoch": 4.419276128507524, + "grad_norm": 8.145344718724495, + "learning_rate": 1.5371298814424348e-05, + "loss": 0.0901, + "step": 10867 + }, + { + "epoch": 4.4196827978853195, + "grad_norm": 4.496110143888465, + "learning_rate": 1.5370444163065235e-05, + "loss": 0.2036, + "step": 10868 + }, + { + "epoch": 4.420089467263115, + "grad_norm": 2.1689647450568446, + "learning_rate": 1.5369589456575803e-05, + "loss": 0.0332, + "step": 10869 + }, + { + "epoch": 4.420496136640911, + "grad_norm": 11.660789011235131, + "learning_rate": 1.5368734694964838e-05, + "loss": 0.4732, + "step": 10870 + }, + { + "epoch": 4.420902806018707, + "grad_norm": 4.13159075640699, + "learning_rate": 1.536787987824111e-05, + "loss": 0.0769, + "step": 10871 + }, + { + "epoch": 4.4213094753965025, + "grad_norm": 5.466948878657871, + "learning_rate": 1.536702500641339e-05, + "loss": 0.1133, + "step": 10872 + }, + { + "epoch": 4.421716144774298, + "grad_norm": 6.826912693726125, + "learning_rate": 1.5366170079490457e-05, + "loss": 0.4454, + "step": 10873 + }, + { + "epoch": 4.422122814152094, + "grad_norm": 9.179917771185462, + "learning_rate": 1.5365315097481093e-05, + "loss": 0.4089, + "step": 10874 + }, + { + "epoch": 4.42252948352989, + "grad_norm": 0.21604879691584622, + "learning_rate": 1.536446006039406e-05, + "loss": 0.0028, + "step": 10875 + }, + { + "epoch": 4.4229361529076865, + "grad_norm": 8.828307876971431, + 
"learning_rate": 1.5363604968238152e-05, + "loss": 0.2164, + "step": 10876 + }, + { + "epoch": 4.423342822285482, + "grad_norm": 8.650089861705027, + "learning_rate": 1.536274982102214e-05, + "loss": 0.2653, + "step": 10877 + }, + { + "epoch": 4.423749491663278, + "grad_norm": 9.960320943281952, + "learning_rate": 1.53618946187548e-05, + "loss": 0.4845, + "step": 10878 + }, + { + "epoch": 4.424156161041074, + "grad_norm": 5.570160802284752, + "learning_rate": 1.536103936144491e-05, + "loss": 0.3038, + "step": 10879 + }, + { + "epoch": 4.4245628304188696, + "grad_norm": 0.9958568658304838, + "learning_rate": 1.5360184049101255e-05, + "loss": 0.0157, + "step": 10880 + }, + { + "epoch": 4.424969499796665, + "grad_norm": 6.550884781134783, + "learning_rate": 1.5359328681732618e-05, + "loss": 0.1404, + "step": 10881 + }, + { + "epoch": 4.425376169174461, + "grad_norm": 9.171801825055606, + "learning_rate": 1.5358473259347765e-05, + "loss": 0.3735, + "step": 10882 + }, + { + "epoch": 4.425782838552257, + "grad_norm": 1.259969993904913, + "learning_rate": 1.5357617781955494e-05, + "loss": 0.0153, + "step": 10883 + }, + { + "epoch": 4.426189507930053, + "grad_norm": 4.203764266872757, + "learning_rate": 1.535676224956458e-05, + "loss": 0.0668, + "step": 10884 + }, + { + "epoch": 4.426596177307848, + "grad_norm": 7.951651213156382, + "learning_rate": 1.53559066621838e-05, + "loss": 0.2365, + "step": 10885 + }, + { + "epoch": 4.427002846685644, + "grad_norm": 4.528850704521726, + "learning_rate": 1.535505101982195e-05, + "loss": 0.0275, + "step": 10886 + }, + { + "epoch": 4.427409516063441, + "grad_norm": 3.688279371886961, + "learning_rate": 1.5354195322487804e-05, + "loss": 0.1613, + "step": 10887 + }, + { + "epoch": 4.427816185441237, + "grad_norm": 12.128378933924713, + "learning_rate": 1.5353339570190143e-05, + "loss": 0.7754, + "step": 10888 + }, + { + "epoch": 4.428222854819032, + "grad_norm": 0.656604364354849, + "learning_rate": 1.535248376293776e-05, + "loss": 0.0113, + "step": 10889 + }, + { + "epoch": 4.428629524196828, + "grad_norm": 0.18089455833629078, + "learning_rate": 1.535162790073944e-05, + "loss": 0.0024, + "step": 10890 + }, + { + "epoch": 4.429036193574624, + "grad_norm": 5.56935856782375, + "learning_rate": 1.5350771983603965e-05, + "loss": 0.2074, + "step": 10891 + }, + { + "epoch": 4.42944286295242, + "grad_norm": 9.70921705752192, + "learning_rate": 1.5349916011540123e-05, + "loss": 0.5125, + "step": 10892 + }, + { + "epoch": 4.429849532330215, + "grad_norm": 5.685412442843502, + "learning_rate": 1.53490599845567e-05, + "loss": 0.2053, + "step": 10893 + }, + { + "epoch": 4.430256201708011, + "grad_norm": 4.203083655015414, + "learning_rate": 1.5348203902662485e-05, + "loss": 0.1531, + "step": 10894 + }, + { + "epoch": 4.430662871085807, + "grad_norm": 7.865188066154627, + "learning_rate": 1.5347347765866262e-05, + "loss": 0.1766, + "step": 10895 + }, + { + "epoch": 4.431069540463603, + "grad_norm": 0.5237603449505536, + "learning_rate": 1.5346491574176826e-05, + "loss": 0.008, + "step": 10896 + }, + { + "epoch": 4.431476209841399, + "grad_norm": 1.3569967352731918, + "learning_rate": 1.5345635327602962e-05, + "loss": 0.0213, + "step": 10897 + }, + { + "epoch": 4.431882879219195, + "grad_norm": 9.133578796968331, + "learning_rate": 1.534477902615346e-05, + "loss": 0.3246, + "step": 10898 + }, + { + "epoch": 4.432289548596991, + "grad_norm": 12.304606408067702, + "learning_rate": 1.534392266983711e-05, + "loss": 0.2452, + "step": 10899 + }, + { + "epoch": 4.432696217974787, 
+ "grad_norm": 5.878444448064943, + "learning_rate": 1.534306625866271e-05, + "loss": 0.2154, + "step": 10900 + }, + { + "epoch": 4.433102887352582, + "grad_norm": 7.439440319144649, + "learning_rate": 1.534220979263904e-05, + "loss": 0.5388, + "step": 10901 + }, + { + "epoch": 4.433509556730378, + "grad_norm": 8.464522264551977, + "learning_rate": 1.5341353271774902e-05, + "loss": 0.1738, + "step": 10902 + }, + { + "epoch": 4.433916226108174, + "grad_norm": 4.971172308231767, + "learning_rate": 1.5340496696079085e-05, + "loss": 0.2281, + "step": 10903 + }, + { + "epoch": 4.43432289548597, + "grad_norm": 0.10439388889197478, + "learning_rate": 1.5339640065560378e-05, + "loss": 0.0013, + "step": 10904 + }, + { + "epoch": 4.4347295648637655, + "grad_norm": 4.812327955277893, + "learning_rate": 1.5338783380227577e-05, + "loss": 0.1208, + "step": 10905 + }, + { + "epoch": 4.435136234241561, + "grad_norm": 6.0177206117276825, + "learning_rate": 1.5337926640089485e-05, + "loss": 0.1629, + "step": 10906 + }, + { + "epoch": 4.435542903619357, + "grad_norm": 2.9139796304680754, + "learning_rate": 1.5337069845154883e-05, + "loss": 0.0513, + "step": 10907 + }, + { + "epoch": 4.435949572997154, + "grad_norm": 1.5480065647969377, + "learning_rate": 1.5336212995432574e-05, + "loss": 0.0185, + "step": 10908 + }, + { + "epoch": 4.436356242374949, + "grad_norm": 6.280032342279312, + "learning_rate": 1.5335356090931354e-05, + "loss": 0.2357, + "step": 10909 + }, + { + "epoch": 4.436762911752745, + "grad_norm": 3.955025199797991, + "learning_rate": 1.5334499131660017e-05, + "loss": 0.1029, + "step": 10910 + }, + { + "epoch": 4.437169581130541, + "grad_norm": 0.8484568628085962, + "learning_rate": 1.5333642117627364e-05, + "loss": 0.0136, + "step": 10911 + }, + { + "epoch": 4.437576250508337, + "grad_norm": 8.610006183452336, + "learning_rate": 1.5332785048842185e-05, + "loss": 0.4115, + "step": 10912 + }, + { + "epoch": 4.4379829198861325, + "grad_norm": 8.448634121258998, + "learning_rate": 1.533192792531329e-05, + "loss": 0.2599, + "step": 10913 + }, + { + "epoch": 4.438389589263928, + "grad_norm": 0.04039256597302137, + "learning_rate": 1.533107074704947e-05, + "loss": 0.0009, + "step": 10914 + }, + { + "epoch": 4.438796258641724, + "grad_norm": 13.897215704529765, + "learning_rate": 1.5330213514059525e-05, + "loss": 0.6619, + "step": 10915 + }, + { + "epoch": 4.43920292801952, + "grad_norm": 8.640808564781237, + "learning_rate": 1.5329356226352254e-05, + "loss": 0.3563, + "step": 10916 + }, + { + "epoch": 4.439609597397316, + "grad_norm": 0.18032257918371308, + "learning_rate": 1.532849888393646e-05, + "loss": 0.0033, + "step": 10917 + }, + { + "epoch": 4.440016266775112, + "grad_norm": 6.941562544708891, + "learning_rate": 1.5327641486820946e-05, + "loss": 0.1929, + "step": 10918 + }, + { + "epoch": 4.440422936152908, + "grad_norm": 6.5996814532209145, + "learning_rate": 1.532678403501451e-05, + "loss": 0.4037, + "step": 10919 + }, + { + "epoch": 4.440829605530704, + "grad_norm": 8.770128816463483, + "learning_rate": 1.532592652852595e-05, + "loss": 0.2002, + "step": 10920 + }, + { + "epoch": 4.4412362749084995, + "grad_norm": 6.459967570687116, + "learning_rate": 1.5325068967364077e-05, + "loss": 0.1561, + "step": 10921 + }, + { + "epoch": 4.441642944286295, + "grad_norm": 6.696221501698169, + "learning_rate": 1.532421135153769e-05, + "loss": 0.2256, + "step": 10922 + }, + { + "epoch": 4.442049613664091, + "grad_norm": 2.3345764581800785, + "learning_rate": 1.5323353681055596e-05, + "loss": 0.0371, + 
"step": 10923 + }, + { + "epoch": 4.442456283041887, + "grad_norm": 8.89540169604741, + "learning_rate": 1.5322495955926596e-05, + "loss": 0.3172, + "step": 10924 + }, + { + "epoch": 4.4428629524196825, + "grad_norm": 0.7269531548154834, + "learning_rate": 1.53216381761595e-05, + "loss": 0.0098, + "step": 10925 + }, + { + "epoch": 4.443269621797478, + "grad_norm": 26.946980762904538, + "learning_rate": 1.5320780341763103e-05, + "loss": 1.3777, + "step": 10926 + }, + { + "epoch": 4.443676291175274, + "grad_norm": 3.9086468068740836, + "learning_rate": 1.5319922452746218e-05, + "loss": 0.0901, + "step": 10927 + }, + { + "epoch": 4.444082960553071, + "grad_norm": 7.206529166338086, + "learning_rate": 1.5319064509117656e-05, + "loss": 0.2594, + "step": 10928 + }, + { + "epoch": 4.4444896299308665, + "grad_norm": 5.108014279756893, + "learning_rate": 1.531820651088622e-05, + "loss": 0.2969, + "step": 10929 + }, + { + "epoch": 4.444896299308662, + "grad_norm": 0.2112911999877708, + "learning_rate": 1.5317348458060715e-05, + "loss": 0.0041, + "step": 10930 + }, + { + "epoch": 4.445302968686458, + "grad_norm": 10.147512445510921, + "learning_rate": 1.5316490350649954e-05, + "loss": 1.0426, + "step": 10931 + }, + { + "epoch": 4.445709638064254, + "grad_norm": 3.358471323327765, + "learning_rate": 1.5315632188662742e-05, + "loss": 0.0453, + "step": 10932 + }, + { + "epoch": 4.4461163074420496, + "grad_norm": 4.605245582079255, + "learning_rate": 1.531477397210789e-05, + "loss": 0.0835, + "step": 10933 + }, + { + "epoch": 4.446522976819845, + "grad_norm": 6.557822773949562, + "learning_rate": 1.531391570099421e-05, + "loss": 0.472, + "step": 10934 + }, + { + "epoch": 4.446929646197641, + "grad_norm": 8.93474659989219, + "learning_rate": 1.531305737533051e-05, + "loss": 0.3181, + "step": 10935 + }, + { + "epoch": 4.447336315575437, + "grad_norm": 0.2872203297050985, + "learning_rate": 1.5312198995125603e-05, + "loss": 0.0052, + "step": 10936 + }, + { + "epoch": 4.447742984953233, + "grad_norm": 7.009405264746776, + "learning_rate": 1.53113405603883e-05, + "loss": 0.1948, + "step": 10937 + }, + { + "epoch": 4.448149654331029, + "grad_norm": 0.8702619359096435, + "learning_rate": 1.531048207112741e-05, + "loss": 0.0131, + "step": 10938 + }, + { + "epoch": 4.448556323708825, + "grad_norm": 2.028957463103487, + "learning_rate": 1.5309623527351752e-05, + "loss": 0.0362, + "step": 10939 + }, + { + "epoch": 4.448962993086621, + "grad_norm": 3.075719594499907, + "learning_rate": 1.5308764929070134e-05, + "loss": 0.0856, + "step": 10940 + }, + { + "epoch": 4.449369662464417, + "grad_norm": 0.46503800152109265, + "learning_rate": 1.5307906276291372e-05, + "loss": 0.0081, + "step": 10941 + }, + { + "epoch": 4.449776331842212, + "grad_norm": 13.463266476180504, + "learning_rate": 1.5307047569024286e-05, + "loss": 0.4534, + "step": 10942 + }, + { + "epoch": 4.450183001220008, + "grad_norm": 1.0867762528856246, + "learning_rate": 1.530618880727768e-05, + "loss": 0.0276, + "step": 10943 + }, + { + "epoch": 4.450589670597804, + "grad_norm": 4.5321806547469725, + "learning_rate": 1.5305329991060376e-05, + "loss": 0.0972, + "step": 10944 + }, + { + "epoch": 4.4509963399756, + "grad_norm": 7.741717299804457, + "learning_rate": 1.530447112038119e-05, + "loss": 0.3061, + "step": 10945 + }, + { + "epoch": 4.451403009353395, + "grad_norm": 6.562958793837837, + "learning_rate": 1.530361219524894e-05, + "loss": 0.1327, + "step": 10946 + }, + { + "epoch": 4.451809678731191, + "grad_norm": 0.12959789405905314, + 
"learning_rate": 1.530275321567244e-05, + "loss": 0.0021, + "step": 10947 + }, + { + "epoch": 4.452216348108988, + "grad_norm": 10.715940797921585, + "learning_rate": 1.5301894181660507e-05, + "loss": 0.5487, + "step": 10948 + }, + { + "epoch": 4.452623017486784, + "grad_norm": 0.6149109336942143, + "learning_rate": 1.5301035093221967e-05, + "loss": 0.0124, + "step": 10949 + }, + { + "epoch": 4.453029686864579, + "grad_norm": 0.27203603991038916, + "learning_rate": 1.5300175950365632e-05, + "loss": 0.0038, + "step": 10950 + }, + { + "epoch": 4.453436356242375, + "grad_norm": 5.814820196526654, + "learning_rate": 1.5299316753100322e-05, + "loss": 0.3924, + "step": 10951 + }, + { + "epoch": 4.453843025620171, + "grad_norm": 2.554621546882632, + "learning_rate": 1.529845750143486e-05, + "loss": 0.0892, + "step": 10952 + }, + { + "epoch": 4.454249694997967, + "grad_norm": 0.2145905121546494, + "learning_rate": 1.529759819537806e-05, + "loss": 0.0033, + "step": 10953 + }, + { + "epoch": 4.454656364375762, + "grad_norm": 4.97035685941686, + "learning_rate": 1.5296738834938756e-05, + "loss": 0.1184, + "step": 10954 + }, + { + "epoch": 4.455063033753558, + "grad_norm": 9.71732266407682, + "learning_rate": 1.529587942012576e-05, + "loss": 0.2486, + "step": 10955 + }, + { + "epoch": 4.455469703131354, + "grad_norm": 5.973774428865413, + "learning_rate": 1.5295019950947895e-05, + "loss": 0.1699, + "step": 10956 + }, + { + "epoch": 4.45587637250915, + "grad_norm": 4.933368477620321, + "learning_rate": 1.5294160427413985e-05, + "loss": 0.1371, + "step": 10957 + }, + { + "epoch": 4.456283041886946, + "grad_norm": 6.271797009477593, + "learning_rate": 1.5293300849532855e-05, + "loss": 0.3951, + "step": 10958 + }, + { + "epoch": 4.456689711264742, + "grad_norm": 13.765051022366666, + "learning_rate": 1.5292441217313324e-05, + "loss": 0.4383, + "step": 10959 + }, + { + "epoch": 4.457096380642538, + "grad_norm": 8.639315089287143, + "learning_rate": 1.5291581530764227e-05, + "loss": 0.5399, + "step": 10960 + }, + { + "epoch": 4.457503050020334, + "grad_norm": 11.954694922827192, + "learning_rate": 1.5290721789894378e-05, + "loss": 0.3701, + "step": 10961 + }, + { + "epoch": 4.457909719398129, + "grad_norm": 0.12465296755324526, + "learning_rate": 1.5289861994712604e-05, + "loss": 0.0021, + "step": 10962 + }, + { + "epoch": 4.458316388775925, + "grad_norm": 8.25336428748207, + "learning_rate": 1.528900214522774e-05, + "loss": 0.3383, + "step": 10963 + }, + { + "epoch": 4.458723058153721, + "grad_norm": 0.5150744071741332, + "learning_rate": 1.5288142241448603e-05, + "loss": 0.0103, + "step": 10964 + }, + { + "epoch": 4.459129727531517, + "grad_norm": 12.392692537658863, + "learning_rate": 1.5287282283384026e-05, + "loss": 0.2758, + "step": 10965 + }, + { + "epoch": 4.4595363969093125, + "grad_norm": 2.4227041792026514, + "learning_rate": 1.5286422271042835e-05, + "loss": 0.0292, + "step": 10966 + }, + { + "epoch": 4.459943066287108, + "grad_norm": 12.087350615234886, + "learning_rate": 1.5285562204433862e-05, + "loss": 0.7318, + "step": 10967 + }, + { + "epoch": 4.460349735664904, + "grad_norm": 8.926619324373073, + "learning_rate": 1.5284702083565928e-05, + "loss": 0.5233, + "step": 10968 + }, + { + "epoch": 4.460756405042701, + "grad_norm": 0.5006471991611156, + "learning_rate": 1.528384190844787e-05, + "loss": 0.0069, + "step": 10969 + }, + { + "epoch": 4.461163074420496, + "grad_norm": 2.6174386668876517, + "learning_rate": 1.5282981679088517e-05, + "loss": 0.0793, + "step": 10970 + }, + { + "epoch": 
4.461569743798292, + "grad_norm": 1.9856445306086044, + "learning_rate": 1.5282121395496696e-05, + "loss": 0.0746, + "step": 10971 + }, + { + "epoch": 4.461976413176088, + "grad_norm": 7.56529774548624, + "learning_rate": 1.5281261057681242e-05, + "loss": 0.3032, + "step": 10972 + }, + { + "epoch": 4.462383082553884, + "grad_norm": 1.3939837052120936, + "learning_rate": 1.5280400665650983e-05, + "loss": 0.0082, + "step": 10973 + }, + { + "epoch": 4.4627897519316795, + "grad_norm": 0.47931399304020994, + "learning_rate": 1.527954021941475e-05, + "loss": 0.0083, + "step": 10974 + }, + { + "epoch": 4.463196421309475, + "grad_norm": 5.143683157839746, + "learning_rate": 1.5278679718981385e-05, + "loss": 0.0928, + "step": 10975 + }, + { + "epoch": 4.463603090687271, + "grad_norm": 0.13293133722577846, + "learning_rate": 1.5277819164359716e-05, + "loss": 0.0029, + "step": 10976 + }, + { + "epoch": 4.464009760065067, + "grad_norm": 7.152373915908957, + "learning_rate": 1.5276958555558578e-05, + "loss": 0.243, + "step": 10977 + }, + { + "epoch": 4.4644164294428625, + "grad_norm": 0.2042947368808819, + "learning_rate": 1.5276097892586802e-05, + "loss": 0.0022, + "step": 10978 + }, + { + "epoch": 4.464823098820659, + "grad_norm": 2.265553364271214, + "learning_rate": 1.5275237175453226e-05, + "loss": 0.0359, + "step": 10979 + }, + { + "epoch": 4.465229768198455, + "grad_norm": 1.1317424597035257, + "learning_rate": 1.5274376404166685e-05, + "loss": 0.0248, + "step": 10980 + }, + { + "epoch": 4.465636437576251, + "grad_norm": 11.64951657448309, + "learning_rate": 1.5273515578736017e-05, + "loss": 0.5809, + "step": 10981 + }, + { + "epoch": 4.4660431069540465, + "grad_norm": 4.216346894588908, + "learning_rate": 1.5272654699170056e-05, + "loss": 0.1296, + "step": 10982 + }, + { + "epoch": 4.466449776331842, + "grad_norm": 4.462269728142915, + "learning_rate": 1.5271793765477643e-05, + "loss": 0.12, + "step": 10983 + }, + { + "epoch": 4.466856445709638, + "grad_norm": 7.278534105064849, + "learning_rate": 1.527093277766761e-05, + "loss": 0.1447, + "step": 10984 + }, + { + "epoch": 4.467263115087434, + "grad_norm": 9.186083547974995, + "learning_rate": 1.5270071735748802e-05, + "loss": 0.4, + "step": 10985 + }, + { + "epoch": 4.4676697844652296, + "grad_norm": 3.7224947281188765, + "learning_rate": 1.5269210639730056e-05, + "loss": 0.0694, + "step": 10986 + }, + { + "epoch": 4.468076453843025, + "grad_norm": 8.331183678805893, + "learning_rate": 1.526834948962021e-05, + "loss": 0.4179, + "step": 10987 + }, + { + "epoch": 4.468483123220821, + "grad_norm": 5.394265155872735, + "learning_rate": 1.5267488285428104e-05, + "loss": 0.0663, + "step": 10988 + }, + { + "epoch": 4.468889792598618, + "grad_norm": 0.3739296644580445, + "learning_rate": 1.5266627027162582e-05, + "loss": 0.007, + "step": 10989 + }, + { + "epoch": 4.4692964619764135, + "grad_norm": 5.174775773531117, + "learning_rate": 1.526576571483248e-05, + "loss": 0.1224, + "step": 10990 + }, + { + "epoch": 4.469703131354209, + "grad_norm": 3.0262000395931397, + "learning_rate": 1.526490434844664e-05, + "loss": 0.0162, + "step": 10991 + }, + { + "epoch": 4.470109800732005, + "grad_norm": 4.682486982892188, + "learning_rate": 1.5264042928013915e-05, + "loss": 0.0694, + "step": 10992 + }, + { + "epoch": 4.470516470109801, + "grad_norm": 1.9784819498248911, + "learning_rate": 1.526318145354314e-05, + "loss": 0.0387, + "step": 10993 + }, + { + "epoch": 4.470923139487597, + "grad_norm": 5.250420917868896, + "learning_rate": 1.5262319925043154e-05, + 
"loss": 0.1884, + "step": 10994 + }, + { + "epoch": 4.471329808865392, + "grad_norm": 12.621947762283604, + "learning_rate": 1.5261458342522805e-05, + "loss": 0.6365, + "step": 10995 + }, + { + "epoch": 4.471736478243188, + "grad_norm": 0.7205639982600951, + "learning_rate": 1.5260596705990936e-05, + "loss": 0.0098, + "step": 10996 + }, + { + "epoch": 4.472143147620984, + "grad_norm": 10.092785354223354, + "learning_rate": 1.52597350154564e-05, + "loss": 0.7107, + "step": 10997 + }, + { + "epoch": 4.47254981699878, + "grad_norm": 1.5389372648274824, + "learning_rate": 1.5258873270928035e-05, + "loss": 0.0352, + "step": 10998 + }, + { + "epoch": 4.472956486376576, + "grad_norm": 3.5060125953031904, + "learning_rate": 1.5258011472414688e-05, + "loss": 0.121, + "step": 10999 + }, + { + "epoch": 4.473363155754372, + "grad_norm": 0.40185117860493647, + "learning_rate": 1.5257149619925207e-05, + "loss": 0.0072, + "step": 11000 + }, + { + "epoch": 4.473769825132168, + "grad_norm": 2.39606845310912, + "learning_rate": 1.5256287713468441e-05, + "loss": 0.0502, + "step": 11001 + }, + { + "epoch": 4.474176494509964, + "grad_norm": 2.618519081133416, + "learning_rate": 1.5255425753053232e-05, + "loss": 0.0507, + "step": 11002 + }, + { + "epoch": 4.474583163887759, + "grad_norm": 3.7731489191994854, + "learning_rate": 1.5254563738688438e-05, + "loss": 0.0689, + "step": 11003 + }, + { + "epoch": 4.474989833265555, + "grad_norm": 7.406828984558891, + "learning_rate": 1.5253701670382898e-05, + "loss": 0.333, + "step": 11004 + }, + { + "epoch": 4.475396502643351, + "grad_norm": 8.333502413858552, + "learning_rate": 1.525283954814547e-05, + "loss": 0.645, + "step": 11005 + }, + { + "epoch": 4.475803172021147, + "grad_norm": 4.845324517265604, + "learning_rate": 1.5251977371985001e-05, + "loss": 0.1219, + "step": 11006 + }, + { + "epoch": 4.476209841398942, + "grad_norm": 9.957083865867002, + "learning_rate": 1.5251115141910338e-05, + "loss": 0.391, + "step": 11007 + }, + { + "epoch": 4.476616510776738, + "grad_norm": 5.517900301021678, + "learning_rate": 1.5250252857930337e-05, + "loss": 0.0964, + "step": 11008 + }, + { + "epoch": 4.477023180154534, + "grad_norm": 6.703988858535519, + "learning_rate": 1.5249390520053848e-05, + "loss": 0.2888, + "step": 11009 + }, + { + "epoch": 4.477429849532331, + "grad_norm": 9.648302774318823, + "learning_rate": 1.524852812828972e-05, + "loss": 0.2948, + "step": 11010 + }, + { + "epoch": 4.477836518910126, + "grad_norm": 0.05555896058225494, + "learning_rate": 1.5247665682646813e-05, + "loss": 0.001, + "step": 11011 + }, + { + "epoch": 4.478243188287922, + "grad_norm": 3.0249668567242085, + "learning_rate": 1.5246803183133976e-05, + "loss": 0.0457, + "step": 11012 + }, + { + "epoch": 4.478649857665718, + "grad_norm": 0.06797582586235582, + "learning_rate": 1.524594062976006e-05, + "loss": 0.0009, + "step": 11013 + }, + { + "epoch": 4.479056527043514, + "grad_norm": 1.998110827284127, + "learning_rate": 1.5245078022533928e-05, + "loss": 0.0354, + "step": 11014 + }, + { + "epoch": 4.479463196421309, + "grad_norm": 10.326454649317085, + "learning_rate": 1.5244215361464427e-05, + "loss": 0.3122, + "step": 11015 + }, + { + "epoch": 4.479869865799105, + "grad_norm": 4.008764822814887, + "learning_rate": 1.5243352646560419e-05, + "loss": 0.0637, + "step": 11016 + }, + { + "epoch": 4.480276535176901, + "grad_norm": 6.969528834026428, + "learning_rate": 1.5242489877830755e-05, + "loss": 0.3354, + "step": 11017 + }, + { + "epoch": 4.480683204554697, + "grad_norm": 
0.7370353978465269, + "learning_rate": 1.5241627055284296e-05, + "loss": 0.0097, + "step": 11018 + }, + { + "epoch": 4.4810898739324925, + "grad_norm": 0.4054236141703111, + "learning_rate": 1.5240764178929896e-05, + "loss": 0.004, + "step": 11019 + }, + { + "epoch": 4.481496543310289, + "grad_norm": 3.7355429866927454, + "learning_rate": 1.5239901248776411e-05, + "loss": 0.0633, + "step": 11020 + }, + { + "epoch": 4.481903212688085, + "grad_norm": 1.5350030184041985, + "learning_rate": 1.5239038264832706e-05, + "loss": 0.037, + "step": 11021 + }, + { + "epoch": 4.482309882065881, + "grad_norm": 4.459364636303508, + "learning_rate": 1.5238175227107637e-05, + "loss": 0.0892, + "step": 11022 + }, + { + "epoch": 4.482716551443676, + "grad_norm": 4.326598043689653, + "learning_rate": 1.5237312135610061e-05, + "loss": 0.1224, + "step": 11023 + }, + { + "epoch": 4.483123220821472, + "grad_norm": 2.167536759711711, + "learning_rate": 1.523644899034884e-05, + "loss": 0.0364, + "step": 11024 + }, + { + "epoch": 4.483529890199268, + "grad_norm": 10.399683306466022, + "learning_rate": 1.5235585791332836e-05, + "loss": 0.5096, + "step": 11025 + }, + { + "epoch": 4.483936559577064, + "grad_norm": 5.307723117989034, + "learning_rate": 1.5234722538570907e-05, + "loss": 0.144, + "step": 11026 + }, + { + "epoch": 4.4843432289548595, + "grad_norm": 11.083438529803653, + "learning_rate": 1.523385923207192e-05, + "loss": 0.3652, + "step": 11027 + }, + { + "epoch": 4.484749898332655, + "grad_norm": 1.476170431667853, + "learning_rate": 1.5232995871844732e-05, + "loss": 0.0243, + "step": 11028 + }, + { + "epoch": 4.485156567710451, + "grad_norm": 0.3333994884457293, + "learning_rate": 1.5232132457898207e-05, + "loss": 0.0054, + "step": 11029 + }, + { + "epoch": 4.485563237088248, + "grad_norm": 13.058097245174162, + "learning_rate": 1.5231268990241211e-05, + "loss": 0.6296, + "step": 11030 + }, + { + "epoch": 4.485969906466043, + "grad_norm": 3.2025638969946626, + "learning_rate": 1.5230405468882602e-05, + "loss": 0.0468, + "step": 11031 + }, + { + "epoch": 4.486376575843839, + "grad_norm": 5.578282006531005, + "learning_rate": 1.5229541893831253e-05, + "loss": 0.1011, + "step": 11032 + }, + { + "epoch": 4.486783245221635, + "grad_norm": 1.1168913510726657, + "learning_rate": 1.522867826509602e-05, + "loss": 0.0219, + "step": 11033 + }, + { + "epoch": 4.487189914599431, + "grad_norm": 10.128857432276096, + "learning_rate": 1.5227814582685776e-05, + "loss": 0.3873, + "step": 11034 + }, + { + "epoch": 4.4875965839772265, + "grad_norm": 8.393441851813366, + "learning_rate": 1.5226950846609385e-05, + "loss": 0.133, + "step": 11035 + }, + { + "epoch": 4.488003253355022, + "grad_norm": 9.437919316169847, + "learning_rate": 1.5226087056875709e-05, + "loss": 0.2838, + "step": 11036 + }, + { + "epoch": 4.488409922732818, + "grad_norm": 9.67077741303555, + "learning_rate": 1.5225223213493623e-05, + "loss": 0.6631, + "step": 11037 + }, + { + "epoch": 4.488816592110614, + "grad_norm": 0.11373483282327689, + "learning_rate": 1.5224359316471988e-05, + "loss": 0.0016, + "step": 11038 + }, + { + "epoch": 4.4892232614884096, + "grad_norm": 4.011948508371669, + "learning_rate": 1.522349536581968e-05, + "loss": 0.0673, + "step": 11039 + }, + { + "epoch": 4.489629930866206, + "grad_norm": 3.534586790359139, + "learning_rate": 1.5222631361545558e-05, + "loss": 0.0365, + "step": 11040 + }, + { + "epoch": 4.490036600244002, + "grad_norm": 2.7860882697579465, + "learning_rate": 1.52217673036585e-05, + "loss": 0.0664, + "step": 11041 + 
}, + { + "epoch": 4.490443269621798, + "grad_norm": 4.050100330119834, + "learning_rate": 1.5220903192167371e-05, + "loss": 0.1419, + "step": 11042 + }, + { + "epoch": 4.4908499389995935, + "grad_norm": 5.692080100493615, + "learning_rate": 1.5220039027081041e-05, + "loss": 0.1708, + "step": 11043 + }, + { + "epoch": 4.491256608377389, + "grad_norm": 8.291865620889936, + "learning_rate": 1.5219174808408386e-05, + "loss": 0.2341, + "step": 11044 + }, + { + "epoch": 4.491663277755185, + "grad_norm": 0.690996672558823, + "learning_rate": 1.5218310536158276e-05, + "loss": 0.007, + "step": 11045 + }, + { + "epoch": 4.492069947132981, + "grad_norm": 6.976617469583054, + "learning_rate": 1.5217446210339577e-05, + "loss": 0.1976, + "step": 11046 + }, + { + "epoch": 4.492476616510777, + "grad_norm": 3.7323503452943294, + "learning_rate": 1.521658183096117e-05, + "loss": 0.0617, + "step": 11047 + }, + { + "epoch": 4.492883285888572, + "grad_norm": 1.3530136210069377, + "learning_rate": 1.5215717398031928e-05, + "loss": 0.018, + "step": 11048 + }, + { + "epoch": 4.493289955266368, + "grad_norm": 5.6961670233751, + "learning_rate": 1.5214852911560716e-05, + "loss": 0.0791, + "step": 11049 + }, + { + "epoch": 4.493696624644164, + "grad_norm": 3.78109644063133, + "learning_rate": 1.5213988371556419e-05, + "loss": 0.1561, + "step": 11050 + }, + { + "epoch": 4.4941032940219605, + "grad_norm": 0.41401317297177553, + "learning_rate": 1.5213123778027905e-05, + "loss": 0.008, + "step": 11051 + }, + { + "epoch": 4.494509963399756, + "grad_norm": 1.991571946507031, + "learning_rate": 1.5212259130984053e-05, + "loss": 0.0324, + "step": 11052 + }, + { + "epoch": 4.494916632777552, + "grad_norm": 0.6418104074307638, + "learning_rate": 1.5211394430433734e-05, + "loss": 0.0095, + "step": 11053 + }, + { + "epoch": 4.495323302155348, + "grad_norm": 5.53449723735916, + "learning_rate": 1.5210529676385833e-05, + "loss": 0.2433, + "step": 11054 + }, + { + "epoch": 4.495729971533144, + "grad_norm": 8.265289754764053, + "learning_rate": 1.520966486884922e-05, + "loss": 0.2384, + "step": 11055 + }, + { + "epoch": 4.496136640910939, + "grad_norm": 1.3483778392080161, + "learning_rate": 1.5208800007832779e-05, + "loss": 0.0198, + "step": 11056 + }, + { + "epoch": 4.496543310288735, + "grad_norm": 4.843543418718171, + "learning_rate": 1.520793509334538e-05, + "loss": 0.0975, + "step": 11057 + }, + { + "epoch": 4.496949979666531, + "grad_norm": 3.7491849753075113, + "learning_rate": 1.5207070125395906e-05, + "loss": 0.1141, + "step": 11058 + }, + { + "epoch": 4.497356649044327, + "grad_norm": 13.145093964467689, + "learning_rate": 1.5206205103993239e-05, + "loss": 0.3372, + "step": 11059 + }, + { + "epoch": 4.497763318422122, + "grad_norm": 2.766702946228544, + "learning_rate": 1.5205340029146256e-05, + "loss": 0.0383, + "step": 11060 + }, + { + "epoch": 4.498169987799919, + "grad_norm": 9.002606798351295, + "learning_rate": 1.520447490086384e-05, + "loss": 0.2893, + "step": 11061 + }, + { + "epoch": 4.498576657177715, + "grad_norm": 0.13895089065096228, + "learning_rate": 1.5203609719154865e-05, + "loss": 0.0026, + "step": 11062 + }, + { + "epoch": 4.498983326555511, + "grad_norm": 9.957171862280205, + "learning_rate": 1.5202744484028217e-05, + "loss": 0.6711, + "step": 11063 + }, + { + "epoch": 4.499389995933306, + "grad_norm": 13.765556075478896, + "learning_rate": 1.5201879195492782e-05, + "loss": 0.4307, + "step": 11064 + }, + { + "epoch": 4.499796665311102, + "grad_norm": 12.518115903028379, + "learning_rate": 
1.5201013853557437e-05, + "loss": 0.3678, + "step": 11065 + }, + { + "epoch": 4.500203334688898, + "grad_norm": 9.365867735734097, + "learning_rate": 1.5200148458231067e-05, + "loss": 0.2615, + "step": 11066 + }, + { + "epoch": 4.500610004066694, + "grad_norm": 0.117185013113388, + "learning_rate": 1.5199283009522558e-05, + "loss": 0.0016, + "step": 11067 + }, + { + "epoch": 4.501016673444489, + "grad_norm": 8.148475047525157, + "learning_rate": 1.5198417507440791e-05, + "loss": 0.3385, + "step": 11068 + }, + { + "epoch": 4.501423342822285, + "grad_norm": 9.0361165386974, + "learning_rate": 1.5197551951994652e-05, + "loss": 0.1961, + "step": 11069 + }, + { + "epoch": 4.501830012200081, + "grad_norm": 9.977319904111015, + "learning_rate": 1.5196686343193025e-05, + "loss": 0.222, + "step": 11070 + }, + { + "epoch": 4.502236681577877, + "grad_norm": 0.1321788315392776, + "learning_rate": 1.5195820681044797e-05, + "loss": 0.0023, + "step": 11071 + }, + { + "epoch": 4.502643350955673, + "grad_norm": 0.4042021898167717, + "learning_rate": 1.5194954965558857e-05, + "loss": 0.006, + "step": 11072 + }, + { + "epoch": 4.503050020333469, + "grad_norm": 10.384980471951497, + "learning_rate": 1.519408919674409e-05, + "loss": 0.3532, + "step": 11073 + }, + { + "epoch": 4.503456689711265, + "grad_norm": 0.12231475586115938, + "learning_rate": 1.5193223374609383e-05, + "loss": 0.002, + "step": 11074 + }, + { + "epoch": 4.503863359089061, + "grad_norm": 4.312879779741208, + "learning_rate": 1.5192357499163622e-05, + "loss": 0.0933, + "step": 11075 + }, + { + "epoch": 4.504270028466856, + "grad_norm": 5.0259580023636365, + "learning_rate": 1.5191491570415699e-05, + "loss": 0.1012, + "step": 11076 + }, + { + "epoch": 4.504676697844652, + "grad_norm": 5.707774380493716, + "learning_rate": 1.5190625588374503e-05, + "loss": 0.18, + "step": 11077 + }, + { + "epoch": 4.505083367222448, + "grad_norm": 0.25819537374483204, + "learning_rate": 1.5189759553048922e-05, + "loss": 0.0029, + "step": 11078 + }, + { + "epoch": 4.505490036600244, + "grad_norm": 4.314253290633308, + "learning_rate": 1.5188893464447847e-05, + "loss": 0.104, + "step": 11079 + }, + { + "epoch": 4.5058967059780395, + "grad_norm": 5.922507688221263, + "learning_rate": 1.518802732258017e-05, + "loss": 0.1228, + "step": 11080 + }, + { + "epoch": 4.506303375355836, + "grad_norm": 0.32639523785224656, + "learning_rate": 1.5187161127454781e-05, + "loss": 0.005, + "step": 11081 + }, + { + "epoch": 4.506710044733632, + "grad_norm": 12.097939364477304, + "learning_rate": 1.5186294879080574e-05, + "loss": 0.4918, + "step": 11082 + }, + { + "epoch": 4.507116714111428, + "grad_norm": 3.6674294426608274, + "learning_rate": 1.5185428577466438e-05, + "loss": 0.0609, + "step": 11083 + }, + { + "epoch": 4.507523383489223, + "grad_norm": 8.116141444139256, + "learning_rate": 1.5184562222621268e-05, + "loss": 0.2042, + "step": 11084 + }, + { + "epoch": 4.507930052867019, + "grad_norm": 8.419031422190253, + "learning_rate": 1.5183695814553958e-05, + "loss": 0.1825, + "step": 11085 + }, + { + "epoch": 4.508336722244815, + "grad_norm": 5.3326714465840945, + "learning_rate": 1.5182829353273403e-05, + "loss": 0.1019, + "step": 11086 + }, + { + "epoch": 4.508743391622611, + "grad_norm": 15.040298148210407, + "learning_rate": 1.5181962838788495e-05, + "loss": 0.5254, + "step": 11087 + }, + { + "epoch": 4.5091500610004065, + "grad_norm": 9.39968835445334, + "learning_rate": 1.518109627110813e-05, + "loss": 0.3522, + "step": 11088 + }, + { + "epoch": 4.509556730378202, + 
"grad_norm": 9.81190954298973, + "learning_rate": 1.5180229650241205e-05, + "loss": 0.3363, + "step": 11089 + }, + { + "epoch": 4.509963399755998, + "grad_norm": 6.339349535920424, + "learning_rate": 1.5179362976196614e-05, + "loss": 0.1495, + "step": 11090 + }, + { + "epoch": 4.510370069133794, + "grad_norm": 0.17403465880373747, + "learning_rate": 1.5178496248983254e-05, + "loss": 0.0026, + "step": 11091 + }, + { + "epoch": 4.5107767385115904, + "grad_norm": 7.60457372323173, + "learning_rate": 1.5177629468610027e-05, + "loss": 0.232, + "step": 11092 + }, + { + "epoch": 4.511183407889386, + "grad_norm": 1.3925816755550278, + "learning_rate": 1.5176762635085828e-05, + "loss": 0.0221, + "step": 11093 + }, + { + "epoch": 4.511590077267182, + "grad_norm": 6.2908895207373, + "learning_rate": 1.5175895748419552e-05, + "loss": 0.07, + "step": 11094 + }, + { + "epoch": 4.511996746644978, + "grad_norm": 2.2157836720214164, + "learning_rate": 1.5175028808620103e-05, + "loss": 0.0372, + "step": 11095 + }, + { + "epoch": 4.5124034160227735, + "grad_norm": 11.976115134407227, + "learning_rate": 1.517416181569638e-05, + "loss": 0.4694, + "step": 11096 + }, + { + "epoch": 4.512810085400569, + "grad_norm": 5.68881862455092, + "learning_rate": 1.5173294769657282e-05, + "loss": 0.1847, + "step": 11097 + }, + { + "epoch": 4.513216754778365, + "grad_norm": 0.12657027822507488, + "learning_rate": 1.5172427670511704e-05, + "loss": 0.0023, + "step": 11098 + }, + { + "epoch": 4.513623424156161, + "grad_norm": 0.5696559263915862, + "learning_rate": 1.5171560518268558e-05, + "loss": 0.0078, + "step": 11099 + }, + { + "epoch": 4.514030093533957, + "grad_norm": 8.590418403688677, + "learning_rate": 1.517069331293674e-05, + "loss": 0.5026, + "step": 11100 + }, + { + "epoch": 4.514436762911753, + "grad_norm": 4.879476084657976, + "learning_rate": 1.516982605452515e-05, + "loss": 0.1007, + "step": 11101 + }, + { + "epoch": 4.514843432289549, + "grad_norm": 5.436755918502896, + "learning_rate": 1.5168958743042697e-05, + "loss": 0.2093, + "step": 11102 + }, + { + "epoch": 4.515250101667345, + "grad_norm": 9.425822670122713, + "learning_rate": 1.516809137849828e-05, + "loss": 0.372, + "step": 11103 + }, + { + "epoch": 4.5156567710451405, + "grad_norm": 9.143389084117004, + "learning_rate": 1.5167223960900802e-05, + "loss": 0.4015, + "step": 11104 + }, + { + "epoch": 4.516063440422936, + "grad_norm": 1.7611831092072157, + "learning_rate": 1.5166356490259172e-05, + "loss": 0.0251, + "step": 11105 + }, + { + "epoch": 4.516470109800732, + "grad_norm": 2.6694544544192262, + "learning_rate": 1.5165488966582293e-05, + "loss": 0.0524, + "step": 11106 + }, + { + "epoch": 4.516876779178528, + "grad_norm": 0.586593389008708, + "learning_rate": 1.5164621389879069e-05, + "loss": 0.0088, + "step": 11107 + }, + { + "epoch": 4.517283448556324, + "grad_norm": 5.972013196732263, + "learning_rate": 1.5163753760158406e-05, + "loss": 0.1065, + "step": 11108 + }, + { + "epoch": 4.517690117934119, + "grad_norm": 11.930626684372125, + "learning_rate": 1.5162886077429213e-05, + "loss": 1.0124, + "step": 11109 + }, + { + "epoch": 4.518096787311915, + "grad_norm": 2.433165390548346, + "learning_rate": 1.5162018341700395e-05, + "loss": 0.0624, + "step": 11110 + }, + { + "epoch": 4.518503456689711, + "grad_norm": 9.826744854390267, + "learning_rate": 1.5161150552980862e-05, + "loss": 0.2769, + "step": 11111 + }, + { + "epoch": 4.518910126067507, + "grad_norm": 1.9252945726781245, + "learning_rate": 1.516028271127952e-05, + "loss": 0.0274, + "step": 
11112 + }, + { + "epoch": 4.519316795445303, + "grad_norm": 8.53634929000625, + "learning_rate": 1.5159414816605278e-05, + "loss": 0.1474, + "step": 11113 + }, + { + "epoch": 4.519723464823099, + "grad_norm": 1.694464658906606, + "learning_rate": 1.5158546868967046e-05, + "loss": 0.029, + "step": 11114 + }, + { + "epoch": 4.520130134200895, + "grad_norm": 6.708963878900822, + "learning_rate": 1.5157678868373738e-05, + "loss": 0.2446, + "step": 11115 + }, + { + "epoch": 4.520536803578691, + "grad_norm": 1.838226173507268, + "learning_rate": 1.5156810814834255e-05, + "loss": 0.0383, + "step": 11116 + }, + { + "epoch": 4.520943472956486, + "grad_norm": 8.334959016295771, + "learning_rate": 1.5155942708357518e-05, + "loss": 0.173, + "step": 11117 + }, + { + "epoch": 4.521350142334282, + "grad_norm": 9.539187296372052, + "learning_rate": 1.5155074548952433e-05, + "loss": 0.2641, + "step": 11118 + }, + { + "epoch": 4.521756811712078, + "grad_norm": 6.077324333031097, + "learning_rate": 1.5154206336627915e-05, + "loss": 0.3414, + "step": 11119 + }, + { + "epoch": 4.522163481089874, + "grad_norm": 2.882855057281098, + "learning_rate": 1.5153338071392872e-05, + "loss": 0.0553, + "step": 11120 + }, + { + "epoch": 4.522570150467669, + "grad_norm": 6.873574775927938, + "learning_rate": 1.515246975325622e-05, + "loss": 0.2424, + "step": 11121 + }, + { + "epoch": 4.522976819845466, + "grad_norm": 5.554948233854312, + "learning_rate": 1.5151601382226875e-05, + "loss": 0.1412, + "step": 11122 + }, + { + "epoch": 4.523383489223262, + "grad_norm": 7.8034544902506715, + "learning_rate": 1.5150732958313745e-05, + "loss": 0.3264, + "step": 11123 + }, + { + "epoch": 4.523790158601058, + "grad_norm": 7.6937295534588905, + "learning_rate": 1.5149864481525754e-05, + "loss": 0.2356, + "step": 11124 + }, + { + "epoch": 4.524196827978853, + "grad_norm": 9.375047246893402, + "learning_rate": 1.514899595187181e-05, + "loss": 0.1532, + "step": 11125 + }, + { + "epoch": 4.524603497356649, + "grad_norm": 7.757315144882038, + "learning_rate": 1.514812736936083e-05, + "loss": 0.2694, + "step": 11126 + }, + { + "epoch": 4.525010166734445, + "grad_norm": 6.68796614750265, + "learning_rate": 1.5147258734001732e-05, + "loss": 0.0865, + "step": 11127 + }, + { + "epoch": 4.525416836112241, + "grad_norm": 0.4770764464752186, + "learning_rate": 1.5146390045803434e-05, + "loss": 0.0078, + "step": 11128 + }, + { + "epoch": 4.525823505490036, + "grad_norm": 16.1421867300079, + "learning_rate": 1.514552130477485e-05, + "loss": 0.5358, + "step": 11129 + }, + { + "epoch": 4.526230174867832, + "grad_norm": 11.079108095230186, + "learning_rate": 1.5144652510924901e-05, + "loss": 0.4111, + "step": 11130 + }, + { + "epoch": 4.526636844245628, + "grad_norm": 5.425112006545428, + "learning_rate": 1.5143783664262506e-05, + "loss": 0.1188, + "step": 11131 + }, + { + "epoch": 4.527043513623424, + "grad_norm": 1.5101209946467489, + "learning_rate": 1.5142914764796584e-05, + "loss": 0.0364, + "step": 11132 + }, + { + "epoch": 4.52745018300122, + "grad_norm": 6.885014969659529, + "learning_rate": 1.5142045812536048e-05, + "loss": 0.2261, + "step": 11133 + }, + { + "epoch": 4.527856852379016, + "grad_norm": 10.899043276573767, + "learning_rate": 1.5141176807489828e-05, + "loss": 0.363, + "step": 11134 + }, + { + "epoch": 4.528263521756812, + "grad_norm": 0.7848584811050022, + "learning_rate": 1.5140307749666843e-05, + "loss": 0.0107, + "step": 11135 + }, + { + "epoch": 4.528670191134608, + "grad_norm": 7.363843878138071, + "learning_rate": 
1.5139438639076005e-05, + "loss": 0.2174, + "step": 11136 + }, + { + "epoch": 4.529076860512403, + "grad_norm": 0.7270564548374817, + "learning_rate": 1.5138569475726247e-05, + "loss": 0.0117, + "step": 11137 + }, + { + "epoch": 4.529483529890199, + "grad_norm": 12.87710549298979, + "learning_rate": 1.5137700259626489e-05, + "loss": 0.629, + "step": 11138 + }, + { + "epoch": 4.529890199267995, + "grad_norm": 4.551582037578991, + "learning_rate": 1.5136830990785649e-05, + "loss": 0.0859, + "step": 11139 + }, + { + "epoch": 4.530296868645791, + "grad_norm": 0.06830063531973542, + "learning_rate": 1.5135961669212653e-05, + "loss": 0.0012, + "step": 11140 + }, + { + "epoch": 4.5307035380235865, + "grad_norm": 4.479961002201019, + "learning_rate": 1.5135092294916426e-05, + "loss": 0.1192, + "step": 11141 + }, + { + "epoch": 4.531110207401383, + "grad_norm": 3.976842767850063, + "learning_rate": 1.5134222867905894e-05, + "loss": 0.1506, + "step": 11142 + }, + { + "epoch": 4.531516876779179, + "grad_norm": 0.9568657900808716, + "learning_rate": 1.513335338818998e-05, + "loss": 0.0144, + "step": 11143 + }, + { + "epoch": 4.531923546156975, + "grad_norm": 6.586470595659894, + "learning_rate": 1.5132483855777608e-05, + "loss": 0.3535, + "step": 11144 + }, + { + "epoch": 4.5323302155347704, + "grad_norm": 1.9300788872685846, + "learning_rate": 1.5131614270677707e-05, + "loss": 0.0311, + "step": 11145 + }, + { + "epoch": 4.532736884912566, + "grad_norm": 8.568918595109237, + "learning_rate": 1.5130744632899204e-05, + "loss": 0.2449, + "step": 11146 + }, + { + "epoch": 4.533143554290362, + "grad_norm": 14.356409467443898, + "learning_rate": 1.5129874942451025e-05, + "loss": 0.6686, + "step": 11147 + }, + { + "epoch": 4.533550223668158, + "grad_norm": 3.5323512410219737, + "learning_rate": 1.5129005199342097e-05, + "loss": 0.069, + "step": 11148 + }, + { + "epoch": 4.5339568930459535, + "grad_norm": 4.5090462776716205, + "learning_rate": 1.5128135403581347e-05, + "loss": 0.0786, + "step": 11149 + }, + { + "epoch": 4.534363562423749, + "grad_norm": 0.10929579094052927, + "learning_rate": 1.512726555517771e-05, + "loss": 0.002, + "step": 11150 + }, + { + "epoch": 4.534770231801545, + "grad_norm": 11.070736843623235, + "learning_rate": 1.512639565414011e-05, + "loss": 0.3605, + "step": 11151 + }, + { + "epoch": 4.535176901179341, + "grad_norm": 7.367980240904493, + "learning_rate": 1.5125525700477478e-05, + "loss": 0.6479, + "step": 11152 + }, + { + "epoch": 4.535583570557137, + "grad_norm": 16.58285334654549, + "learning_rate": 1.5124655694198747e-05, + "loss": 0.3403, + "step": 11153 + }, + { + "epoch": 4.535990239934933, + "grad_norm": 3.5150074064004952, + "learning_rate": 1.5123785635312844e-05, + "loss": 0.0672, + "step": 11154 + }, + { + "epoch": 4.536396909312729, + "grad_norm": 15.46105046529785, + "learning_rate": 1.5122915523828705e-05, + "loss": 0.6997, + "step": 11155 + }, + { + "epoch": 4.536803578690525, + "grad_norm": 7.336639254506134, + "learning_rate": 1.512204535975526e-05, + "loss": 0.1791, + "step": 11156 + }, + { + "epoch": 4.5372102480683205, + "grad_norm": 6.063889428906203, + "learning_rate": 1.512117514310144e-05, + "loss": 0.2891, + "step": 11157 + }, + { + "epoch": 4.537616917446116, + "grad_norm": 15.869964815480685, + "learning_rate": 1.5120304873876178e-05, + "loss": 0.397, + "step": 11158 + }, + { + "epoch": 4.538023586823912, + "grad_norm": 3.078446015541604, + "learning_rate": 1.5119434552088412e-05, + "loss": 0.065, + "step": 11159 + }, + { + "epoch": 4.538430256201708, 
+ "grad_norm": 2.9229555689025917, + "learning_rate": 1.5118564177747073e-05, + "loss": 0.0818, + "step": 11160 + }, + { + "epoch": 4.538836925579504, + "grad_norm": 6.820075755800655, + "learning_rate": 1.5117693750861096e-05, + "loss": 0.1259, + "step": 11161 + }, + { + "epoch": 4.539243594957299, + "grad_norm": 14.796437584314022, + "learning_rate": 1.5116823271439418e-05, + "loss": 0.9353, + "step": 11162 + }, + { + "epoch": 4.539650264335096, + "grad_norm": 1.4750487740674594, + "learning_rate": 1.5115952739490974e-05, + "loss": 0.0235, + "step": 11163 + }, + { + "epoch": 4.540056933712892, + "grad_norm": 10.635933088075012, + "learning_rate": 1.5115082155024698e-05, + "loss": 0.442, + "step": 11164 + }, + { + "epoch": 4.5404636030906875, + "grad_norm": 6.520248960127809, + "learning_rate": 1.511421151804953e-05, + "loss": 0.1128, + "step": 11165 + }, + { + "epoch": 4.540870272468483, + "grad_norm": 2.936271760166005, + "learning_rate": 1.5113340828574408e-05, + "loss": 0.041, + "step": 11166 + }, + { + "epoch": 4.541276941846279, + "grad_norm": 5.099081166027147, + "learning_rate": 1.5112470086608271e-05, + "loss": 0.1001, + "step": 11167 + }, + { + "epoch": 4.541683611224075, + "grad_norm": 5.448983825555474, + "learning_rate": 1.5111599292160053e-05, + "loss": 0.2405, + "step": 11168 + }, + { + "epoch": 4.542090280601871, + "grad_norm": 9.222350090974603, + "learning_rate": 1.5110728445238695e-05, + "loss": 0.3877, + "step": 11169 + }, + { + "epoch": 4.542496949979666, + "grad_norm": 12.605869458546687, + "learning_rate": 1.5109857545853137e-05, + "loss": 0.4152, + "step": 11170 + }, + { + "epoch": 4.542903619357462, + "grad_norm": 4.895178035633696, + "learning_rate": 1.5108986594012321e-05, + "loss": 0.0986, + "step": 11171 + }, + { + "epoch": 4.543310288735258, + "grad_norm": 9.538855072018789, + "learning_rate": 1.5108115589725185e-05, + "loss": 0.1323, + "step": 11172 + }, + { + "epoch": 4.543716958113054, + "grad_norm": 6.467805873037004, + "learning_rate": 1.5107244533000676e-05, + "loss": 0.1463, + "step": 11173 + }, + { + "epoch": 4.54412362749085, + "grad_norm": 0.17678752130822734, + "learning_rate": 1.5106373423847727e-05, + "loss": 0.0035, + "step": 11174 + }, + { + "epoch": 4.544530296868646, + "grad_norm": 8.647071169507933, + "learning_rate": 1.5105502262275285e-05, + "loss": 0.4078, + "step": 11175 + }, + { + "epoch": 4.544936966246442, + "grad_norm": 5.284375784045995, + "learning_rate": 1.5104631048292295e-05, + "loss": 0.3244, + "step": 11176 + }, + { + "epoch": 4.545343635624238, + "grad_norm": 4.252617899256871, + "learning_rate": 1.5103759781907696e-05, + "loss": 0.2392, + "step": 11177 + }, + { + "epoch": 4.545750305002033, + "grad_norm": 11.857527543088624, + "learning_rate": 1.5102888463130434e-05, + "loss": 0.3155, + "step": 11178 + }, + { + "epoch": 4.546156974379829, + "grad_norm": 1.1529047229950438, + "learning_rate": 1.5102017091969457e-05, + "loss": 0.0194, + "step": 11179 + }, + { + "epoch": 4.546563643757625, + "grad_norm": 8.676684461593783, + "learning_rate": 1.5101145668433704e-05, + "loss": 0.1396, + "step": 11180 + }, + { + "epoch": 4.546970313135421, + "grad_norm": 0.6173768285909936, + "learning_rate": 1.5100274192532123e-05, + "loss": 0.0109, + "step": 11181 + }, + { + "epoch": 4.547376982513216, + "grad_norm": 9.307659440049248, + "learning_rate": 1.509940266427366e-05, + "loss": 0.2488, + "step": 11182 + }, + { + "epoch": 4.547783651891013, + "grad_norm": 8.789985369815097, + "learning_rate": 1.5098531083667264e-05, + "loss": 0.7703, + 
"step": 11183 + }, + { + "epoch": 4.548190321268809, + "grad_norm": 9.994824237099103, + "learning_rate": 1.509765945072188e-05, + "loss": 0.2937, + "step": 11184 + }, + { + "epoch": 4.548596990646605, + "grad_norm": 5.167620828635057, + "learning_rate": 1.5096787765446456e-05, + "loss": 0.3005, + "step": 11185 + }, + { + "epoch": 4.5490036600244, + "grad_norm": 0.7458599010263192, + "learning_rate": 1.509591602784994e-05, + "loss": 0.0104, + "step": 11186 + }, + { + "epoch": 4.549410329402196, + "grad_norm": 9.681975323981026, + "learning_rate": 1.5095044237941281e-05, + "loss": 0.354, + "step": 11187 + }, + { + "epoch": 4.549816998779992, + "grad_norm": 9.040408536216367, + "learning_rate": 1.509417239572943e-05, + "loss": 0.2319, + "step": 11188 + }, + { + "epoch": 4.550223668157788, + "grad_norm": 6.319714854577035, + "learning_rate": 1.5093300501223335e-05, + "loss": 0.257, + "step": 11189 + }, + { + "epoch": 4.550630337535583, + "grad_norm": 0.3776641752021825, + "learning_rate": 1.5092428554431948e-05, + "loss": 0.0069, + "step": 11190 + }, + { + "epoch": 4.551037006913379, + "grad_norm": 3.579323143173556, + "learning_rate": 1.5091556555364215e-05, + "loss": 0.0689, + "step": 11191 + }, + { + "epoch": 4.551443676291175, + "grad_norm": 7.662241908572237, + "learning_rate": 1.5090684504029095e-05, + "loss": 0.2587, + "step": 11192 + }, + { + "epoch": 4.551850345668971, + "grad_norm": 2.591162019368357, + "learning_rate": 1.5089812400435538e-05, + "loss": 0.0405, + "step": 11193 + }, + { + "epoch": 4.5522570150467665, + "grad_norm": 8.314296527155792, + "learning_rate": 1.5088940244592491e-05, + "loss": 0.2159, + "step": 11194 + }, + { + "epoch": 4.552663684424563, + "grad_norm": 5.642023413979747, + "learning_rate": 1.5088068036508912e-05, + "loss": 0.1343, + "step": 11195 + }, + { + "epoch": 4.553070353802359, + "grad_norm": 2.1327024593453965, + "learning_rate": 1.5087195776193757e-05, + "loss": 0.0227, + "step": 11196 + }, + { + "epoch": 4.553477023180155, + "grad_norm": 4.65041715588796, + "learning_rate": 1.5086323463655972e-05, + "loss": 0.1176, + "step": 11197 + }, + { + "epoch": 4.5538836925579504, + "grad_norm": 0.0818408953675724, + "learning_rate": 1.508545109890452e-05, + "loss": 0.0013, + "step": 11198 + }, + { + "epoch": 4.554290361935746, + "grad_norm": 0.5889230276830063, + "learning_rate": 1.5084578681948351e-05, + "loss": 0.0087, + "step": 11199 + }, + { + "epoch": 4.554697031313542, + "grad_norm": 6.278134728206277, + "learning_rate": 1.5083706212796425e-05, + "loss": 0.1281, + "step": 11200 + }, + { + "epoch": 4.555103700691338, + "grad_norm": 0.4314858462410649, + "learning_rate": 1.5082833691457697e-05, + "loss": 0.008, + "step": 11201 + }, + { + "epoch": 4.5555103700691335, + "grad_norm": 3.250869764977355, + "learning_rate": 1.5081961117941122e-05, + "loss": 0.1683, + "step": 11202 + }, + { + "epoch": 4.555917039446929, + "grad_norm": 7.781178523888216, + "learning_rate": 1.5081088492255658e-05, + "loss": 0.3635, + "step": 11203 + }, + { + "epoch": 4.556323708824726, + "grad_norm": 1.461605762831673, + "learning_rate": 1.5080215814410262e-05, + "loss": 0.024, + "step": 11204 + }, + { + "epoch": 4.556730378202522, + "grad_norm": 2.9266699571568537, + "learning_rate": 1.5079343084413898e-05, + "loss": 0.0413, + "step": 11205 + }, + { + "epoch": 4.5571370475803175, + "grad_norm": 0.10322018155499232, + "learning_rate": 1.5078470302275516e-05, + "loss": 0.0022, + "step": 11206 + }, + { + "epoch": 4.557543716958113, + "grad_norm": 0.5360313781644056, + 
"learning_rate": 1.5077597468004084e-05, + "loss": 0.0064, + "step": 11207 + }, + { + "epoch": 4.557950386335909, + "grad_norm": 6.3875900033009945, + "learning_rate": 1.5076724581608554e-05, + "loss": 0.3323, + "step": 11208 + }, + { + "epoch": 4.558357055713705, + "grad_norm": 0.4397705321957063, + "learning_rate": 1.5075851643097897e-05, + "loss": 0.0068, + "step": 11209 + }, + { + "epoch": 4.5587637250915005, + "grad_norm": 10.670335399626085, + "learning_rate": 1.5074978652481065e-05, + "loss": 0.5957, + "step": 11210 + }, + { + "epoch": 4.559170394469296, + "grad_norm": 2.980259771132589, + "learning_rate": 1.5074105609767022e-05, + "loss": 0.0824, + "step": 11211 + }, + { + "epoch": 4.559577063847092, + "grad_norm": 6.025390676064855, + "learning_rate": 1.5073232514964733e-05, + "loss": 0.3436, + "step": 11212 + }, + { + "epoch": 4.559983733224888, + "grad_norm": 2.4123802547713344, + "learning_rate": 1.5072359368083156e-05, + "loss": 0.036, + "step": 11213 + }, + { + "epoch": 4.560390402602684, + "grad_norm": 4.959432037580968, + "learning_rate": 1.507148616913126e-05, + "loss": 0.1528, + "step": 11214 + }, + { + "epoch": 4.56079707198048, + "grad_norm": 8.808010720519958, + "learning_rate": 1.5070612918118008e-05, + "loss": 0.4113, + "step": 11215 + }, + { + "epoch": 4.561203741358276, + "grad_norm": 7.905547418788675, + "learning_rate": 1.5069739615052359e-05, + "loss": 0.4319, + "step": 11216 + }, + { + "epoch": 4.561610410736072, + "grad_norm": 5.42428526911056, + "learning_rate": 1.5068866259943283e-05, + "loss": 0.1034, + "step": 11217 + }, + { + "epoch": 4.5620170801138675, + "grad_norm": 7.069977186613568, + "learning_rate": 1.5067992852799743e-05, + "loss": 0.3091, + "step": 11218 + }, + { + "epoch": 4.562423749491663, + "grad_norm": 0.044562300289055404, + "learning_rate": 1.5067119393630702e-05, + "loss": 0.0008, + "step": 11219 + }, + { + "epoch": 4.562830418869459, + "grad_norm": 6.262539978322681, + "learning_rate": 1.5066245882445136e-05, + "loss": 0.3157, + "step": 11220 + }, + { + "epoch": 4.563237088247255, + "grad_norm": 10.284541635444166, + "learning_rate": 1.5065372319252005e-05, + "loss": 0.6375, + "step": 11221 + }, + { + "epoch": 4.563643757625051, + "grad_norm": 56.62672899566515, + "learning_rate": 1.5064498704060278e-05, + "loss": 1.1997, + "step": 11222 + }, + { + "epoch": 4.564050427002846, + "grad_norm": 3.4766077737860646, + "learning_rate": 1.506362503687892e-05, + "loss": 0.1407, + "step": 11223 + }, + { + "epoch": 4.564457096380643, + "grad_norm": 2.195303424604012, + "learning_rate": 1.5062751317716905e-05, + "loss": 0.0334, + "step": 11224 + }, + { + "epoch": 4.564863765758439, + "grad_norm": 1.4147584686163215, + "learning_rate": 1.50618775465832e-05, + "loss": 0.0187, + "step": 11225 + }, + { + "epoch": 4.5652704351362345, + "grad_norm": 7.946348074644325, + "learning_rate": 1.5061003723486772e-05, + "loss": 0.1533, + "step": 11226 + }, + { + "epoch": 4.56567710451403, + "grad_norm": 10.27178095436361, + "learning_rate": 1.5060129848436594e-05, + "loss": 0.3945, + "step": 11227 + }, + { + "epoch": 4.566083773891826, + "grad_norm": 1.0965946089392327, + "learning_rate": 1.505925592144164e-05, + "loss": 0.0244, + "step": 11228 + }, + { + "epoch": 4.566490443269622, + "grad_norm": 1.0943753961368314, + "learning_rate": 1.5058381942510871e-05, + "loss": 0.0256, + "step": 11229 + }, + { + "epoch": 4.566897112647418, + "grad_norm": 7.993325763248261, + "learning_rate": 1.505750791165327e-05, + "loss": 0.4232, + "step": 11230 + }, + { + "epoch": 
4.567303782025213, + "grad_norm": 3.8590485977988096, + "learning_rate": 1.5056633828877807e-05, + "loss": 0.0854, + "step": 11231 + }, + { + "epoch": 4.567710451403009, + "grad_norm": 8.483199871554618, + "learning_rate": 1.5055759694193448e-05, + "loss": 0.1926, + "step": 11232 + }, + { + "epoch": 4.568117120780805, + "grad_norm": 56.75438304183345, + "learning_rate": 1.5054885507609176e-05, + "loss": 0.1875, + "step": 11233 + }, + { + "epoch": 4.568523790158601, + "grad_norm": 8.051998779954605, + "learning_rate": 1.5054011269133959e-05, + "loss": 0.2341, + "step": 11234 + }, + { + "epoch": 4.568930459536396, + "grad_norm": 3.1488207175258998, + "learning_rate": 1.505313697877677e-05, + "loss": 0.0564, + "step": 11235 + }, + { + "epoch": 4.569337128914193, + "grad_norm": 0.49020419236624546, + "learning_rate": 1.5052262636546586e-05, + "loss": 0.0082, + "step": 11236 + }, + { + "epoch": 4.569743798291989, + "grad_norm": 0.635601348876422, + "learning_rate": 1.5051388242452387e-05, + "loss": 0.0096, + "step": 11237 + }, + { + "epoch": 4.570150467669785, + "grad_norm": 4.187557908153063, + "learning_rate": 1.5050513796503144e-05, + "loss": 0.2252, + "step": 11238 + }, + { + "epoch": 4.57055713704758, + "grad_norm": 6.141011270083909, + "learning_rate": 1.5049639298707835e-05, + "loss": 0.116, + "step": 11239 + }, + { + "epoch": 4.570963806425376, + "grad_norm": 4.166385222847687, + "learning_rate": 1.5048764749075437e-05, + "loss": 0.0721, + "step": 11240 + }, + { + "epoch": 4.571370475803172, + "grad_norm": 0.40280237787281736, + "learning_rate": 1.5047890147614927e-05, + "loss": 0.0062, + "step": 11241 + }, + { + "epoch": 4.571777145180968, + "grad_norm": 10.191960662430235, + "learning_rate": 1.5047015494335283e-05, + "loss": 0.5739, + "step": 11242 + }, + { + "epoch": 4.572183814558763, + "grad_norm": 7.018419384763502, + "learning_rate": 1.5046140789245489e-05, + "loss": 0.478, + "step": 11243 + }, + { + "epoch": 4.572590483936559, + "grad_norm": 7.164613372409998, + "learning_rate": 1.5045266032354517e-05, + "loss": 0.3145, + "step": 11244 + }, + { + "epoch": 4.572997153314356, + "grad_norm": 1.6445778280065773, + "learning_rate": 1.5044391223671354e-05, + "loss": 0.0228, + "step": 11245 + }, + { + "epoch": 4.573403822692152, + "grad_norm": 0.4591996032181082, + "learning_rate": 1.5043516363204971e-05, + "loss": 0.0088, + "step": 11246 + }, + { + "epoch": 4.573810492069947, + "grad_norm": 4.359074742904864, + "learning_rate": 1.5042641450964357e-05, + "loss": 0.0574, + "step": 11247 + }, + { + "epoch": 4.574217161447743, + "grad_norm": 4.156628014737447, + "learning_rate": 1.5041766486958492e-05, + "loss": 0.1683, + "step": 11248 + }, + { + "epoch": 4.574623830825539, + "grad_norm": 7.84755228761091, + "learning_rate": 1.5040891471196352e-05, + "loss": 0.2066, + "step": 11249 + }, + { + "epoch": 4.575030500203335, + "grad_norm": 5.850840352250551, + "learning_rate": 1.5040016403686926e-05, + "loss": 0.2264, + "step": 11250 + }, + { + "epoch": 4.5754371695811304, + "grad_norm": 1.3142716763804438, + "learning_rate": 1.5039141284439197e-05, + "loss": 0.0151, + "step": 11251 + }, + { + "epoch": 4.575843838958926, + "grad_norm": 2.152287635964462, + "learning_rate": 1.5038266113462144e-05, + "loss": 0.049, + "step": 11252 + }, + { + "epoch": 4.576250508336722, + "grad_norm": 7.4885471131820545, + "learning_rate": 1.5037390890764756e-05, + "loss": 0.2248, + "step": 11253 + }, + { + "epoch": 4.576657177714518, + "grad_norm": 1.6536484668044586, + "learning_rate": 1.5036515616356013e-05, + 
"loss": 0.027, + "step": 11254 + }, + { + "epoch": 4.5770638470923135, + "grad_norm": 1.064091855444971, + "learning_rate": 1.5035640290244904e-05, + "loss": 0.0153, + "step": 11255 + }, + { + "epoch": 4.57747051647011, + "grad_norm": 26.92281605572441, + "learning_rate": 1.5034764912440413e-05, + "loss": 2.0158, + "step": 11256 + }, + { + "epoch": 4.577877185847906, + "grad_norm": 9.06861109962408, + "learning_rate": 1.5033889482951527e-05, + "loss": 0.1121, + "step": 11257 + }, + { + "epoch": 4.578283855225702, + "grad_norm": 1.688398208622596, + "learning_rate": 1.503301400178723e-05, + "loss": 0.0264, + "step": 11258 + }, + { + "epoch": 4.5786905246034975, + "grad_norm": 0.18966227886205128, + "learning_rate": 1.5032138468956512e-05, + "loss": 0.0038, + "step": 11259 + }, + { + "epoch": 4.579097193981293, + "grad_norm": 0.7065691903115155, + "learning_rate": 1.5031262884468358e-05, + "loss": 0.0096, + "step": 11260 + }, + { + "epoch": 4.579503863359089, + "grad_norm": 2.811724409270666, + "learning_rate": 1.503038724833176e-05, + "loss": 0.0413, + "step": 11261 + }, + { + "epoch": 4.579910532736885, + "grad_norm": 2.6322857057192257, + "learning_rate": 1.5029511560555707e-05, + "loss": 0.0525, + "step": 11262 + }, + { + "epoch": 4.5803172021146805, + "grad_norm": 4.6559509940748045, + "learning_rate": 1.5028635821149185e-05, + "loss": 0.1678, + "step": 11263 + }, + { + "epoch": 4.580723871492476, + "grad_norm": 3.567255563402269, + "learning_rate": 1.5027760030121184e-05, + "loss": 0.0673, + "step": 11264 + }, + { + "epoch": 4.581130540870273, + "grad_norm": 9.52010411670394, + "learning_rate": 1.5026884187480697e-05, + "loss": 0.1944, + "step": 11265 + }, + { + "epoch": 4.581537210248069, + "grad_norm": 10.210075377411743, + "learning_rate": 1.5026008293236713e-05, + "loss": 0.4441, + "step": 11266 + }, + { + "epoch": 4.5819438796258645, + "grad_norm": 8.925943797507479, + "learning_rate": 1.5025132347398228e-05, + "loss": 0.1972, + "step": 11267 + }, + { + "epoch": 4.58235054900366, + "grad_norm": 6.08747899368361, + "learning_rate": 1.5024256349974226e-05, + "loss": 0.1284, + "step": 11268 + }, + { + "epoch": 4.582757218381456, + "grad_norm": 0.23103774512422762, + "learning_rate": 1.5023380300973706e-05, + "loss": 0.0027, + "step": 11269 + }, + { + "epoch": 4.583163887759252, + "grad_norm": 5.342657210337929, + "learning_rate": 1.502250420040566e-05, + "loss": 0.233, + "step": 11270 + }, + { + "epoch": 4.5835705571370475, + "grad_norm": 6.0884034851405495, + "learning_rate": 1.5021628048279077e-05, + "loss": 0.2569, + "step": 11271 + }, + { + "epoch": 4.583977226514843, + "grad_norm": 1.0632838629633052, + "learning_rate": 1.5020751844602957e-05, + "loss": 0.0157, + "step": 11272 + }, + { + "epoch": 4.584383895892639, + "grad_norm": 13.270906109874819, + "learning_rate": 1.5019875589386294e-05, + "loss": 0.5311, + "step": 11273 + }, + { + "epoch": 4.584790565270435, + "grad_norm": 2.888342249348674, + "learning_rate": 1.5018999282638078e-05, + "loss": 0.1294, + "step": 11274 + }, + { + "epoch": 4.585197234648231, + "grad_norm": 7.761280192911579, + "learning_rate": 1.501812292436731e-05, + "loss": 0.3339, + "step": 11275 + }, + { + "epoch": 4.585603904026026, + "grad_norm": 6.658953210989079, + "learning_rate": 1.5017246514582988e-05, + "loss": 0.1794, + "step": 11276 + }, + { + "epoch": 4.586010573403823, + "grad_norm": 0.4378723431679302, + "learning_rate": 1.50163700532941e-05, + "loss": 0.009, + "step": 11277 + }, + { + "epoch": 4.586417242781619, + "grad_norm": 
3.189580977001145, + "learning_rate": 1.5015493540509652e-05, + "loss": 0.0942, + "step": 11278 + }, + { + "epoch": 4.5868239121594145, + "grad_norm": 3.6733510451950706, + "learning_rate": 1.5014616976238635e-05, + "loss": 0.0704, + "step": 11279 + }, + { + "epoch": 4.58723058153721, + "grad_norm": 0.18333466003870474, + "learning_rate": 1.5013740360490057e-05, + "loss": 0.0031, + "step": 11280 + }, + { + "epoch": 4.587637250915006, + "grad_norm": 1.6393331834675104, + "learning_rate": 1.5012863693272905e-05, + "loss": 0.0252, + "step": 11281 + }, + { + "epoch": 4.588043920292802, + "grad_norm": 0.15290255557870658, + "learning_rate": 1.5011986974596187e-05, + "loss": 0.0032, + "step": 11282 + }, + { + "epoch": 4.588450589670598, + "grad_norm": 2.2312909962884095, + "learning_rate": 1.5011110204468902e-05, + "loss": 0.0347, + "step": 11283 + }, + { + "epoch": 4.588857259048393, + "grad_norm": 6.192815871941064, + "learning_rate": 1.5010233382900045e-05, + "loss": 0.1945, + "step": 11284 + }, + { + "epoch": 4.589263928426189, + "grad_norm": 10.720912242821601, + "learning_rate": 1.5009356509898624e-05, + "loss": 0.5139, + "step": 11285 + }, + { + "epoch": 4.589670597803986, + "grad_norm": 3.270164331098143, + "learning_rate": 1.5008479585473635e-05, + "loss": 0.0488, + "step": 11286 + }, + { + "epoch": 4.5900772671817816, + "grad_norm": 4.652312771496261, + "learning_rate": 1.500760260963408e-05, + "loss": 0.0882, + "step": 11287 + }, + { + "epoch": 4.590483936559577, + "grad_norm": 0.3021539382396504, + "learning_rate": 1.5006725582388968e-05, + "loss": 0.007, + "step": 11288 + }, + { + "epoch": 4.590890605937373, + "grad_norm": 0.36613392141991924, + "learning_rate": 1.5005848503747298e-05, + "loss": 0.0047, + "step": 11289 + }, + { + "epoch": 4.591297275315169, + "grad_norm": 0.36960780722753683, + "learning_rate": 1.5004971373718072e-05, + "loss": 0.0075, + "step": 11290 + }, + { + "epoch": 4.591703944692965, + "grad_norm": 1.499084691029442, + "learning_rate": 1.5004094192310297e-05, + "loss": 0.0237, + "step": 11291 + }, + { + "epoch": 4.59211061407076, + "grad_norm": 11.37222954420331, + "learning_rate": 1.5003216959532976e-05, + "loss": 0.6783, + "step": 11292 + }, + { + "epoch": 4.592517283448556, + "grad_norm": 1.2945832945812452, + "learning_rate": 1.5002339675395114e-05, + "loss": 0.0227, + "step": 11293 + }, + { + "epoch": 4.592923952826352, + "grad_norm": 6.886725309263206, + "learning_rate": 1.5001462339905719e-05, + "loss": 0.2595, + "step": 11294 + }, + { + "epoch": 4.593330622204148, + "grad_norm": 5.643113652333795, + "learning_rate": 1.5000584953073796e-05, + "loss": 0.0826, + "step": 11295 + }, + { + "epoch": 4.593737291581943, + "grad_norm": 1.1745436428711467, + "learning_rate": 1.4999707514908353e-05, + "loss": 0.022, + "step": 11296 + }, + { + "epoch": 4.59414396095974, + "grad_norm": 9.577944606302342, + "learning_rate": 1.4998830025418391e-05, + "loss": 0.2392, + "step": 11297 + }, + { + "epoch": 4.594550630337536, + "grad_norm": 1.330136699573151, + "learning_rate": 1.4997952484612928e-05, + "loss": 0.0315, + "step": 11298 + }, + { + "epoch": 4.594957299715332, + "grad_norm": 11.08741903473894, + "learning_rate": 1.4997074892500964e-05, + "loss": 0.9758, + "step": 11299 + }, + { + "epoch": 4.595363969093127, + "grad_norm": 6.352947462267381, + "learning_rate": 1.4996197249091513e-05, + "loss": 0.176, + "step": 11300 + }, + { + "epoch": 4.595770638470923, + "grad_norm": 8.857228277142598, + "learning_rate": 1.4995319554393581e-05, + "loss": 0.4472, + "step": 
11301 + }, + { + "epoch": 4.596177307848719, + "grad_norm": 4.19513796946055, + "learning_rate": 1.4994441808416182e-05, + "loss": 0.1525, + "step": 11302 + }, + { + "epoch": 4.596583977226515, + "grad_norm": 4.089437288366077, + "learning_rate": 1.4993564011168324e-05, + "loss": 0.0619, + "step": 11303 + }, + { + "epoch": 4.5969906466043104, + "grad_norm": 7.574008514891088, + "learning_rate": 1.499268616265902e-05, + "loss": 0.1956, + "step": 11304 + }, + { + "epoch": 4.597397315982106, + "grad_norm": 2.429919229029429, + "learning_rate": 1.4991808262897278e-05, + "loss": 0.0388, + "step": 11305 + }, + { + "epoch": 4.597803985359903, + "grad_norm": 15.051183922393461, + "learning_rate": 1.499093031189211e-05, + "loss": 0.5205, + "step": 11306 + }, + { + "epoch": 4.598210654737699, + "grad_norm": 6.307621760712929, + "learning_rate": 1.4990052309652533e-05, + "loss": 0.444, + "step": 11307 + }, + { + "epoch": 4.598617324115494, + "grad_norm": 3.654285182372215, + "learning_rate": 1.4989174256187555e-05, + "loss": 0.1141, + "step": 11308 + }, + { + "epoch": 4.59902399349329, + "grad_norm": 3.0089474904927513, + "learning_rate": 1.4988296151506192e-05, + "loss": 0.051, + "step": 11309 + }, + { + "epoch": 4.599430662871086, + "grad_norm": 9.547956041012378, + "learning_rate": 1.498741799561746e-05, + "loss": 0.2803, + "step": 11310 + }, + { + "epoch": 4.599837332248882, + "grad_norm": 9.991827043438438, + "learning_rate": 1.4986539788530375e-05, + "loss": 0.487, + "step": 11311 + }, + { + "epoch": 4.6002440016266775, + "grad_norm": 2.5640224774451674, + "learning_rate": 1.4985661530253946e-05, + "loss": 0.076, + "step": 11312 + }, + { + "epoch": 4.600650671004473, + "grad_norm": 2.7251989527227733, + "learning_rate": 1.498478322079719e-05, + "loss": 0.0494, + "step": 11313 + }, + { + "epoch": 4.601057340382269, + "grad_norm": 8.46067173418026, + "learning_rate": 1.4983904860169129e-05, + "loss": 0.1426, + "step": 11314 + }, + { + "epoch": 4.601464009760065, + "grad_norm": 8.163832877320267, + "learning_rate": 1.4983026448378778e-05, + "loss": 0.1243, + "step": 11315 + }, + { + "epoch": 4.6018706791378605, + "grad_norm": 9.897897550553319, + "learning_rate": 1.4982147985435146e-05, + "loss": 0.3239, + "step": 11316 + }, + { + "epoch": 4.602277348515656, + "grad_norm": 17.659828327961385, + "learning_rate": 1.4981269471347264e-05, + "loss": 1.8261, + "step": 11317 + }, + { + "epoch": 4.602684017893453, + "grad_norm": 13.201618591123516, + "learning_rate": 1.498039090612414e-05, + "loss": 0.4464, + "step": 11318 + }, + { + "epoch": 4.603090687271249, + "grad_norm": 0.377912282624772, + "learning_rate": 1.4979512289774798e-05, + "loss": 0.0052, + "step": 11319 + }, + { + "epoch": 4.6034973566490445, + "grad_norm": 8.506940950232257, + "learning_rate": 1.4978633622308257e-05, + "loss": 0.2661, + "step": 11320 + }, + { + "epoch": 4.60390402602684, + "grad_norm": 1.8494937447730588, + "learning_rate": 1.4977754903733536e-05, + "loss": 0.0272, + "step": 11321 + }, + { + "epoch": 4.604310695404636, + "grad_norm": 8.216915070939724, + "learning_rate": 1.4976876134059655e-05, + "loss": 0.2712, + "step": 11322 + }, + { + "epoch": 4.604717364782432, + "grad_norm": 1.0409055189422813, + "learning_rate": 1.4975997313295636e-05, + "loss": 0.0182, + "step": 11323 + }, + { + "epoch": 4.6051240341602275, + "grad_norm": 12.346987692628346, + "learning_rate": 1.49751184414505e-05, + "loss": 0.6322, + "step": 11324 + }, + { + "epoch": 4.605530703538023, + "grad_norm": 0.22572397526836918, + "learning_rate": 
1.4974239518533268e-05, + "loss": 0.0045, + "step": 11325 + }, + { + "epoch": 4.605937372915819, + "grad_norm": 7.149023890025653, + "learning_rate": 1.4973360544552966e-05, + "loss": 0.0965, + "step": 11326 + }, + { + "epoch": 4.606344042293616, + "grad_norm": 2.2811047477869084, + "learning_rate": 1.4972481519518614e-05, + "loss": 0.0376, + "step": 11327 + }, + { + "epoch": 4.6067507116714115, + "grad_norm": 9.422346998001881, + "learning_rate": 1.4971602443439237e-05, + "loss": 0.3517, + "step": 11328 + }, + { + "epoch": 4.607157381049207, + "grad_norm": 5.143195722823065, + "learning_rate": 1.4970723316323858e-05, + "loss": 0.1517, + "step": 11329 + }, + { + "epoch": 4.607564050427003, + "grad_norm": 6.459165103595463, + "learning_rate": 1.4969844138181505e-05, + "loss": 0.1949, + "step": 11330 + }, + { + "epoch": 4.607970719804799, + "grad_norm": 8.018184677951282, + "learning_rate": 1.4968964909021197e-05, + "loss": 0.2925, + "step": 11331 + }, + { + "epoch": 4.6083773891825945, + "grad_norm": 15.854620974775111, + "learning_rate": 1.4968085628851965e-05, + "loss": 1.0854, + "step": 11332 + }, + { + "epoch": 4.60878405856039, + "grad_norm": 13.369106056734344, + "learning_rate": 1.4967206297682832e-05, + "loss": 0.3846, + "step": 11333 + }, + { + "epoch": 4.609190727938186, + "grad_norm": 1.8619969331825577, + "learning_rate": 1.496632691552283e-05, + "loss": 0.0435, + "step": 11334 + }, + { + "epoch": 4.609597397315982, + "grad_norm": 1.9278685066914913, + "learning_rate": 1.4965447482380978e-05, + "loss": 0.03, + "step": 11335 + }, + { + "epoch": 4.610004066693778, + "grad_norm": 2.1993391574822208, + "learning_rate": 1.4964567998266313e-05, + "loss": 0.0332, + "step": 11336 + }, + { + "epoch": 4.610410736071573, + "grad_norm": 10.26696733408081, + "learning_rate": 1.4963688463187857e-05, + "loss": 0.3833, + "step": 11337 + }, + { + "epoch": 4.61081740544937, + "grad_norm": 11.300253848832435, + "learning_rate": 1.4962808877154642e-05, + "loss": 0.1915, + "step": 11338 + }, + { + "epoch": 4.611224074827166, + "grad_norm": 3.2491358726313604, + "learning_rate": 1.4961929240175695e-05, + "loss": 0.0861, + "step": 11339 + }, + { + "epoch": 4.6116307442049616, + "grad_norm": 2.4017144085861637, + "learning_rate": 1.4961049552260047e-05, + "loss": 0.0369, + "step": 11340 + }, + { + "epoch": 4.612037413582757, + "grad_norm": 15.293005333536367, + "learning_rate": 1.496016981341673e-05, + "loss": 1.1224, + "step": 11341 + }, + { + "epoch": 4.612444082960553, + "grad_norm": 7.595363396058923, + "learning_rate": 1.495929002365477e-05, + "loss": 0.4817, + "step": 11342 + }, + { + "epoch": 4.612850752338349, + "grad_norm": 6.194408934576699, + "learning_rate": 1.4958410182983202e-05, + "loss": 0.3123, + "step": 11343 + }, + { + "epoch": 4.613257421716145, + "grad_norm": 3.8169889089027578, + "learning_rate": 1.4957530291411063e-05, + "loss": 0.0645, + "step": 11344 + }, + { + "epoch": 4.61366409109394, + "grad_norm": 8.85054859354272, + "learning_rate": 1.4956650348947376e-05, + "loss": 0.2836, + "step": 11345 + }, + { + "epoch": 4.614070760471736, + "grad_norm": 7.762261245573846, + "learning_rate": 1.495577035560118e-05, + "loss": 0.2637, + "step": 11346 + }, + { + "epoch": 4.614477429849533, + "grad_norm": 10.020567182709454, + "learning_rate": 1.495489031138151e-05, + "loss": 0.0877, + "step": 11347 + }, + { + "epoch": 4.614884099227329, + "grad_norm": 15.454820163766968, + "learning_rate": 1.4954010216297392e-05, + "loss": 0.9808, + "step": 11348 + }, + { + "epoch": 4.615290768605124, + 
"grad_norm": 8.24317464952373, + "learning_rate": 1.4953130070357868e-05, + "loss": 0.2499, + "step": 11349 + }, + { + "epoch": 4.61569743798292, + "grad_norm": 6.256786852589446, + "learning_rate": 1.4952249873571971e-05, + "loss": 0.2757, + "step": 11350 + }, + { + "epoch": 4.616104107360716, + "grad_norm": 3.425740911690489, + "learning_rate": 1.4951369625948738e-05, + "loss": 0.1686, + "step": 11351 + }, + { + "epoch": 4.616510776738512, + "grad_norm": 11.228781441026962, + "learning_rate": 1.4950489327497202e-05, + "loss": 0.5098, + "step": 11352 + }, + { + "epoch": 4.616917446116307, + "grad_norm": 3.0903659196527395, + "learning_rate": 1.4949608978226403e-05, + "loss": 0.0488, + "step": 11353 + }, + { + "epoch": 4.617324115494103, + "grad_norm": 6.079478912620204, + "learning_rate": 1.4948728578145375e-05, + "loss": 0.2056, + "step": 11354 + }, + { + "epoch": 4.617730784871899, + "grad_norm": 15.368057990424704, + "learning_rate": 1.4947848127263159e-05, + "loss": 0.4691, + "step": 11355 + }, + { + "epoch": 4.618137454249695, + "grad_norm": 3.2266688919753514, + "learning_rate": 1.4946967625588791e-05, + "loss": 0.092, + "step": 11356 + }, + { + "epoch": 4.6185441236274904, + "grad_norm": 3.2512788119289775, + "learning_rate": 1.4946087073131312e-05, + "loss": 0.0779, + "step": 11357 + }, + { + "epoch": 4.618950793005286, + "grad_norm": 0.8967539655627592, + "learning_rate": 1.4945206469899758e-05, + "loss": 0.0127, + "step": 11358 + }, + { + "epoch": 4.619357462383083, + "grad_norm": 3.135971476607683, + "learning_rate": 1.494432581590317e-05, + "loss": 0.1058, + "step": 11359 + }, + { + "epoch": 4.619764131760879, + "grad_norm": 9.452083230471091, + "learning_rate": 1.4943445111150592e-05, + "loss": 0.3533, + "step": 11360 + }, + { + "epoch": 4.620170801138674, + "grad_norm": 8.341544371528796, + "learning_rate": 1.4942564355651059e-05, + "loss": 0.4192, + "step": 11361 + }, + { + "epoch": 4.62057747051647, + "grad_norm": 7.93540311016988, + "learning_rate": 1.4941683549413616e-05, + "loss": 0.1152, + "step": 11362 + }, + { + "epoch": 4.620984139894266, + "grad_norm": 8.256409161782072, + "learning_rate": 1.4940802692447306e-05, + "loss": 0.1577, + "step": 11363 + }, + { + "epoch": 4.621390809272062, + "grad_norm": 4.562505257453242, + "learning_rate": 1.4939921784761169e-05, + "loss": 0.0713, + "step": 11364 + }, + { + "epoch": 4.6217974786498575, + "grad_norm": 5.429398079746833, + "learning_rate": 1.4939040826364249e-05, + "loss": 0.0576, + "step": 11365 + }, + { + "epoch": 4.622204148027653, + "grad_norm": 8.789027483764407, + "learning_rate": 1.4938159817265587e-05, + "loss": 0.1782, + "step": 11366 + }, + { + "epoch": 4.622610817405449, + "grad_norm": 7.300483980515881, + "learning_rate": 1.493727875747423e-05, + "loss": 0.3185, + "step": 11367 + }, + { + "epoch": 4.623017486783246, + "grad_norm": 4.981169659109583, + "learning_rate": 1.4936397646999224e-05, + "loss": 0.1086, + "step": 11368 + }, + { + "epoch": 4.623424156161041, + "grad_norm": 7.720080522272335, + "learning_rate": 1.4935516485849611e-05, + "loss": 0.2719, + "step": 11369 + }, + { + "epoch": 4.623830825538837, + "grad_norm": 3.2922890497151016, + "learning_rate": 1.4934635274034435e-05, + "loss": 0.0856, + "step": 11370 + }, + { + "epoch": 4.624237494916633, + "grad_norm": 3.422724595041383, + "learning_rate": 1.4933754011562745e-05, + "loss": 0.0652, + "step": 11371 + }, + { + "epoch": 4.624644164294429, + "grad_norm": 5.847054351427494, + "learning_rate": 1.4932872698443587e-05, + "loss": 0.1046, + "step": 
11372 + }, + { + "epoch": 4.6250508336722245, + "grad_norm": 9.29047876497603, + "learning_rate": 1.4931991334686008e-05, + "loss": 0.2589, + "step": 11373 + }, + { + "epoch": 4.62545750305002, + "grad_norm": 7.650143704224406, + "learning_rate": 1.4931109920299056e-05, + "loss": 0.2033, + "step": 11374 + }, + { + "epoch": 4.625864172427816, + "grad_norm": 5.5338488105112225, + "learning_rate": 1.4930228455291783e-05, + "loss": 0.1692, + "step": 11375 + }, + { + "epoch": 4.626270841805612, + "grad_norm": 7.637215655531537, + "learning_rate": 1.4929346939673229e-05, + "loss": 0.3552, + "step": 11376 + }, + { + "epoch": 4.6266775111834075, + "grad_norm": 5.832730005555227, + "learning_rate": 1.4928465373452448e-05, + "loss": 0.2477, + "step": 11377 + }, + { + "epoch": 4.627084180561203, + "grad_norm": 0.08833489322806085, + "learning_rate": 1.4927583756638488e-05, + "loss": 0.0014, + "step": 11378 + }, + { + "epoch": 4.627490849939, + "grad_norm": 0.11271714933517102, + "learning_rate": 1.4926702089240406e-05, + "loss": 0.0021, + "step": 11379 + }, + { + "epoch": 4.627897519316796, + "grad_norm": 5.717223804848248, + "learning_rate": 1.4925820371267241e-05, + "loss": 0.1284, + "step": 11380 + }, + { + "epoch": 4.6283041886945915, + "grad_norm": 7.075495454754745, + "learning_rate": 1.4924938602728056e-05, + "loss": 0.1488, + "step": 11381 + }, + { + "epoch": 4.628710858072387, + "grad_norm": 12.370802160264928, + "learning_rate": 1.4924056783631895e-05, + "loss": 0.8313, + "step": 11382 + }, + { + "epoch": 4.629117527450183, + "grad_norm": 4.87001096323918, + "learning_rate": 1.4923174913987817e-05, + "loss": 0.1455, + "step": 11383 + }, + { + "epoch": 4.629524196827979, + "grad_norm": 8.139373441008534, + "learning_rate": 1.4922292993804864e-05, + "loss": 0.1845, + "step": 11384 + }, + { + "epoch": 4.6299308662057745, + "grad_norm": 2.5397442906422767, + "learning_rate": 1.4921411023092101e-05, + "loss": 0.056, + "step": 11385 + }, + { + "epoch": 4.63033753558357, + "grad_norm": 6.635728051631599, + "learning_rate": 1.4920529001858575e-05, + "loss": 0.2919, + "step": 11386 + }, + { + "epoch": 4.630744204961366, + "grad_norm": 6.132919987606732, + "learning_rate": 1.491964693011334e-05, + "loss": 0.3027, + "step": 11387 + }, + { + "epoch": 4.631150874339163, + "grad_norm": 5.701319226355001, + "learning_rate": 1.4918764807865458e-05, + "loss": 0.216, + "step": 11388 + }, + { + "epoch": 4.6315575437169585, + "grad_norm": 9.553697401368801, + "learning_rate": 1.491788263512398e-05, + "loss": 0.3634, + "step": 11389 + }, + { + "epoch": 4.631964213094754, + "grad_norm": 4.46335873544751, + "learning_rate": 1.4917000411897958e-05, + "loss": 0.1147, + "step": 11390 + }, + { + "epoch": 4.63237088247255, + "grad_norm": 4.798266297465832, + "learning_rate": 1.4916118138196453e-05, + "loss": 0.1451, + "step": 11391 + }, + { + "epoch": 4.632777551850346, + "grad_norm": 0.6199123655609102, + "learning_rate": 1.4915235814028522e-05, + "loss": 0.0098, + "step": 11392 + }, + { + "epoch": 4.6331842212281416, + "grad_norm": 2.377252415509679, + "learning_rate": 1.4914353439403219e-05, + "loss": 0.0268, + "step": 11393 + }, + { + "epoch": 4.633590890605937, + "grad_norm": 5.277744922605408, + "learning_rate": 1.4913471014329608e-05, + "loss": 0.1332, + "step": 11394 + }, + { + "epoch": 4.633997559983733, + "grad_norm": 9.025211909905714, + "learning_rate": 1.4912588538816744e-05, + "loss": 0.19, + "step": 11395 + }, + { + "epoch": 4.634404229361529, + "grad_norm": 7.4565225340749475, + "learning_rate": 
1.4911706012873684e-05, + "loss": 0.4834, + "step": 11396 + }, + { + "epoch": 4.634810898739325, + "grad_norm": 2.6982200976758266, + "learning_rate": 1.4910823436509489e-05, + "loss": 0.1242, + "step": 11397 + }, + { + "epoch": 4.63521756811712, + "grad_norm": 2.7033759523055236, + "learning_rate": 1.4909940809733223e-05, + "loss": 0.0462, + "step": 11398 + }, + { + "epoch": 4.635624237494916, + "grad_norm": 0.01825812403844857, + "learning_rate": 1.4909058132553944e-05, + "loss": 0.0003, + "step": 11399 + }, + { + "epoch": 4.636030906872713, + "grad_norm": 1.5777319244139638, + "learning_rate": 1.4908175404980707e-05, + "loss": 0.0264, + "step": 11400 + }, + { + "epoch": 4.636437576250509, + "grad_norm": 3.7352170247855336, + "learning_rate": 1.4907292627022585e-05, + "loss": 0.1035, + "step": 11401 + }, + { + "epoch": 4.636844245628304, + "grad_norm": 16.07317264195807, + "learning_rate": 1.4906409798688633e-05, + "loss": 1.4667, + "step": 11402 + }, + { + "epoch": 4.6372509150061, + "grad_norm": 8.046725505417026, + "learning_rate": 1.4905526919987915e-05, + "loss": 0.4574, + "step": 11403 + }, + { + "epoch": 4.637657584383896, + "grad_norm": 10.930813414903701, + "learning_rate": 1.4904643990929493e-05, + "loss": 0.4119, + "step": 11404 + }, + { + "epoch": 4.638064253761692, + "grad_norm": 9.987934720439847, + "learning_rate": 1.4903761011522434e-05, + "loss": 0.2522, + "step": 11405 + }, + { + "epoch": 4.638470923139487, + "grad_norm": 1.1297702547009845, + "learning_rate": 1.4902877981775795e-05, + "loss": 0.0138, + "step": 11406 + }, + { + "epoch": 4.638877592517283, + "grad_norm": 0.056762508021218405, + "learning_rate": 1.4901994901698654e-05, + "loss": 0.0009, + "step": 11407 + }, + { + "epoch": 4.639284261895079, + "grad_norm": 12.930390965439555, + "learning_rate": 1.4901111771300062e-05, + "loss": 0.7068, + "step": 11408 + }, + { + "epoch": 4.639690931272876, + "grad_norm": 0.721463488719059, + "learning_rate": 1.4900228590589093e-05, + "loss": 0.0107, + "step": 11409 + }, + { + "epoch": 4.640097600650671, + "grad_norm": 1.5857678910521624, + "learning_rate": 1.4899345359574813e-05, + "loss": 0.0119, + "step": 11410 + }, + { + "epoch": 4.640504270028467, + "grad_norm": 4.384307294862206, + "learning_rate": 1.4898462078266286e-05, + "loss": 0.0844, + "step": 11411 + }, + { + "epoch": 4.640910939406263, + "grad_norm": 5.846671243884194, + "learning_rate": 1.4897578746672578e-05, + "loss": 0.1357, + "step": 11412 + }, + { + "epoch": 4.641317608784059, + "grad_norm": 5.584128117662612, + "learning_rate": 1.489669536480276e-05, + "loss": 0.2419, + "step": 11413 + }, + { + "epoch": 4.641724278161854, + "grad_norm": 8.88839459054857, + "learning_rate": 1.4895811932665902e-05, + "loss": 0.2628, + "step": 11414 + }, + { + "epoch": 4.64213094753965, + "grad_norm": 0.7391727938685512, + "learning_rate": 1.4894928450271068e-05, + "loss": 0.0114, + "step": 11415 + }, + { + "epoch": 4.642537616917446, + "grad_norm": 9.236463813113776, + "learning_rate": 1.4894044917627333e-05, + "loss": 0.3514, + "step": 11416 + }, + { + "epoch": 4.642944286295242, + "grad_norm": 8.611350494030393, + "learning_rate": 1.4893161334743762e-05, + "loss": 0.3162, + "step": 11417 + }, + { + "epoch": 4.6433509556730375, + "grad_norm": 8.890592446402742, + "learning_rate": 1.4892277701629427e-05, + "loss": 0.3351, + "step": 11418 + }, + { + "epoch": 4.643757625050833, + "grad_norm": 6.689221902387409, + "learning_rate": 1.4891394018293397e-05, + "loss": 0.1456, + "step": 11419 + }, + { + "epoch": 4.64416429442863, 
+ "grad_norm": 3.0299500040153142, + "learning_rate": 1.4890510284744748e-05, + "loss": 0.0369, + "step": 11420 + }, + { + "epoch": 4.644570963806426, + "grad_norm": 3.977553742805699, + "learning_rate": 1.4889626500992551e-05, + "loss": 0.0377, + "step": 11421 + }, + { + "epoch": 4.644977633184221, + "grad_norm": 0.1322525189547313, + "learning_rate": 1.4888742667045875e-05, + "loss": 0.0019, + "step": 11422 + }, + { + "epoch": 4.645384302562017, + "grad_norm": 8.421122974984186, + "learning_rate": 1.4887858782913795e-05, + "loss": 0.1932, + "step": 11423 + }, + { + "epoch": 4.645790971939813, + "grad_norm": 2.324522656788078, + "learning_rate": 1.4886974848605386e-05, + "loss": 0.1154, + "step": 11424 + }, + { + "epoch": 4.646197641317609, + "grad_norm": 10.853545628072522, + "learning_rate": 1.4886090864129718e-05, + "loss": 0.2348, + "step": 11425 + }, + { + "epoch": 4.6466043106954045, + "grad_norm": 10.135305058363599, + "learning_rate": 1.4885206829495871e-05, + "loss": 0.2963, + "step": 11426 + }, + { + "epoch": 4.6470109800732, + "grad_norm": 7.0275164709393065, + "learning_rate": 1.4884322744712915e-05, + "loss": 0.2173, + "step": 11427 + }, + { + "epoch": 4.647417649450996, + "grad_norm": 8.363652329094444, + "learning_rate": 1.4883438609789931e-05, + "loss": 0.1582, + "step": 11428 + }, + { + "epoch": 4.647824318828793, + "grad_norm": 10.579420405801477, + "learning_rate": 1.4882554424735989e-05, + "loss": 0.4839, + "step": 11429 + }, + { + "epoch": 4.648230988206588, + "grad_norm": 0.4371152266032423, + "learning_rate": 1.4881670189560171e-05, + "loss": 0.0076, + "step": 11430 + }, + { + "epoch": 4.648637657584384, + "grad_norm": 0.047574805607759535, + "learning_rate": 1.4880785904271552e-05, + "loss": 0.0011, + "step": 11431 + }, + { + "epoch": 4.64904432696218, + "grad_norm": 11.317258803711667, + "learning_rate": 1.4879901568879205e-05, + "loss": 0.5349, + "step": 11432 + }, + { + "epoch": 4.649450996339976, + "grad_norm": 0.19947661468772196, + "learning_rate": 1.4879017183392216e-05, + "loss": 0.0052, + "step": 11433 + }, + { + "epoch": 4.6498576657177715, + "grad_norm": 14.933273639286458, + "learning_rate": 1.4878132747819658e-05, + "loss": 0.3309, + "step": 11434 + }, + { + "epoch": 4.650264335095567, + "grad_norm": 0.12334397866603776, + "learning_rate": 1.4877248262170612e-05, + "loss": 0.0022, + "step": 11435 + }, + { + "epoch": 4.650671004473363, + "grad_norm": 7.12294935034683, + "learning_rate": 1.4876363726454157e-05, + "loss": 0.2491, + "step": 11436 + }, + { + "epoch": 4.651077673851159, + "grad_norm": 8.942198323409572, + "learning_rate": 1.4875479140679377e-05, + "loss": 0.4502, + "step": 11437 + }, + { + "epoch": 4.6514843432289545, + "grad_norm": 13.968822005625569, + "learning_rate": 1.4874594504855347e-05, + "loss": 0.4807, + "step": 11438 + }, + { + "epoch": 4.65189101260675, + "grad_norm": 0.15348209908803204, + "learning_rate": 1.4873709818991153e-05, + "loss": 0.0021, + "step": 11439 + }, + { + "epoch": 4.652297681984546, + "grad_norm": 5.229809679394289, + "learning_rate": 1.4872825083095876e-05, + "loss": 0.1008, + "step": 11440 + }, + { + "epoch": 4.652704351362343, + "grad_norm": 2.8203863878433393, + "learning_rate": 1.4871940297178596e-05, + "loss": 0.0441, + "step": 11441 + }, + { + "epoch": 4.6531110207401385, + "grad_norm": 8.456342963744406, + "learning_rate": 1.4871055461248396e-05, + "loss": 0.2991, + "step": 11442 + }, + { + "epoch": 4.653517690117934, + "grad_norm": 5.66202536443644, + "learning_rate": 1.487017057531436e-05, + "loss": 
0.1026, + "step": 11443 + }, + { + "epoch": 4.65392435949573, + "grad_norm": 1.131655189631833, + "learning_rate": 1.4869285639385576e-05, + "loss": 0.0195, + "step": 11444 + }, + { + "epoch": 4.654331028873526, + "grad_norm": 8.37101024663999, + "learning_rate": 1.4868400653471118e-05, + "loss": 0.3053, + "step": 11445 + }, + { + "epoch": 4.6547376982513216, + "grad_norm": 21.513713230572957, + "learning_rate": 1.4867515617580081e-05, + "loss": 0.3355, + "step": 11446 + }, + { + "epoch": 4.655144367629117, + "grad_norm": 4.310692485958614, + "learning_rate": 1.4866630531721546e-05, + "loss": 0.0933, + "step": 11447 + }, + { + "epoch": 4.655551037006913, + "grad_norm": 10.15799699063326, + "learning_rate": 1.4865745395904596e-05, + "loss": 0.3907, + "step": 11448 + }, + { + "epoch": 4.655957706384709, + "grad_norm": 1.2675272565573625, + "learning_rate": 1.4864860210138325e-05, + "loss": 0.0263, + "step": 11449 + }, + { + "epoch": 4.6563643757625055, + "grad_norm": 9.501725245367048, + "learning_rate": 1.4863974974431814e-05, + "loss": 0.4031, + "step": 11450 + }, + { + "epoch": 4.656771045140301, + "grad_norm": 5.33305491552558, + "learning_rate": 1.486308968879415e-05, + "loss": 0.1089, + "step": 11451 + }, + { + "epoch": 4.657177714518097, + "grad_norm": 2.64047937995294, + "learning_rate": 1.4862204353234425e-05, + "loss": 0.0467, + "step": 11452 + }, + { + "epoch": 4.657584383895893, + "grad_norm": 12.90656357608245, + "learning_rate": 1.4861318967761726e-05, + "loss": 0.3918, + "step": 11453 + }, + { + "epoch": 4.657991053273689, + "grad_norm": 8.016869376006719, + "learning_rate": 1.4860433532385142e-05, + "loss": 0.2988, + "step": 11454 + }, + { + "epoch": 4.658397722651484, + "grad_norm": 0.64435357305349, + "learning_rate": 1.4859548047113758e-05, + "loss": 0.0249, + "step": 11455 + }, + { + "epoch": 4.65880439202928, + "grad_norm": 14.106202910107633, + "learning_rate": 1.485866251195667e-05, + "loss": 0.3776, + "step": 11456 + }, + { + "epoch": 4.659211061407076, + "grad_norm": 3.0361471150408152, + "learning_rate": 1.4857776926922965e-05, + "loss": 0.0783, + "step": 11457 + }, + { + "epoch": 4.659617730784872, + "grad_norm": 0.4753919736574036, + "learning_rate": 1.4856891292021734e-05, + "loss": 0.0085, + "step": 11458 + }, + { + "epoch": 4.660024400162667, + "grad_norm": 8.112839925824103, + "learning_rate": 1.4856005607262072e-05, + "loss": 0.3227, + "step": 11459 + }, + { + "epoch": 4.660431069540463, + "grad_norm": 5.459566982618859, + "learning_rate": 1.4855119872653066e-05, + "loss": 0.1459, + "step": 11460 + }, + { + "epoch": 4.66083773891826, + "grad_norm": 7.488182695294442, + "learning_rate": 1.4854234088203811e-05, + "loss": 0.3562, + "step": 11461 + }, + { + "epoch": 4.661244408296056, + "grad_norm": 0.5678654130975213, + "learning_rate": 1.4853348253923403e-05, + "loss": 0.0089, + "step": 11462 + }, + { + "epoch": 4.661651077673851, + "grad_norm": 1.8857279322745708, + "learning_rate": 1.485246236982093e-05, + "loss": 0.0296, + "step": 11463 + }, + { + "epoch": 4.662057747051647, + "grad_norm": 3.059851713538831, + "learning_rate": 1.4851576435905489e-05, + "loss": 0.0553, + "step": 11464 + }, + { + "epoch": 4.662464416429443, + "grad_norm": 0.26656450526022574, + "learning_rate": 1.4850690452186173e-05, + "loss": 0.0042, + "step": 11465 + }, + { + "epoch": 4.662871085807239, + "grad_norm": 11.080308402089457, + "learning_rate": 1.484980441867208e-05, + "loss": 0.7603, + "step": 11466 + }, + { + "epoch": 4.663277755185034, + "grad_norm": 8.108164063925878, + 
"learning_rate": 1.4848918335372303e-05, + "loss": 0.4654, + "step": 11467 + }, + { + "epoch": 4.66368442456283, + "grad_norm": 5.9325670520210325, + "learning_rate": 1.4848032202295939e-05, + "loss": 0.1791, + "step": 11468 + }, + { + "epoch": 4.664091093940626, + "grad_norm": 0.3581459811153693, + "learning_rate": 1.4847146019452086e-05, + "loss": 0.0078, + "step": 11469 + }, + { + "epoch": 4.664497763318423, + "grad_norm": 9.94823014940234, + "learning_rate": 1.4846259786849837e-05, + "loss": 0.4189, + "step": 11470 + }, + { + "epoch": 4.664904432696218, + "grad_norm": 5.712651929229576, + "learning_rate": 1.4845373504498293e-05, + "loss": 0.24, + "step": 11471 + }, + { + "epoch": 4.665311102074014, + "grad_norm": 7.916911812945187, + "learning_rate": 1.4844487172406552e-05, + "loss": 0.1878, + "step": 11472 + }, + { + "epoch": 4.66571777145181, + "grad_norm": 5.29044355093732, + "learning_rate": 1.4843600790583712e-05, + "loss": 0.1246, + "step": 11473 + }, + { + "epoch": 4.666124440829606, + "grad_norm": 5.256914365244692, + "learning_rate": 1.484271435903887e-05, + "loss": 0.3159, + "step": 11474 + }, + { + "epoch": 4.666531110207401, + "grad_norm": 10.684727285389771, + "learning_rate": 1.4841827877781132e-05, + "loss": 0.43, + "step": 11475 + }, + { + "epoch": 4.666937779585197, + "grad_norm": 7.637833567584091, + "learning_rate": 1.4840941346819591e-05, + "loss": 0.311, + "step": 11476 + }, + { + "epoch": 4.667344448962993, + "grad_norm": 1.8467909608500497, + "learning_rate": 1.484005476616335e-05, + "loss": 0.0399, + "step": 11477 + }, + { + "epoch": 4.667751118340789, + "grad_norm": 14.574648200266562, + "learning_rate": 1.4839168135821514e-05, + "loss": 0.7639, + "step": 11478 + }, + { + "epoch": 4.6681577877185845, + "grad_norm": 2.0074841759833304, + "learning_rate": 1.4838281455803179e-05, + "loss": 0.0306, + "step": 11479 + }, + { + "epoch": 4.66856445709638, + "grad_norm": 1.3073338345894554, + "learning_rate": 1.4837394726117449e-05, + "loss": 0.0179, + "step": 11480 + }, + { + "epoch": 4.668971126474176, + "grad_norm": 5.429292569874748, + "learning_rate": 1.483650794677343e-05, + "loss": 0.1412, + "step": 11481 + }, + { + "epoch": 4.669377795851973, + "grad_norm": 3.296281062769282, + "learning_rate": 1.483562111778022e-05, + "loss": 0.0479, + "step": 11482 + }, + { + "epoch": 4.669784465229768, + "grad_norm": 14.286553239982306, + "learning_rate": 1.4834734239146929e-05, + "loss": 0.7631, + "step": 11483 + }, + { + "epoch": 4.670191134607564, + "grad_norm": 1.487565894279573, + "learning_rate": 1.4833847310882656e-05, + "loss": 0.0229, + "step": 11484 + }, + { + "epoch": 4.67059780398536, + "grad_norm": 2.1520481071067907, + "learning_rate": 1.483296033299651e-05, + "loss": 0.0435, + "step": 11485 + }, + { + "epoch": 4.671004473363156, + "grad_norm": 8.52104209673043, + "learning_rate": 1.483207330549759e-05, + "loss": 0.2285, + "step": 11486 + }, + { + "epoch": 4.6714111427409515, + "grad_norm": 8.247218934612071, + "learning_rate": 1.4831186228395007e-05, + "loss": 0.251, + "step": 11487 + }, + { + "epoch": 4.671817812118747, + "grad_norm": 4.343575378371625, + "learning_rate": 1.4830299101697866e-05, + "loss": 0.227, + "step": 11488 + }, + { + "epoch": 4.672224481496543, + "grad_norm": 0.9700292936063966, + "learning_rate": 1.4829411925415273e-05, + "loss": 0.0141, + "step": 11489 + }, + { + "epoch": 4.672631150874339, + "grad_norm": 18.26449464721907, + "learning_rate": 1.4828524699556337e-05, + "loss": 0.3612, + "step": 11490 + }, + { + "epoch": 
4.673037820252135, + "grad_norm": 8.611822365455232, + "learning_rate": 1.4827637424130164e-05, + "loss": 0.5541, + "step": 11491 + }, + { + "epoch": 4.673444489629931, + "grad_norm": 1.1100359271112494, + "learning_rate": 1.4826750099145865e-05, + "loss": 0.0165, + "step": 11492 + }, + { + "epoch": 4.673851159007727, + "grad_norm": 11.266756055977382, + "learning_rate": 1.4825862724612543e-05, + "loss": 0.2461, + "step": 11493 + }, + { + "epoch": 4.674257828385523, + "grad_norm": 1.3852792525098803, + "learning_rate": 1.4824975300539315e-05, + "loss": 0.022, + "step": 11494 + }, + { + "epoch": 4.6746644977633185, + "grad_norm": 11.040174663947623, + "learning_rate": 1.4824087826935287e-05, + "loss": 0.1431, + "step": 11495 + }, + { + "epoch": 4.675071167141114, + "grad_norm": 0.038797354820952264, + "learning_rate": 1.4823200303809567e-05, + "loss": 0.0006, + "step": 11496 + }, + { + "epoch": 4.67547783651891, + "grad_norm": 7.240919567414452, + "learning_rate": 1.4822312731171271e-05, + "loss": 0.234, + "step": 11497 + }, + { + "epoch": 4.675884505896706, + "grad_norm": 4.274082002270997, + "learning_rate": 1.4821425109029509e-05, + "loss": 0.0752, + "step": 11498 + }, + { + "epoch": 4.6762911752745016, + "grad_norm": 7.628675298007639, + "learning_rate": 1.4820537437393387e-05, + "loss": 0.1696, + "step": 11499 + }, + { + "epoch": 4.676697844652297, + "grad_norm": 10.9269453825787, + "learning_rate": 1.4819649716272025e-05, + "loss": 0.4411, + "step": 11500 + }, + { + "epoch": 4.677104514030093, + "grad_norm": 8.116981637173563, + "learning_rate": 1.4818761945674531e-05, + "loss": 0.2117, + "step": 11501 + }, + { + "epoch": 4.67751118340789, + "grad_norm": 2.648812162362164, + "learning_rate": 1.4817874125610024e-05, + "loss": 0.0581, + "step": 11502 + }, + { + "epoch": 4.6779178527856855, + "grad_norm": 17.761401727119466, + "learning_rate": 1.481698625608761e-05, + "loss": 0.5183, + "step": 11503 + }, + { + "epoch": 4.678324522163481, + "grad_norm": 2.655737099966943, + "learning_rate": 1.4816098337116414e-05, + "loss": 0.0393, + "step": 11504 + }, + { + "epoch": 4.678731191541277, + "grad_norm": 8.106175857516828, + "learning_rate": 1.481521036870554e-05, + "loss": 0.477, + "step": 11505 + }, + { + "epoch": 4.679137860919073, + "grad_norm": 4.581993338824615, + "learning_rate": 1.4814322350864108e-05, + "loss": 0.0867, + "step": 11506 + }, + { + "epoch": 4.679544530296869, + "grad_norm": 13.67859943721915, + "learning_rate": 1.4813434283601236e-05, + "loss": 0.5888, + "step": 11507 + }, + { + "epoch": 4.679951199674664, + "grad_norm": 6.044880020415764, + "learning_rate": 1.4812546166926037e-05, + "loss": 0.1182, + "step": 11508 + }, + { + "epoch": 4.68035786905246, + "grad_norm": 6.61167169626243, + "learning_rate": 1.4811658000847629e-05, + "loss": 0.4558, + "step": 11509 + }, + { + "epoch": 4.680764538430256, + "grad_norm": 10.223137875751691, + "learning_rate": 1.4810769785375133e-05, + "loss": 0.4142, + "step": 11510 + }, + { + "epoch": 4.6811712078080525, + "grad_norm": 0.2725865005625818, + "learning_rate": 1.480988152051766e-05, + "loss": 0.0037, + "step": 11511 + }, + { + "epoch": 4.681577877185848, + "grad_norm": 6.22184364028803, + "learning_rate": 1.4808993206284333e-05, + "loss": 0.2118, + "step": 11512 + }, + { + "epoch": 4.681984546563644, + "grad_norm": 5.373595224050827, + "learning_rate": 1.4808104842684272e-05, + "loss": 0.0956, + "step": 11513 + }, + { + "epoch": 4.68239121594144, + "grad_norm": 10.373295802916786, + "learning_rate": 1.4807216429726596e-05, + 
"loss": 0.5084, + "step": 11514 + }, + { + "epoch": 4.682797885319236, + "grad_norm": 0.8185102233647426, + "learning_rate": 1.4806327967420422e-05, + "loss": 0.0142, + "step": 11515 + }, + { + "epoch": 4.683204554697031, + "grad_norm": 0.9699589129155523, + "learning_rate": 1.4805439455774872e-05, + "loss": 0.0118, + "step": 11516 + }, + { + "epoch": 4.683611224074827, + "grad_norm": 0.24422023600226717, + "learning_rate": 1.480455089479907e-05, + "loss": 0.0042, + "step": 11517 + }, + { + "epoch": 4.684017893452623, + "grad_norm": 8.112789638781488, + "learning_rate": 1.4803662284502132e-05, + "loss": 0.2248, + "step": 11518 + }, + { + "epoch": 4.684424562830419, + "grad_norm": 4.6940764579060055, + "learning_rate": 1.4802773624893186e-05, + "loss": 0.1204, + "step": 11519 + }, + { + "epoch": 4.684831232208214, + "grad_norm": 0.9349157407501896, + "learning_rate": 1.4801884915981349e-05, + "loss": 0.0177, + "step": 11520 + }, + { + "epoch": 4.68523790158601, + "grad_norm": 4.421330766939949, + "learning_rate": 1.4800996157775748e-05, + "loss": 0.0979, + "step": 11521 + }, + { + "epoch": 4.685644570963806, + "grad_norm": 0.4316019626777608, + "learning_rate": 1.4800107350285505e-05, + "loss": 0.0046, + "step": 11522 + }, + { + "epoch": 4.686051240341603, + "grad_norm": 13.288771817337713, + "learning_rate": 1.4799218493519742e-05, + "loss": 0.5827, + "step": 11523 + }, + { + "epoch": 4.686457909719398, + "grad_norm": 8.410361352147971, + "learning_rate": 1.479832958748759e-05, + "loss": 0.428, + "step": 11524 + }, + { + "epoch": 4.686864579097194, + "grad_norm": 3.401917348057715, + "learning_rate": 1.4797440632198166e-05, + "loss": 0.0431, + "step": 11525 + }, + { + "epoch": 4.68727124847499, + "grad_norm": 0.8474961542045792, + "learning_rate": 1.47965516276606e-05, + "loss": 0.0197, + "step": 11526 + }, + { + "epoch": 4.687677917852786, + "grad_norm": 4.957490572290143, + "learning_rate": 1.479566257388402e-05, + "loss": 0.134, + "step": 11527 + }, + { + "epoch": 4.688084587230581, + "grad_norm": 11.018178937765454, + "learning_rate": 1.4794773470877549e-05, + "loss": 0.3383, + "step": 11528 + }, + { + "epoch": 4.688491256608377, + "grad_norm": 16.819291069452873, + "learning_rate": 1.4793884318650314e-05, + "loss": 1.3747, + "step": 11529 + }, + { + "epoch": 4.688897925986173, + "grad_norm": 4.531021505125642, + "learning_rate": 1.4792995117211444e-05, + "loss": 0.1782, + "step": 11530 + }, + { + "epoch": 4.689304595363969, + "grad_norm": 6.706551276091772, + "learning_rate": 1.4792105866570068e-05, + "loss": 0.1416, + "step": 11531 + }, + { + "epoch": 4.689711264741765, + "grad_norm": 14.658118984888219, + "learning_rate": 1.4791216566735312e-05, + "loss": 0.8242, + "step": 11532 + }, + { + "epoch": 4.690117934119561, + "grad_norm": 7.520314287807237, + "learning_rate": 1.4790327217716307e-05, + "loss": 0.3684, + "step": 11533 + }, + { + "epoch": 4.690524603497357, + "grad_norm": 1.3141897582983362, + "learning_rate": 1.4789437819522183e-05, + "loss": 0.0196, + "step": 11534 + }, + { + "epoch": 4.690931272875153, + "grad_norm": 3.0743840918298795, + "learning_rate": 1.4788548372162068e-05, + "loss": 0.0526, + "step": 11535 + }, + { + "epoch": 4.691337942252948, + "grad_norm": 0.04553712692318309, + "learning_rate": 1.4787658875645094e-05, + "loss": 0.0008, + "step": 11536 + }, + { + "epoch": 4.691744611630744, + "grad_norm": 9.508930974682748, + "learning_rate": 1.4786769329980395e-05, + "loss": 0.1296, + "step": 11537 + }, + { + "epoch": 4.69215128100854, + "grad_norm": 
6.455836600960948, + "learning_rate": 1.4785879735177096e-05, + "loss": 0.1715, + "step": 11538 + }, + { + "epoch": 4.692557950386336, + "grad_norm": 8.248657350505018, + "learning_rate": 1.4784990091244335e-05, + "loss": 0.1941, + "step": 11539 + }, + { + "epoch": 4.6929646197641315, + "grad_norm": 9.396170013630796, + "learning_rate": 1.4784100398191242e-05, + "loss": 0.4782, + "step": 11540 + }, + { + "epoch": 4.693371289141927, + "grad_norm": 10.984500568887661, + "learning_rate": 1.478321065602695e-05, + "loss": 0.4802, + "step": 11541 + }, + { + "epoch": 4.693777958519723, + "grad_norm": 3.1418575006632987, + "learning_rate": 1.4782320864760593e-05, + "loss": 0.098, + "step": 11542 + }, + { + "epoch": 4.69418462789752, + "grad_norm": 1.9988281853944025, + "learning_rate": 1.4781431024401307e-05, + "loss": 0.0294, + "step": 11543 + }, + { + "epoch": 4.694591297275315, + "grad_norm": 8.497529424516355, + "learning_rate": 1.4780541134958224e-05, + "loss": 0.3818, + "step": 11544 + }, + { + "epoch": 4.694997966653111, + "grad_norm": 3.4330360960248334, + "learning_rate": 1.4779651196440479e-05, + "loss": 0.0806, + "step": 11545 + }, + { + "epoch": 4.695404636030907, + "grad_norm": 4.347723486089958, + "learning_rate": 1.4778761208857213e-05, + "loss": 0.0772, + "step": 11546 + }, + { + "epoch": 4.695811305408703, + "grad_norm": 7.274915450535141, + "learning_rate": 1.4777871172217555e-05, + "loss": 0.3842, + "step": 11547 + }, + { + "epoch": 4.6962179747864985, + "grad_norm": 0.4680478324663679, + "learning_rate": 1.4776981086530644e-05, + "loss": 0.0078, + "step": 11548 + }, + { + "epoch": 4.696624644164294, + "grad_norm": 6.7604087175392085, + "learning_rate": 1.4776090951805619e-05, + "loss": 0.2, + "step": 11549 + }, + { + "epoch": 4.69703131354209, + "grad_norm": 10.999976998340333, + "learning_rate": 1.4775200768051618e-05, + "loss": 0.4256, + "step": 11550 + }, + { + "epoch": 4.697437982919886, + "grad_norm": 1.5874693502954558, + "learning_rate": 1.4774310535277776e-05, + "loss": 0.0336, + "step": 11551 + }, + { + "epoch": 4.697844652297682, + "grad_norm": 4.9169947670604985, + "learning_rate": 1.4773420253493234e-05, + "loss": 0.3225, + "step": 11552 + }, + { + "epoch": 4.698251321675478, + "grad_norm": 0.666257734429551, + "learning_rate": 1.477252992270713e-05, + "loss": 0.0122, + "step": 11553 + }, + { + "epoch": 4.698657991053274, + "grad_norm": 4.6084093748067145, + "learning_rate": 1.4771639542928606e-05, + "loss": 0.0648, + "step": 11554 + }, + { + "epoch": 4.69906466043107, + "grad_norm": 0.773497138431865, + "learning_rate": 1.47707491141668e-05, + "loss": 0.0129, + "step": 11555 + }, + { + "epoch": 4.6994713298088655, + "grad_norm": 6.810385910540952, + "learning_rate": 1.4769858636430855e-05, + "loss": 0.3059, + "step": 11556 + }, + { + "epoch": 4.699877999186661, + "grad_norm": 10.01522500072375, + "learning_rate": 1.4768968109729906e-05, + "loss": 0.5614, + "step": 11557 + }, + { + "epoch": 4.700284668564457, + "grad_norm": 0.6836827342251153, + "learning_rate": 1.4768077534073105e-05, + "loss": 0.0119, + "step": 11558 + }, + { + "epoch": 4.700691337942253, + "grad_norm": 3.5928691714908623, + "learning_rate": 1.4767186909469584e-05, + "loss": 0.0857, + "step": 11559 + }, + { + "epoch": 4.701098007320049, + "grad_norm": 2.7923902375954404, + "learning_rate": 1.476629623592849e-05, + "loss": 0.0402, + "step": 11560 + }, + { + "epoch": 4.701504676697844, + "grad_norm": 6.061374020920517, + "learning_rate": 1.4765405513458968e-05, + "loss": 0.1844, + "step": 11561 + }, 
+ { + "epoch": 4.70191134607564, + "grad_norm": 0.4611795811048022, + "learning_rate": 1.476451474207016e-05, + "loss": 0.0084, + "step": 11562 + }, + { + "epoch": 4.702318015453436, + "grad_norm": 0.7324875882789805, + "learning_rate": 1.476362392177121e-05, + "loss": 0.0133, + "step": 11563 + }, + { + "epoch": 4.7027246848312325, + "grad_norm": 0.2517711772386348, + "learning_rate": 1.4762733052571263e-05, + "loss": 0.004, + "step": 11564 + }, + { + "epoch": 4.703131354209028, + "grad_norm": 13.60339871511727, + "learning_rate": 1.4761842134479463e-05, + "loss": 1.4278, + "step": 11565 + }, + { + "epoch": 4.703538023586824, + "grad_norm": 14.398456854857047, + "learning_rate": 1.476095116750496e-05, + "loss": 0.615, + "step": 11566 + }, + { + "epoch": 4.70394469296462, + "grad_norm": 7.654295931302825, + "learning_rate": 1.4760060151656892e-05, + "loss": 0.3722, + "step": 11567 + }, + { + "epoch": 4.704351362342416, + "grad_norm": 5.734207071767565, + "learning_rate": 1.4759169086944415e-05, + "loss": 0.1501, + "step": 11568 + }, + { + "epoch": 4.704758031720211, + "grad_norm": 10.697219753637162, + "learning_rate": 1.475827797337667e-05, + "loss": 0.2921, + "step": 11569 + }, + { + "epoch": 4.705164701098007, + "grad_norm": 3.4096241144380754, + "learning_rate": 1.4757386810962809e-05, + "loss": 0.2833, + "step": 11570 + }, + { + "epoch": 4.705571370475803, + "grad_norm": 5.163137069396289, + "learning_rate": 1.4756495599711976e-05, + "loss": 0.1459, + "step": 11571 + }, + { + "epoch": 4.705978039853599, + "grad_norm": 16.28422693980623, + "learning_rate": 1.4755604339633323e-05, + "loss": 0.9778, + "step": 11572 + }, + { + "epoch": 4.706384709231395, + "grad_norm": 1.8176684949020536, + "learning_rate": 1.4754713030735997e-05, + "loss": 0.0181, + "step": 11573 + }, + { + "epoch": 4.706791378609191, + "grad_norm": 2.251806689461595, + "learning_rate": 1.475382167302915e-05, + "loss": 0.0308, + "step": 11574 + }, + { + "epoch": 4.707198047986987, + "grad_norm": 13.00507019307864, + "learning_rate": 1.475293026652193e-05, + "loss": 0.6996, + "step": 11575 + }, + { + "epoch": 4.707604717364783, + "grad_norm": 7.084322643528929, + "learning_rate": 1.475203881122349e-05, + "loss": 0.2013, + "step": 11576 + }, + { + "epoch": 4.708011386742578, + "grad_norm": 4.2737425144747565, + "learning_rate": 1.4751147307142976e-05, + "loss": 0.0953, + "step": 11577 + }, + { + "epoch": 4.708418056120374, + "grad_norm": 12.351718752380597, + "learning_rate": 1.4750255754289548e-05, + "loss": 0.3824, + "step": 11578 + }, + { + "epoch": 4.70882472549817, + "grad_norm": 0.4661756472734156, + "learning_rate": 1.4749364152672354e-05, + "loss": 0.0071, + "step": 11579 + }, + { + "epoch": 4.709231394875966, + "grad_norm": 6.299681505974893, + "learning_rate": 1.4748472502300544e-05, + "loss": 0.2867, + "step": 11580 + }, + { + "epoch": 4.709638064253761, + "grad_norm": 3.7185014149970685, + "learning_rate": 1.4747580803183274e-05, + "loss": 0.0681, + "step": 11581 + }, + { + "epoch": 4.710044733631557, + "grad_norm": 4.13421008939635, + "learning_rate": 1.4746689055329699e-05, + "loss": 0.0706, + "step": 11582 + }, + { + "epoch": 4.710451403009353, + "grad_norm": 2.667434526778776, + "learning_rate": 1.474579725874897e-05, + "loss": 0.0616, + "step": 11583 + }, + { + "epoch": 4.71085807238715, + "grad_norm": 5.34838171621222, + "learning_rate": 1.4744905413450248e-05, + "loss": 0.3678, + "step": 11584 + }, + { + "epoch": 4.711264741764945, + "grad_norm": 2.4497234317511114, + "learning_rate": 
1.474401351944268e-05, + "loss": 0.0562, + "step": 11585 + }, + { + "epoch": 4.711671411142741, + "grad_norm": 0.6083365041171769, + "learning_rate": 1.4743121576735427e-05, + "loss": 0.0108, + "step": 11586 + }, + { + "epoch": 4.712078080520537, + "grad_norm": 2.9124592700227674, + "learning_rate": 1.4742229585337644e-05, + "loss": 0.0382, + "step": 11587 + }, + { + "epoch": 4.712484749898333, + "grad_norm": 0.16392042541119797, + "learning_rate": 1.4741337545258486e-05, + "loss": 0.0023, + "step": 11588 + }, + { + "epoch": 4.712891419276128, + "grad_norm": 5.911878526370607, + "learning_rate": 1.4740445456507116e-05, + "loss": 0.1846, + "step": 11589 + }, + { + "epoch": 4.713298088653924, + "grad_norm": 3.5692547516911293, + "learning_rate": 1.4739553319092683e-05, + "loss": 0.1081, + "step": 11590 + }, + { + "epoch": 4.71370475803172, + "grad_norm": 5.649760234444793, + "learning_rate": 1.4738661133024351e-05, + "loss": 0.2208, + "step": 11591 + }, + { + "epoch": 4.714111427409516, + "grad_norm": 0.9843862531211305, + "learning_rate": 1.4737768898311277e-05, + "loss": 0.0181, + "step": 11592 + }, + { + "epoch": 4.714518096787312, + "grad_norm": 5.757538383056297, + "learning_rate": 1.4736876614962624e-05, + "loss": 0.2535, + "step": 11593 + }, + { + "epoch": 4.714924766165108, + "grad_norm": 15.192234037995819, + "learning_rate": 1.4735984282987543e-05, + "loss": 0.5914, + "step": 11594 + }, + { + "epoch": 4.715331435542904, + "grad_norm": 0.4000831528193275, + "learning_rate": 1.4735091902395206e-05, + "loss": 0.0077, + "step": 11595 + }, + { + "epoch": 4.7157381049207, + "grad_norm": 6.459123003137849, + "learning_rate": 1.4734199473194762e-05, + "loss": 0.157, + "step": 11596 + }, + { + "epoch": 4.716144774298495, + "grad_norm": 0.5520154947071279, + "learning_rate": 1.4733306995395382e-05, + "loss": 0.0114, + "step": 11597 + }, + { + "epoch": 4.716551443676291, + "grad_norm": 0.16101497523726166, + "learning_rate": 1.4732414469006222e-05, + "loss": 0.0026, + "step": 11598 + }, + { + "epoch": 4.716958113054087, + "grad_norm": 0.8044973260522543, + "learning_rate": 1.4731521894036442e-05, + "loss": 0.0285, + "step": 11599 + }, + { + "epoch": 4.717364782431883, + "grad_norm": 0.1342609816404915, + "learning_rate": 1.4730629270495213e-05, + "loss": 0.0032, + "step": 11600 + }, + { + "epoch": 4.7177714518096785, + "grad_norm": 1.1588443623950235, + "learning_rate": 1.4729736598391693e-05, + "loss": 0.0188, + "step": 11601 + }, + { + "epoch": 4.718178121187474, + "grad_norm": 16.25196655047329, + "learning_rate": 1.4728843877735049e-05, + "loss": 0.5763, + "step": 11602 + }, + { + "epoch": 4.71858479056527, + "grad_norm": 6.419231254340846, + "learning_rate": 1.4727951108534438e-05, + "loss": 0.3997, + "step": 11603 + }, + { + "epoch": 4.718991459943066, + "grad_norm": 1.7926178085344422, + "learning_rate": 1.4727058290799032e-05, + "loss": 0.0255, + "step": 11604 + }, + { + "epoch": 4.7193981293208624, + "grad_norm": 11.540019979959093, + "learning_rate": 1.4726165424537993e-05, + "loss": 0.6996, + "step": 11605 + }, + { + "epoch": 4.719804798698658, + "grad_norm": 5.00595540281284, + "learning_rate": 1.4725272509760487e-05, + "loss": 0.2409, + "step": 11606 + }, + { + "epoch": 4.720211468076454, + "grad_norm": 9.094920918491193, + "learning_rate": 1.472437954647568e-05, + "loss": 0.3506, + "step": 11607 + }, + { + "epoch": 4.72061813745425, + "grad_norm": 1.468917846278472, + "learning_rate": 1.4723486534692737e-05, + "loss": 0.0235, + "step": 11608 + }, + { + "epoch": 
4.7210248068320455, + "grad_norm": 0.16206148440453813, + "learning_rate": 1.472259347442083e-05, + "loss": 0.0028, + "step": 11609 + }, + { + "epoch": 4.721431476209841, + "grad_norm": 8.352989574604983, + "learning_rate": 1.4721700365669124e-05, + "loss": 0.4147, + "step": 11610 + }, + { + "epoch": 4.721838145587637, + "grad_norm": 8.73364399564007, + "learning_rate": 1.4720807208446791e-05, + "loss": 0.3286, + "step": 11611 + }, + { + "epoch": 4.722244814965433, + "grad_norm": 5.954415521136517, + "learning_rate": 1.4719914002762989e-05, + "loss": 0.2286, + "step": 11612 + }, + { + "epoch": 4.722651484343229, + "grad_norm": 6.632321966244322, + "learning_rate": 1.47190207486269e-05, + "loss": 0.318, + "step": 11613 + }, + { + "epoch": 4.723058153721025, + "grad_norm": 13.707419081558252, + "learning_rate": 1.4718127446047685e-05, + "loss": 0.6359, + "step": 11614 + }, + { + "epoch": 4.723464823098821, + "grad_norm": 12.111785358834377, + "learning_rate": 1.4717234095034517e-05, + "loss": 0.6808, + "step": 11615 + }, + { + "epoch": 4.723871492476617, + "grad_norm": 0.6432494622489855, + "learning_rate": 1.471634069559657e-05, + "loss": 0.0048, + "step": 11616 + }, + { + "epoch": 4.7242781618544125, + "grad_norm": 3.2394931807684206, + "learning_rate": 1.4715447247743009e-05, + "loss": 0.0516, + "step": 11617 + }, + { + "epoch": 4.724684831232208, + "grad_norm": 0.20835321002824514, + "learning_rate": 1.4714553751483007e-05, + "loss": 0.0035, + "step": 11618 + }, + { + "epoch": 4.725091500610004, + "grad_norm": 3.8775765646808646, + "learning_rate": 1.4713660206825742e-05, + "loss": 0.0846, + "step": 11619 + }, + { + "epoch": 4.7254981699878, + "grad_norm": 5.942047139829553, + "learning_rate": 1.4712766613780383e-05, + "loss": 0.1248, + "step": 11620 + }, + { + "epoch": 4.725904839365596, + "grad_norm": 13.033170630825964, + "learning_rate": 1.47118729723561e-05, + "loss": 0.8006, + "step": 11621 + }, + { + "epoch": 4.726311508743391, + "grad_norm": 5.292828704076134, + "learning_rate": 1.471097928256207e-05, + "loss": 0.1207, + "step": 11622 + }, + { + "epoch": 4.726718178121187, + "grad_norm": 2.3161714001164952, + "learning_rate": 1.4710085544407468e-05, + "loss": 0.0369, + "step": 11623 + }, + { + "epoch": 4.727124847498983, + "grad_norm": 4.987318564333675, + "learning_rate": 1.470919175790147e-05, + "loss": 0.0883, + "step": 11624 + }, + { + "epoch": 4.7275315168767795, + "grad_norm": 3.0925138959224845, + "learning_rate": 1.470829792305324e-05, + "loss": 0.0605, + "step": 11625 + }, + { + "epoch": 4.727938186254575, + "grad_norm": 2.9023235713108377, + "learning_rate": 1.4707404039871969e-05, + "loss": 0.0378, + "step": 11626 + }, + { + "epoch": 4.728344855632371, + "grad_norm": 3.043477903533509, + "learning_rate": 1.4706510108366825e-05, + "loss": 0.0551, + "step": 11627 + }, + { + "epoch": 4.728751525010167, + "grad_norm": 7.043413378945518, + "learning_rate": 1.4705616128546985e-05, + "loss": 0.1763, + "step": 11628 + }, + { + "epoch": 4.729158194387963, + "grad_norm": 3.79177016126587, + "learning_rate": 1.470472210042163e-05, + "loss": 0.1621, + "step": 11629 + }, + { + "epoch": 4.729564863765758, + "grad_norm": 1.690535726845844, + "learning_rate": 1.4703828023999931e-05, + "loss": 0.0242, + "step": 11630 + }, + { + "epoch": 4.729971533143554, + "grad_norm": 0.5292380216675839, + "learning_rate": 1.470293389929107e-05, + "loss": 0.0105, + "step": 11631 + }, + { + "epoch": 4.73037820252135, + "grad_norm": 1.1955228938659799, + "learning_rate": 1.4702039726304227e-05, + 
"loss": 0.015, + "step": 11632 + }, + { + "epoch": 4.730784871899146, + "grad_norm": 0.5769400991665286, + "learning_rate": 1.4701145505048582e-05, + "loss": 0.0123, + "step": 11633 + }, + { + "epoch": 4.731191541276942, + "grad_norm": 0.027462912095126045, + "learning_rate": 1.470025123553331e-05, + "loss": 0.0005, + "step": 11634 + }, + { + "epoch": 4.731598210654738, + "grad_norm": 4.023593117195137, + "learning_rate": 1.4699356917767593e-05, + "loss": 0.0518, + "step": 11635 + }, + { + "epoch": 4.732004880032534, + "grad_norm": 1.025742848065545, + "learning_rate": 1.4698462551760612e-05, + "loss": 0.0299, + "step": 11636 + }, + { + "epoch": 4.73241154941033, + "grad_norm": 2.676847914777053, + "learning_rate": 1.4697568137521549e-05, + "loss": 0.065, + "step": 11637 + }, + { + "epoch": 4.732818218788125, + "grad_norm": 4.312022529565173, + "learning_rate": 1.4696673675059584e-05, + "loss": 0.1469, + "step": 11638 + }, + { + "epoch": 4.733224888165921, + "grad_norm": 6.874189942087453, + "learning_rate": 1.4695779164383903e-05, + "loss": 0.181, + "step": 11639 + }, + { + "epoch": 4.733631557543717, + "grad_norm": 3.5182404060471604, + "learning_rate": 1.4694884605503683e-05, + "loss": 0.0668, + "step": 11640 + }, + { + "epoch": 4.734038226921513, + "grad_norm": 0.3037806435232879, + "learning_rate": 1.4693989998428108e-05, + "loss": 0.005, + "step": 11641 + }, + { + "epoch": 4.734444896299308, + "grad_norm": 1.5254266320004168, + "learning_rate": 1.4693095343166369e-05, + "loss": 0.0237, + "step": 11642 + }, + { + "epoch": 4.734851565677104, + "grad_norm": 13.166322334179222, + "learning_rate": 1.4692200639727639e-05, + "loss": 0.5089, + "step": 11643 + }, + { + "epoch": 4.7352582350549, + "grad_norm": 4.618660499024342, + "learning_rate": 1.4691305888121108e-05, + "loss": 0.1014, + "step": 11644 + }, + { + "epoch": 4.735664904432696, + "grad_norm": 3.066079542935498, + "learning_rate": 1.4690411088355963e-05, + "loss": 0.0742, + "step": 11645 + }, + { + "epoch": 4.736071573810492, + "grad_norm": 4.622699123065825, + "learning_rate": 1.4689516240441389e-05, + "loss": 0.0902, + "step": 11646 + }, + { + "epoch": 4.736478243188288, + "grad_norm": 4.200172362390155, + "learning_rate": 1.4688621344386569e-05, + "loss": 0.1535, + "step": 11647 + }, + { + "epoch": 4.736884912566084, + "grad_norm": 11.105268057355918, + "learning_rate": 1.4687726400200688e-05, + "loss": 0.4456, + "step": 11648 + }, + { + "epoch": 4.73729158194388, + "grad_norm": 3.294374160518849, + "learning_rate": 1.468683140789294e-05, + "loss": 0.1147, + "step": 11649 + }, + { + "epoch": 4.737698251321675, + "grad_norm": 4.713439885791199, + "learning_rate": 1.468593636747251e-05, + "loss": 0.1763, + "step": 11650 + }, + { + "epoch": 4.738104920699471, + "grad_norm": 6.452005855479704, + "learning_rate": 1.468504127894858e-05, + "loss": 0.1137, + "step": 11651 + }, + { + "epoch": 4.738511590077267, + "grad_norm": 11.29816692778356, + "learning_rate": 1.4684146142330344e-05, + "loss": 0.5628, + "step": 11652 + }, + { + "epoch": 4.738918259455063, + "grad_norm": 5.036242722479435, + "learning_rate": 1.4683250957626994e-05, + "loss": 0.1696, + "step": 11653 + }, + { + "epoch": 4.739324928832859, + "grad_norm": 5.69567192641519, + "learning_rate": 1.4682355724847715e-05, + "loss": 0.2314, + "step": 11654 + }, + { + "epoch": 4.739731598210655, + "grad_norm": 7.704569040307002, + "learning_rate": 1.4681460444001698e-05, + "loss": 0.1611, + "step": 11655 + }, + { + "epoch": 4.740138267588451, + "grad_norm": 6.439460608171691, + 
"learning_rate": 1.4680565115098132e-05, + "loss": 0.212, + "step": 11656 + }, + { + "epoch": 4.740544936966247, + "grad_norm": 6.284408232468488, + "learning_rate": 1.467966973814621e-05, + "loss": 0.2154, + "step": 11657 + }, + { + "epoch": 4.7409516063440424, + "grad_norm": 7.225284509162633, + "learning_rate": 1.4678774313155123e-05, + "loss": 0.5198, + "step": 11658 + }, + { + "epoch": 4.741358275721838, + "grad_norm": 9.051108462831937, + "learning_rate": 1.467787884013406e-05, + "loss": 0.2021, + "step": 11659 + }, + { + "epoch": 4.741764945099634, + "grad_norm": 3.126009267421711, + "learning_rate": 1.467698331909222e-05, + "loss": 0.0807, + "step": 11660 + }, + { + "epoch": 4.74217161447743, + "grad_norm": 0.1260334182297713, + "learning_rate": 1.4676087750038792e-05, + "loss": 0.0026, + "step": 11661 + }, + { + "epoch": 4.7425782838552255, + "grad_norm": 4.1384365133903875, + "learning_rate": 1.467519213298297e-05, + "loss": 0.0869, + "step": 11662 + }, + { + "epoch": 4.742984953233021, + "grad_norm": 9.226164766641052, + "learning_rate": 1.4674296467933946e-05, + "loss": 0.379, + "step": 11663 + }, + { + "epoch": 4.743391622610817, + "grad_norm": 2.5194672505249396, + "learning_rate": 1.4673400754900917e-05, + "loss": 0.0372, + "step": 11664 + }, + { + "epoch": 4.743798291988613, + "grad_norm": 5.220645837090641, + "learning_rate": 1.4672504993893078e-05, + "loss": 0.0994, + "step": 11665 + }, + { + "epoch": 4.7442049613664095, + "grad_norm": 11.880029931276558, + "learning_rate": 1.4671609184919622e-05, + "loss": 0.3642, + "step": 11666 + }, + { + "epoch": 4.744611630744205, + "grad_norm": 2.5693257401241176, + "learning_rate": 1.4670713327989747e-05, + "loss": 0.0624, + "step": 11667 + }, + { + "epoch": 4.745018300122001, + "grad_norm": 10.980284431468437, + "learning_rate": 1.4669817423112652e-05, + "loss": 0.6324, + "step": 11668 + }, + { + "epoch": 4.745424969499797, + "grad_norm": 6.240668129639849, + "learning_rate": 1.4668921470297528e-05, + "loss": 0.1052, + "step": 11669 + }, + { + "epoch": 4.7458316388775925, + "grad_norm": 8.700258872054446, + "learning_rate": 1.4668025469553576e-05, + "loss": 0.3827, + "step": 11670 + }, + { + "epoch": 4.746238308255388, + "grad_norm": 2.996175640766963, + "learning_rate": 1.4667129420889992e-05, + "loss": 0.1583, + "step": 11671 + }, + { + "epoch": 4.746644977633184, + "grad_norm": 15.439045626193101, + "learning_rate": 1.4666233324315977e-05, + "loss": 0.2186, + "step": 11672 + }, + { + "epoch": 4.74705164701098, + "grad_norm": 5.3738876953325425, + "learning_rate": 1.4665337179840727e-05, + "loss": 0.1833, + "step": 11673 + }, + { + "epoch": 4.747458316388776, + "grad_norm": 3.1488918470679863, + "learning_rate": 1.4664440987473443e-05, + "loss": 0.0671, + "step": 11674 + }, + { + "epoch": 4.747864985766572, + "grad_norm": 0.36325087507389425, + "learning_rate": 1.4663544747223326e-05, + "loss": 0.0072, + "step": 11675 + }, + { + "epoch": 4.748271655144368, + "grad_norm": 7.201888777238531, + "learning_rate": 1.4662648459099573e-05, + "loss": 0.2289, + "step": 11676 + }, + { + "epoch": 4.748678324522164, + "grad_norm": 6.6423923611709, + "learning_rate": 1.466175212311139e-05, + "loss": 0.2091, + "step": 11677 + }, + { + "epoch": 4.7490849938999595, + "grad_norm": 11.157658818383773, + "learning_rate": 1.4660855739267973e-05, + "loss": 0.2511, + "step": 11678 + }, + { + "epoch": 4.749491663277755, + "grad_norm": 10.387905182026907, + "learning_rate": 1.4659959307578527e-05, + "loss": 0.156, + "step": 11679 + }, + { + "epoch": 
4.749898332655551, + "grad_norm": 5.93207709626483, + "learning_rate": 1.4659062828052253e-05, + "loss": 0.1735, + "step": 11680 + }, + { + "epoch": 4.750305002033347, + "grad_norm": 13.580720043331702, + "learning_rate": 1.4658166300698355e-05, + "loss": 0.7804, + "step": 11681 + }, + { + "epoch": 4.750711671411143, + "grad_norm": 4.970798773035022, + "learning_rate": 1.4657269725526035e-05, + "loss": 0.1098, + "step": 11682 + }, + { + "epoch": 4.751118340788938, + "grad_norm": 4.076138857016813, + "learning_rate": 1.4656373102544497e-05, + "loss": 0.1117, + "step": 11683 + }, + { + "epoch": 4.751525010166734, + "grad_norm": 12.054036568183154, + "learning_rate": 1.4655476431762946e-05, + "loss": 0.4622, + "step": 11684 + }, + { + "epoch": 4.75193167954453, + "grad_norm": 4.115157404082399, + "learning_rate": 1.4654579713190586e-05, + "loss": 0.07, + "step": 11685 + }, + { + "epoch": 4.752338348922326, + "grad_norm": 3.3790666637902795, + "learning_rate": 1.4653682946836623e-05, + "loss": 0.0547, + "step": 11686 + }, + { + "epoch": 4.752745018300122, + "grad_norm": 9.887453081578258, + "learning_rate": 1.4652786132710262e-05, + "loss": 0.4508, + "step": 11687 + }, + { + "epoch": 4.753151687677918, + "grad_norm": 11.04241184885488, + "learning_rate": 1.465188927082071e-05, + "loss": 0.4002, + "step": 11688 + }, + { + "epoch": 4.753558357055714, + "grad_norm": 6.898426303633031, + "learning_rate": 1.4650992361177172e-05, + "loss": 0.2887, + "step": 11689 + }, + { + "epoch": 4.75396502643351, + "grad_norm": 5.563341918273, + "learning_rate": 1.4650095403788858e-05, + "loss": 0.1185, + "step": 11690 + }, + { + "epoch": 4.754371695811305, + "grad_norm": 5.010027587652214, + "learning_rate": 1.4649198398664974e-05, + "loss": 0.1545, + "step": 11691 + }, + { + "epoch": 4.754778365189101, + "grad_norm": 4.816532621114185, + "learning_rate": 1.464830134581473e-05, + "loss": 0.2023, + "step": 11692 + }, + { + "epoch": 4.755185034566897, + "grad_norm": 9.031033235105062, + "learning_rate": 1.464740424524733e-05, + "loss": 0.2479, + "step": 11693 + }, + { + "epoch": 4.755591703944693, + "grad_norm": 12.401883879418413, + "learning_rate": 1.464650709697199e-05, + "loss": 0.4852, + "step": 11694 + }, + { + "epoch": 4.755998373322489, + "grad_norm": 4.170261343931192, + "learning_rate": 1.4645609900997915e-05, + "loss": 0.1578, + "step": 11695 + }, + { + "epoch": 4.756405042700285, + "grad_norm": 5.115126195840982, + "learning_rate": 1.4644712657334315e-05, + "loss": 0.1905, + "step": 11696 + }, + { + "epoch": 4.756811712078081, + "grad_norm": 0.07778887056998594, + "learning_rate": 1.4643815365990402e-05, + "loss": 0.0012, + "step": 11697 + }, + { + "epoch": 4.757218381455877, + "grad_norm": 4.835155359873071, + "learning_rate": 1.4642918026975388e-05, + "loss": 0.0785, + "step": 11698 + }, + { + "epoch": 4.757625050833672, + "grad_norm": 8.795605594575248, + "learning_rate": 1.4642020640298483e-05, + "loss": 0.1551, + "step": 11699 + }, + { + "epoch": 4.758031720211468, + "grad_norm": 5.734234989172708, + "learning_rate": 1.4641123205968901e-05, + "loss": 0.2229, + "step": 11700 + }, + { + "epoch": 4.758438389589264, + "grad_norm": 5.876824535951579, + "learning_rate": 1.4640225723995853e-05, + "loss": 0.1915, + "step": 11701 + }, + { + "epoch": 4.75884505896706, + "grad_norm": 0.060785898478534946, + "learning_rate": 1.4639328194388551e-05, + "loss": 0.001, + "step": 11702 + }, + { + "epoch": 4.759251728344855, + "grad_norm": 4.035051996378988, + "learning_rate": 1.4638430617156214e-05, + "loss": 
0.1526, + "step": 11703 + }, + { + "epoch": 4.759658397722651, + "grad_norm": 4.325224459920925, + "learning_rate": 1.463753299230805e-05, + "loss": 0.3905, + "step": 11704 + }, + { + "epoch": 4.760065067100447, + "grad_norm": 5.069912655174877, + "learning_rate": 1.4636635319853274e-05, + "loss": 0.1035, + "step": 11705 + }, + { + "epoch": 4.760471736478243, + "grad_norm": 6.360292785377119, + "learning_rate": 1.4635737599801105e-05, + "loss": 0.2138, + "step": 11706 + }, + { + "epoch": 4.760878405856039, + "grad_norm": 0.6628663059231619, + "learning_rate": 1.4634839832160757e-05, + "loss": 0.0136, + "step": 11707 + }, + { + "epoch": 4.761285075233835, + "grad_norm": 0.15850591020861293, + "learning_rate": 1.4633942016941442e-05, + "loss": 0.0033, + "step": 11708 + }, + { + "epoch": 4.761691744611631, + "grad_norm": 16.37856084686627, + "learning_rate": 1.4633044154152386e-05, + "loss": 0.8176, + "step": 11709 + }, + { + "epoch": 4.762098413989427, + "grad_norm": 3.925878791863524, + "learning_rate": 1.4632146243802793e-05, + "loss": 0.126, + "step": 11710 + }, + { + "epoch": 4.7625050833672224, + "grad_norm": 0.6247418280534635, + "learning_rate": 1.4631248285901891e-05, + "loss": 0.0133, + "step": 11711 + }, + { + "epoch": 4.762911752745018, + "grad_norm": 1.2739155300323013, + "learning_rate": 1.4630350280458895e-05, + "loss": 0.0235, + "step": 11712 + }, + { + "epoch": 4.763318422122814, + "grad_norm": 0.5307167646387124, + "learning_rate": 1.4629452227483024e-05, + "loss": 0.0134, + "step": 11713 + }, + { + "epoch": 4.76372509150061, + "grad_norm": 6.763202756140637, + "learning_rate": 1.4628554126983495e-05, + "loss": 0.1819, + "step": 11714 + }, + { + "epoch": 4.7641317608784055, + "grad_norm": 9.445372524521359, + "learning_rate": 1.4627655978969526e-05, + "loss": 0.6095, + "step": 11715 + }, + { + "epoch": 4.764538430256202, + "grad_norm": 0.9768113965903366, + "learning_rate": 1.462675778345034e-05, + "loss": 0.0177, + "step": 11716 + }, + { + "epoch": 4.764945099633998, + "grad_norm": 4.749624199391748, + "learning_rate": 1.4625859540435158e-05, + "loss": 0.0998, + "step": 11717 + }, + { + "epoch": 4.765351769011794, + "grad_norm": 9.230600491670058, + "learning_rate": 1.46249612499332e-05, + "loss": 0.2395, + "step": 11718 + }, + { + "epoch": 4.7657584383895895, + "grad_norm": 0.9425585825432793, + "learning_rate": 1.4624062911953686e-05, + "loss": 0.0144, + "step": 11719 + }, + { + "epoch": 4.766165107767385, + "grad_norm": 0.9977276219878128, + "learning_rate": 1.462316452650584e-05, + "loss": 0.0168, + "step": 11720 + }, + { + "epoch": 4.766571777145181, + "grad_norm": 5.925912032777616, + "learning_rate": 1.4622266093598883e-05, + "loss": 0.1328, + "step": 11721 + }, + { + "epoch": 4.766978446522977, + "grad_norm": 13.849636891625881, + "learning_rate": 1.4621367613242037e-05, + "loss": 0.5504, + "step": 11722 + }, + { + "epoch": 4.7673851159007725, + "grad_norm": 6.113507348669197, + "learning_rate": 1.4620469085444528e-05, + "loss": 0.5093, + "step": 11723 + }, + { + "epoch": 4.767791785278568, + "grad_norm": 10.515492826179525, + "learning_rate": 1.4619570510215577e-05, + "loss": 0.4382, + "step": 11724 + }, + { + "epoch": 4.768198454656364, + "grad_norm": 6.9972325207382315, + "learning_rate": 1.461867188756441e-05, + "loss": 0.2663, + "step": 11725 + }, + { + "epoch": 4.76860512403416, + "grad_norm": 1.2436744456205024, + "learning_rate": 1.4617773217500253e-05, + "loss": 0.0284, + "step": 11726 + }, + { + "epoch": 4.7690117934119565, + "grad_norm": 
13.903666802571683, + "learning_rate": 1.461687450003233e-05, + "loss": 0.3228, + "step": 11727 + }, + { + "epoch": 4.769418462789752, + "grad_norm": 3.688487353812832, + "learning_rate": 1.4615975735169865e-05, + "loss": 0.1177, + "step": 11728 + }, + { + "epoch": 4.769825132167548, + "grad_norm": 10.291623798751967, + "learning_rate": 1.4615076922922086e-05, + "loss": 0.5033, + "step": 11729 + }, + { + "epoch": 4.770231801545344, + "grad_norm": 7.071513148739822, + "learning_rate": 1.4614178063298221e-05, + "loss": 0.4343, + "step": 11730 + }, + { + "epoch": 4.7706384709231395, + "grad_norm": 0.06808075625810564, + "learning_rate": 1.4613279156307495e-05, + "loss": 0.0017, + "step": 11731 + }, + { + "epoch": 4.771045140300935, + "grad_norm": 5.97243892215105, + "learning_rate": 1.4612380201959137e-05, + "loss": 0.1575, + "step": 11732 + }, + { + "epoch": 4.771451809678731, + "grad_norm": 1.3223367681464715, + "learning_rate": 1.4611481200262375e-05, + "loss": 0.0226, + "step": 11733 + }, + { + "epoch": 4.771858479056527, + "grad_norm": 6.721955269810075, + "learning_rate": 1.4610582151226436e-05, + "loss": 0.1458, + "step": 11734 + }, + { + "epoch": 4.772265148434323, + "grad_norm": 9.66161288408403, + "learning_rate": 1.4609683054860552e-05, + "loss": 0.2926, + "step": 11735 + }, + { + "epoch": 4.772671817812119, + "grad_norm": 2.7504281227552814, + "learning_rate": 1.4608783911173951e-05, + "loss": 0.0638, + "step": 11736 + }, + { + "epoch": 4.773078487189915, + "grad_norm": 3.6408898336297875, + "learning_rate": 1.4607884720175864e-05, + "loss": 0.0687, + "step": 11737 + }, + { + "epoch": 4.773485156567711, + "grad_norm": 3.7690078862456815, + "learning_rate": 1.4606985481875522e-05, + "loss": 0.047, + "step": 11738 + }, + { + "epoch": 4.7738918259455065, + "grad_norm": 2.007537254773084, + "learning_rate": 1.4606086196282156e-05, + "loss": 0.0316, + "step": 11739 + }, + { + "epoch": 4.774298495323302, + "grad_norm": 5.132161691101198, + "learning_rate": 1.4605186863404997e-05, + "loss": 0.1911, + "step": 11740 + }, + { + "epoch": 4.774705164701098, + "grad_norm": 8.596204196527136, + "learning_rate": 1.4604287483253272e-05, + "loss": 0.2014, + "step": 11741 + }, + { + "epoch": 4.775111834078894, + "grad_norm": 9.887923493660532, + "learning_rate": 1.4603388055836222e-05, + "loss": 0.2333, + "step": 11742 + }, + { + "epoch": 4.77551850345669, + "grad_norm": 0.7226941296618993, + "learning_rate": 1.460248858116308e-05, + "loss": 0.0138, + "step": 11743 + }, + { + "epoch": 4.775925172834485, + "grad_norm": 7.355187105444149, + "learning_rate": 1.460158905924307e-05, + "loss": 0.2432, + "step": 11744 + }, + { + "epoch": 4.776331842212281, + "grad_norm": 3.2714442235347376, + "learning_rate": 1.4600689490085437e-05, + "loss": 0.0836, + "step": 11745 + }, + { + "epoch": 4.776738511590077, + "grad_norm": 0.1713921233887159, + "learning_rate": 1.4599789873699413e-05, + "loss": 0.0025, + "step": 11746 + }, + { + "epoch": 4.777145180967873, + "grad_norm": 8.071689641869625, + "learning_rate": 1.4598890210094225e-05, + "loss": 0.2422, + "step": 11747 + }, + { + "epoch": 4.777551850345669, + "grad_norm": 1.9441850525311162, + "learning_rate": 1.4597990499279118e-05, + "loss": 0.0336, + "step": 11748 + }, + { + "epoch": 4.777958519723465, + "grad_norm": 2.4320582831142037, + "learning_rate": 1.4597090741263324e-05, + "loss": 0.0441, + "step": 11749 + }, + { + "epoch": 4.778365189101261, + "grad_norm": 0.20068983842520416, + "learning_rate": 1.459619093605608e-05, + "loss": 0.0036, + "step": 11750 
+ }, + { + "epoch": 4.778771858479057, + "grad_norm": 0.613942322510227, + "learning_rate": 1.459529108366662e-05, + "loss": 0.0114, + "step": 11751 + }, + { + "epoch": 4.779178527856852, + "grad_norm": 0.39876069549366505, + "learning_rate": 1.4594391184104189e-05, + "loss": 0.0056, + "step": 11752 + }, + { + "epoch": 4.779585197234648, + "grad_norm": 0.2807750145469513, + "learning_rate": 1.4593491237378018e-05, + "loss": 0.0057, + "step": 11753 + }, + { + "epoch": 4.779991866612444, + "grad_norm": 3.9500415522164385, + "learning_rate": 1.4592591243497348e-05, + "loss": 0.062, + "step": 11754 + }, + { + "epoch": 4.78039853599024, + "grad_norm": 11.011713907962017, + "learning_rate": 1.4591691202471417e-05, + "loss": 0.3275, + "step": 11755 + }, + { + "epoch": 4.780805205368035, + "grad_norm": 1.7045156364627405, + "learning_rate": 1.4590791114309464e-05, + "loss": 0.0332, + "step": 11756 + }, + { + "epoch": 4.781211874745832, + "grad_norm": 7.741927944186168, + "learning_rate": 1.4589890979020731e-05, + "loss": 0.3916, + "step": 11757 + }, + { + "epoch": 4.781618544123628, + "grad_norm": 10.786555266320832, + "learning_rate": 1.4588990796614458e-05, + "loss": 0.7877, + "step": 11758 + }, + { + "epoch": 4.782025213501424, + "grad_norm": 2.257053261562003, + "learning_rate": 1.4588090567099882e-05, + "loss": 0.0419, + "step": 11759 + }, + { + "epoch": 4.782431882879219, + "grad_norm": 7.074469935163574, + "learning_rate": 1.4587190290486249e-05, + "loss": 0.3422, + "step": 11760 + }, + { + "epoch": 4.782838552257015, + "grad_norm": 7.81556197958579, + "learning_rate": 1.4586289966782795e-05, + "loss": 0.1484, + "step": 11761 + }, + { + "epoch": 4.783245221634811, + "grad_norm": 3.3342820208847717, + "learning_rate": 1.4585389595998773e-05, + "loss": 0.1405, + "step": 11762 + }, + { + "epoch": 4.783651891012607, + "grad_norm": 4.410991342044521, + "learning_rate": 1.4584489178143413e-05, + "loss": 0.2001, + "step": 11763 + }, + { + "epoch": 4.7840585603904024, + "grad_norm": 5.259894535850208, + "learning_rate": 1.458358871322597e-05, + "loss": 0.1452, + "step": 11764 + }, + { + "epoch": 4.784465229768198, + "grad_norm": 0.6447409480153243, + "learning_rate": 1.458268820125568e-05, + "loss": 0.0101, + "step": 11765 + }, + { + "epoch": 4.784871899145994, + "grad_norm": 12.71662422772487, + "learning_rate": 1.4581787642241784e-05, + "loss": 0.7332, + "step": 11766 + }, + { + "epoch": 4.78527856852379, + "grad_norm": 7.187628629818835, + "learning_rate": 1.4580887036193539e-05, + "loss": 0.0442, + "step": 11767 + }, + { + "epoch": 4.785685237901586, + "grad_norm": 1.3670785137359036, + "learning_rate": 1.457998638312018e-05, + "loss": 0.0272, + "step": 11768 + }, + { + "epoch": 4.786091907279382, + "grad_norm": 2.8528154315238, + "learning_rate": 1.4579085683030953e-05, + "loss": 0.0672, + "step": 11769 + }, + { + "epoch": 4.786498576657178, + "grad_norm": 8.875928617545817, + "learning_rate": 1.4578184935935111e-05, + "loss": 0.4113, + "step": 11770 + }, + { + "epoch": 4.786905246034974, + "grad_norm": 11.024013632734224, + "learning_rate": 1.4577284141841896e-05, + "loss": 0.4073, + "step": 11771 + }, + { + "epoch": 4.7873119154127695, + "grad_norm": 5.609418523445768, + "learning_rate": 1.4576383300760554e-05, + "loss": 0.1406, + "step": 11772 + }, + { + "epoch": 4.787718584790565, + "grad_norm": 13.672368447799473, + "learning_rate": 1.4575482412700334e-05, + "loss": 0.484, + "step": 11773 + }, + { + "epoch": 4.788125254168361, + "grad_norm": 0.24607890038223582, + "learning_rate": 
1.4574581477670486e-05, + "loss": 0.004, + "step": 11774 + }, + { + "epoch": 4.788531923546157, + "grad_norm": 5.481710561753536, + "learning_rate": 1.4573680495680258e-05, + "loss": 0.2138, + "step": 11775 + }, + { + "epoch": 4.7889385929239525, + "grad_norm": 2.9383435666117816, + "learning_rate": 1.4572779466738893e-05, + "loss": 0.0785, + "step": 11776 + }, + { + "epoch": 4.789345262301749, + "grad_norm": 5.319943476574858, + "learning_rate": 1.4571878390855649e-05, + "loss": 0.1246, + "step": 11777 + }, + { + "epoch": 4.789751931679545, + "grad_norm": 4.0668865985789555, + "learning_rate": 1.4570977268039772e-05, + "loss": 0.0808, + "step": 11778 + }, + { + "epoch": 4.790158601057341, + "grad_norm": 2.8139461095744154, + "learning_rate": 1.4570076098300512e-05, + "loss": 0.0443, + "step": 11779 + }, + { + "epoch": 4.7905652704351365, + "grad_norm": 9.363500383452246, + "learning_rate": 1.4569174881647121e-05, + "loss": 0.1722, + "step": 11780 + }, + { + "epoch": 4.790971939812932, + "grad_norm": 12.92101672111702, + "learning_rate": 1.4568273618088851e-05, + "loss": 0.6264, + "step": 11781 + }, + { + "epoch": 4.791378609190728, + "grad_norm": 0.239771751161878, + "learning_rate": 1.456737230763495e-05, + "loss": 0.0042, + "step": 11782 + }, + { + "epoch": 4.791785278568524, + "grad_norm": 9.356135511828622, + "learning_rate": 1.4566470950294678e-05, + "loss": 0.4748, + "step": 11783 + }, + { + "epoch": 4.7921919479463195, + "grad_norm": 0.39133355155743715, + "learning_rate": 1.4565569546077283e-05, + "loss": 0.0074, + "step": 11784 + }, + { + "epoch": 4.792598617324115, + "grad_norm": 4.773502924941169, + "learning_rate": 1.4564668094992018e-05, + "loss": 0.0912, + "step": 11785 + }, + { + "epoch": 4.793005286701911, + "grad_norm": 1.1631234317954942, + "learning_rate": 1.4563766597048135e-05, + "loss": 0.0148, + "step": 11786 + }, + { + "epoch": 4.793411956079707, + "grad_norm": 0.2121821027318087, + "learning_rate": 1.4562865052254892e-05, + "loss": 0.0038, + "step": 11787 + }, + { + "epoch": 4.793818625457503, + "grad_norm": 1.424410780723568, + "learning_rate": 1.4561963460621544e-05, + "loss": 0.0241, + "step": 11788 + }, + { + "epoch": 4.794225294835299, + "grad_norm": 1.7267498245395363, + "learning_rate": 1.4561061822157345e-05, + "loss": 0.0211, + "step": 11789 + }, + { + "epoch": 4.794631964213095, + "grad_norm": 4.501480227198271, + "learning_rate": 1.4560160136871551e-05, + "loss": 0.1293, + "step": 11790 + }, + { + "epoch": 4.795038633590891, + "grad_norm": 10.6339638825535, + "learning_rate": 1.4559258404773418e-05, + "loss": 0.3557, + "step": 11791 + }, + { + "epoch": 4.7954453029686865, + "grad_norm": 15.56248047680353, + "learning_rate": 1.4558356625872202e-05, + "loss": 0.1117, + "step": 11792 + }, + { + "epoch": 4.795851972346482, + "grad_norm": 0.2959883952696401, + "learning_rate": 1.4557454800177161e-05, + "loss": 0.0051, + "step": 11793 + }, + { + "epoch": 4.796258641724278, + "grad_norm": 1.1206584264365618, + "learning_rate": 1.4556552927697554e-05, + "loss": 0.016, + "step": 11794 + }, + { + "epoch": 4.796665311102074, + "grad_norm": 3.1296262712572944, + "learning_rate": 1.455565100844264e-05, + "loss": 0.0371, + "step": 11795 + }, + { + "epoch": 4.79707198047987, + "grad_norm": 5.97530614575358, + "learning_rate": 1.455474904242167e-05, + "loss": 0.0846, + "step": 11796 + }, + { + "epoch": 4.797478649857665, + "grad_norm": 4.950643155046099, + "learning_rate": 1.4553847029643913e-05, + "loss": 0.1429, + "step": 11797 + }, + { + "epoch": 4.797885319235462, 
+ "grad_norm": 0.3232306166622246, + "learning_rate": 1.4552944970118623e-05, + "loss": 0.0056, + "step": 11798 + }, + { + "epoch": 4.798291988613258, + "grad_norm": 0.5930289798882535, + "learning_rate": 1.4552042863855061e-05, + "loss": 0.0079, + "step": 11799 + }, + { + "epoch": 4.7986986579910536, + "grad_norm": 2.5635062843484366, + "learning_rate": 1.4551140710862489e-05, + "loss": 0.0725, + "step": 11800 + }, + { + "epoch": 4.799105327368849, + "grad_norm": 9.752067221112183, + "learning_rate": 1.4550238511150166e-05, + "loss": 0.365, + "step": 11801 + }, + { + "epoch": 4.799511996746645, + "grad_norm": 7.203038016322113, + "learning_rate": 1.4549336264727354e-05, + "loss": 0.2266, + "step": 11802 + }, + { + "epoch": 4.799918666124441, + "grad_norm": 20.49172720699393, + "learning_rate": 1.454843397160332e-05, + "loss": 1.294, + "step": 11803 + }, + { + "epoch": 4.800325335502237, + "grad_norm": 4.911835306761763, + "learning_rate": 1.4547531631787319e-05, + "loss": 0.0875, + "step": 11804 + }, + { + "epoch": 4.800732004880032, + "grad_norm": 2.230176354981368, + "learning_rate": 1.4546629245288615e-05, + "loss": 0.0563, + "step": 11805 + }, + { + "epoch": 4.801138674257828, + "grad_norm": 7.513480466197426, + "learning_rate": 1.4545726812116477e-05, + "loss": 0.32, + "step": 11806 + }, + { + "epoch": 4.801545343635624, + "grad_norm": 4.763731908204884, + "learning_rate": 1.4544824332280164e-05, + "loss": 0.0789, + "step": 11807 + }, + { + "epoch": 4.80195201301342, + "grad_norm": 8.682736392646106, + "learning_rate": 1.454392180578894e-05, + "loss": 0.4146, + "step": 11808 + }, + { + "epoch": 4.802358682391216, + "grad_norm": 8.390910276022277, + "learning_rate": 1.4543019232652076e-05, + "loss": 0.5103, + "step": 11809 + }, + { + "epoch": 4.802765351769012, + "grad_norm": 1.4493470607944272, + "learning_rate": 1.4542116612878829e-05, + "loss": 0.0183, + "step": 11810 + }, + { + "epoch": 4.803172021146808, + "grad_norm": 0.36579139866909144, + "learning_rate": 1.4541213946478469e-05, + "loss": 0.0046, + "step": 11811 + }, + { + "epoch": 4.803578690524604, + "grad_norm": 9.907852337337069, + "learning_rate": 1.4540311233460265e-05, + "loss": 0.183, + "step": 11812 + }, + { + "epoch": 4.803985359902399, + "grad_norm": 9.409872216244846, + "learning_rate": 1.453940847383348e-05, + "loss": 0.2278, + "step": 11813 + }, + { + "epoch": 4.804392029280195, + "grad_norm": 1.7363827426124983, + "learning_rate": 1.4538505667607381e-05, + "loss": 0.0227, + "step": 11814 + }, + { + "epoch": 4.804798698657991, + "grad_norm": 6.363363103738212, + "learning_rate": 1.4537602814791241e-05, + "loss": 0.1363, + "step": 11815 + }, + { + "epoch": 4.805205368035787, + "grad_norm": 7.098565244667987, + "learning_rate": 1.4536699915394322e-05, + "loss": 0.1715, + "step": 11816 + }, + { + "epoch": 4.8056120374135824, + "grad_norm": 1.6213349042060625, + "learning_rate": 1.4535796969425894e-05, + "loss": 0.0243, + "step": 11817 + }, + { + "epoch": 4.806018706791379, + "grad_norm": 10.399413702487717, + "learning_rate": 1.4534893976895227e-05, + "loss": 0.4531, + "step": 11818 + }, + { + "epoch": 4.806425376169175, + "grad_norm": 8.26979692567963, + "learning_rate": 1.4533990937811595e-05, + "loss": 0.2034, + "step": 11819 + }, + { + "epoch": 4.806832045546971, + "grad_norm": 7.081715576938495, + "learning_rate": 1.4533087852184262e-05, + "loss": 0.2205, + "step": 11820 + }, + { + "epoch": 4.807238714924766, + "grad_norm": 5.6117041591622945, + "learning_rate": 1.4532184720022497e-05, + "loss": 0.1027, + 
"step": 11821 + }, + { + "epoch": 4.807645384302562, + "grad_norm": 6.114874027920212, + "learning_rate": 1.4531281541335581e-05, + "loss": 0.1273, + "step": 11822 + }, + { + "epoch": 4.808052053680358, + "grad_norm": 5.603050807080975, + "learning_rate": 1.4530378316132776e-05, + "loss": 0.305, + "step": 11823 + }, + { + "epoch": 4.808458723058154, + "grad_norm": 9.547719492812865, + "learning_rate": 1.4529475044423356e-05, + "loss": 0.4962, + "step": 11824 + }, + { + "epoch": 4.8088653924359495, + "grad_norm": 11.960724753012968, + "learning_rate": 1.4528571726216597e-05, + "loss": 0.4433, + "step": 11825 + }, + { + "epoch": 4.809272061813745, + "grad_norm": 4.558152571302234, + "learning_rate": 1.452766836152177e-05, + "loss": 0.1248, + "step": 11826 + }, + { + "epoch": 4.809678731191541, + "grad_norm": 4.176107155702739, + "learning_rate": 1.4526764950348149e-05, + "loss": 0.0834, + "step": 11827 + }, + { + "epoch": 4.810085400569337, + "grad_norm": 10.927385854788248, + "learning_rate": 1.4525861492705007e-05, + "loss": 0.5735, + "step": 11828 + }, + { + "epoch": 4.8104920699471325, + "grad_norm": 10.390175513892407, + "learning_rate": 1.452495798860162e-05, + "loss": 0.6067, + "step": 11829 + }, + { + "epoch": 4.810898739324929, + "grad_norm": 6.334504543133913, + "learning_rate": 1.452405443804726e-05, + "loss": 0.094, + "step": 11830 + }, + { + "epoch": 4.811305408702725, + "grad_norm": 1.464387976671007, + "learning_rate": 1.4523150841051204e-05, + "loss": 0.0184, + "step": 11831 + }, + { + "epoch": 4.811712078080521, + "grad_norm": 7.501120823628987, + "learning_rate": 1.4522247197622729e-05, + "loss": 0.1832, + "step": 11832 + }, + { + "epoch": 4.8121187474583165, + "grad_norm": 6.7868004280552245, + "learning_rate": 1.4521343507771112e-05, + "loss": 0.1549, + "step": 11833 + }, + { + "epoch": 4.812525416836112, + "grad_norm": 6.3305519572033155, + "learning_rate": 1.4520439771505626e-05, + "loss": 0.2209, + "step": 11834 + }, + { + "epoch": 4.812932086213908, + "grad_norm": 11.568626734054273, + "learning_rate": 1.4519535988835554e-05, + "loss": 1.0889, + "step": 11835 + }, + { + "epoch": 4.813338755591704, + "grad_norm": 8.177257053395007, + "learning_rate": 1.4518632159770168e-05, + "loss": 0.3934, + "step": 11836 + }, + { + "epoch": 4.8137454249694995, + "grad_norm": 10.203611513276309, + "learning_rate": 1.4517728284318748e-05, + "loss": 0.5783, + "step": 11837 + }, + { + "epoch": 4.814152094347295, + "grad_norm": 9.870158580648129, + "learning_rate": 1.4516824362490574e-05, + "loss": 0.1696, + "step": 11838 + }, + { + "epoch": 4.814558763725092, + "grad_norm": 4.176192565243539, + "learning_rate": 1.4515920394294925e-05, + "loss": 0.0635, + "step": 11839 + }, + { + "epoch": 4.814965433102888, + "grad_norm": 2.822920506523068, + "learning_rate": 1.451501637974108e-05, + "loss": 0.0576, + "step": 11840 + }, + { + "epoch": 4.8153721024806835, + "grad_norm": 10.340671167195124, + "learning_rate": 1.4514112318838321e-05, + "loss": 0.5283, + "step": 11841 + }, + { + "epoch": 4.815778771858479, + "grad_norm": 7.629524052214798, + "learning_rate": 1.4513208211595925e-05, + "loss": 0.2067, + "step": 11842 + }, + { + "epoch": 4.816185441236275, + "grad_norm": 8.707532497103934, + "learning_rate": 1.4512304058023179e-05, + "loss": 0.2789, + "step": 11843 + }, + { + "epoch": 4.816592110614071, + "grad_norm": 2.394137851252146, + "learning_rate": 1.4511399858129355e-05, + "loss": 0.0454, + "step": 11844 + }, + { + "epoch": 4.8169987799918665, + "grad_norm": 7.969936281849422, + 
"learning_rate": 1.4510495611923746e-05, + "loss": 0.3216, + "step": 11845 + }, + { + "epoch": 4.817405449369662, + "grad_norm": 13.257808271617023, + "learning_rate": 1.4509591319415628e-05, + "loss": 0.3102, + "step": 11846 + }, + { + "epoch": 4.817812118747458, + "grad_norm": 0.05229533382565709, + "learning_rate": 1.4508686980614282e-05, + "loss": 0.0011, + "step": 11847 + }, + { + "epoch": 4.818218788125254, + "grad_norm": 23.683541732773747, + "learning_rate": 1.4507782595528998e-05, + "loss": 0.8876, + "step": 11848 + }, + { + "epoch": 4.81862545750305, + "grad_norm": 1.3886091160473248, + "learning_rate": 1.4506878164169058e-05, + "loss": 0.0338, + "step": 11849 + }, + { + "epoch": 4.819032126880846, + "grad_norm": 7.7197611337499294, + "learning_rate": 1.4505973686543744e-05, + "loss": 0.4719, + "step": 11850 + }, + { + "epoch": 4.819438796258642, + "grad_norm": 0.028667763356173475, + "learning_rate": 1.4505069162662343e-05, + "loss": 0.0005, + "step": 11851 + }, + { + "epoch": 4.819845465636438, + "grad_norm": 7.080643647200543, + "learning_rate": 1.450416459253414e-05, + "loss": 0.1461, + "step": 11852 + }, + { + "epoch": 4.8202521350142336, + "grad_norm": 6.750368532287898, + "learning_rate": 1.450325997616842e-05, + "loss": 0.1795, + "step": 11853 + }, + { + "epoch": 4.820658804392029, + "grad_norm": 10.22100491572976, + "learning_rate": 1.450235531357447e-05, + "loss": 0.4178, + "step": 11854 + }, + { + "epoch": 4.821065473769825, + "grad_norm": 0.8886000816622307, + "learning_rate": 1.4501450604761578e-05, + "loss": 0.0203, + "step": 11855 + }, + { + "epoch": 4.821472143147621, + "grad_norm": 5.5048620384564435, + "learning_rate": 1.4500545849739025e-05, + "loss": 0.1392, + "step": 11856 + }, + { + "epoch": 4.821878812525417, + "grad_norm": 7.008846562500362, + "learning_rate": 1.4499641048516109e-05, + "loss": 0.1648, + "step": 11857 + }, + { + "epoch": 4.822285481903212, + "grad_norm": 2.6647726449022175, + "learning_rate": 1.4498736201102112e-05, + "loss": 0.0517, + "step": 11858 + }, + { + "epoch": 4.822692151281009, + "grad_norm": 6.150479455939258, + "learning_rate": 1.4497831307506323e-05, + "loss": 0.3147, + "step": 11859 + }, + { + "epoch": 4.823098820658805, + "grad_norm": 1.0651651527073827, + "learning_rate": 1.449692636773803e-05, + "loss": 0.0202, + "step": 11860 + }, + { + "epoch": 4.823505490036601, + "grad_norm": 2.9012587222725155, + "learning_rate": 1.4496021381806529e-05, + "loss": 0.0638, + "step": 11861 + }, + { + "epoch": 4.823912159414396, + "grad_norm": 3.3316474520359187, + "learning_rate": 1.4495116349721103e-05, + "loss": 0.046, + "step": 11862 + }, + { + "epoch": 4.824318828792192, + "grad_norm": 6.555189811293721, + "learning_rate": 1.4494211271491044e-05, + "loss": 0.323, + "step": 11863 + }, + { + "epoch": 4.824725498169988, + "grad_norm": 3.033389296371147, + "learning_rate": 1.4493306147125647e-05, + "loss": 0.0608, + "step": 11864 + }, + { + "epoch": 4.825132167547784, + "grad_norm": 0.48318516979199644, + "learning_rate": 1.44924009766342e-05, + "loss": 0.0079, + "step": 11865 + }, + { + "epoch": 4.825538836925579, + "grad_norm": 25.68392742600187, + "learning_rate": 1.4491495760025994e-05, + "loss": 0.6102, + "step": 11866 + }, + { + "epoch": 4.825945506303375, + "grad_norm": 5.152916730329937, + "learning_rate": 1.4490590497310327e-05, + "loss": 0.1174, + "step": 11867 + }, + { + "epoch": 4.826352175681171, + "grad_norm": 11.186948711382165, + "learning_rate": 1.4489685188496488e-05, + "loss": 0.2314, + "step": 11868 + }, + { + "epoch": 
4.826758845058967, + "grad_norm": 8.11603052305727, + "learning_rate": 1.4488779833593767e-05, + "loss": 0.091, + "step": 11869 + }, + { + "epoch": 4.8271655144367625, + "grad_norm": 5.60482800303171, + "learning_rate": 1.4487874432611466e-05, + "loss": 0.1643, + "step": 11870 + }, + { + "epoch": 4.827572183814559, + "grad_norm": 7.248468842914556, + "learning_rate": 1.4486968985558873e-05, + "loss": 0.4148, + "step": 11871 + }, + { + "epoch": 4.827978853192355, + "grad_norm": 8.253214159436611, + "learning_rate": 1.4486063492445286e-05, + "loss": 0.2966, + "step": 11872 + }, + { + "epoch": 4.828385522570151, + "grad_norm": 15.640662762503908, + "learning_rate": 1.448515795328e-05, + "loss": 0.6258, + "step": 11873 + }, + { + "epoch": 4.828792191947946, + "grad_norm": 8.944814839273064, + "learning_rate": 1.4484252368072309e-05, + "loss": 0.5985, + "step": 11874 + }, + { + "epoch": 4.829198861325742, + "grad_norm": 2.7668389121948156, + "learning_rate": 1.4483346736831511e-05, + "loss": 0.063, + "step": 11875 + }, + { + "epoch": 4.829605530703538, + "grad_norm": 4.930203613999217, + "learning_rate": 1.4482441059566903e-05, + "loss": 0.1202, + "step": 11876 + }, + { + "epoch": 4.830012200081334, + "grad_norm": 0.23943444790387935, + "learning_rate": 1.4481535336287782e-05, + "loss": 0.0045, + "step": 11877 + }, + { + "epoch": 4.8304188694591295, + "grad_norm": 9.49079950353244, + "learning_rate": 1.4480629567003446e-05, + "loss": 0.3181, + "step": 11878 + }, + { + "epoch": 4.830825538836925, + "grad_norm": 7.79869183221345, + "learning_rate": 1.4479723751723186e-05, + "loss": 0.2227, + "step": 11879 + }, + { + "epoch": 4.831232208214722, + "grad_norm": 2.5963214141184823, + "learning_rate": 1.4478817890456314e-05, + "loss": 0.0595, + "step": 11880 + }, + { + "epoch": 4.831638877592518, + "grad_norm": 5.046934428318167, + "learning_rate": 1.447791198321212e-05, + "loss": 0.3532, + "step": 11881 + }, + { + "epoch": 4.832045546970313, + "grad_norm": 3.6169448797175763, + "learning_rate": 1.4477006029999907e-05, + "loss": 0.1256, + "step": 11882 + }, + { + "epoch": 4.832452216348109, + "grad_norm": 7.137758889960945, + "learning_rate": 1.4476100030828972e-05, + "loss": 0.4221, + "step": 11883 + }, + { + "epoch": 4.832858885725905, + "grad_norm": 11.671434854525755, + "learning_rate": 1.447519398570862e-05, + "loss": 0.2489, + "step": 11884 + }, + { + "epoch": 4.833265555103701, + "grad_norm": 4.824966246883401, + "learning_rate": 1.4474287894648145e-05, + "loss": 0.07, + "step": 11885 + }, + { + "epoch": 4.8336722244814965, + "grad_norm": 14.648176445860077, + "learning_rate": 1.4473381757656857e-05, + "loss": 0.6334, + "step": 11886 + }, + { + "epoch": 4.834078893859292, + "grad_norm": 8.907998875287598, + "learning_rate": 1.4472475574744055e-05, + "loss": 0.1773, + "step": 11887 + }, + { + "epoch": 4.834485563237088, + "grad_norm": 1.487016418458232, + "learning_rate": 1.4471569345919037e-05, + "loss": 0.0237, + "step": 11888 + }, + { + "epoch": 4.834892232614884, + "grad_norm": 8.82746977969756, + "learning_rate": 1.4470663071191107e-05, + "loss": 0.3545, + "step": 11889 + }, + { + "epoch": 4.8352989019926795, + "grad_norm": 2.67211412396228, + "learning_rate": 1.4469756750569575e-05, + "loss": 0.0565, + "step": 11890 + }, + { + "epoch": 4.835705571370476, + "grad_norm": 5.032275934058598, + "learning_rate": 1.446885038406374e-05, + "loss": 0.3724, + "step": 11891 + }, + { + "epoch": 4.836112240748272, + "grad_norm": 6.259117830556009, + "learning_rate": 1.4467943971682902e-05, + "loss": 
0.1639, + "step": 11892 + }, + { + "epoch": 4.836518910126068, + "grad_norm": 9.570625036741653, + "learning_rate": 1.4467037513436375e-05, + "loss": 0.2559, + "step": 11893 + }, + { + "epoch": 4.8369255795038635, + "grad_norm": 9.883524957239095, + "learning_rate": 1.446613100933346e-05, + "loss": 0.0974, + "step": 11894 + }, + { + "epoch": 4.837332248881659, + "grad_norm": 5.0922485755896245, + "learning_rate": 1.446522445938346e-05, + "loss": 0.1088, + "step": 11895 + }, + { + "epoch": 4.837738918259455, + "grad_norm": 16.75053760035918, + "learning_rate": 1.4464317863595685e-05, + "loss": 0.4682, + "step": 11896 + }, + { + "epoch": 4.838145587637251, + "grad_norm": 1.6609678028090837, + "learning_rate": 1.4463411221979442e-05, + "loss": 0.0294, + "step": 11897 + }, + { + "epoch": 4.8385522570150465, + "grad_norm": 0.7441732870907571, + "learning_rate": 1.4462504534544031e-05, + "loss": 0.0154, + "step": 11898 + }, + { + "epoch": 4.838958926392842, + "grad_norm": 5.466867186616574, + "learning_rate": 1.446159780129877e-05, + "loss": 0.1998, + "step": 11899 + }, + { + "epoch": 4.839365595770639, + "grad_norm": 11.137657258781102, + "learning_rate": 1.4460691022252962e-05, + "loss": 0.3977, + "step": 11900 + }, + { + "epoch": 4.839772265148435, + "grad_norm": 3.908467184613275, + "learning_rate": 1.4459784197415914e-05, + "loss": 0.0725, + "step": 11901 + }, + { + "epoch": 4.8401789345262305, + "grad_norm": 7.058037899199272, + "learning_rate": 1.4458877326796934e-05, + "loss": 0.2011, + "step": 11902 + }, + { + "epoch": 4.840585603904026, + "grad_norm": 6.504542104465788, + "learning_rate": 1.445797041040534e-05, + "loss": 0.1734, + "step": 11903 + }, + { + "epoch": 4.840992273281822, + "grad_norm": 3.7280613866527634, + "learning_rate": 1.4457063448250432e-05, + "loss": 0.0613, + "step": 11904 + }, + { + "epoch": 4.841398942659618, + "grad_norm": 0.8913711976573924, + "learning_rate": 1.4456156440341526e-05, + "loss": 0.0142, + "step": 11905 + }, + { + "epoch": 4.8418056120374136, + "grad_norm": 3.4089579638166376, + "learning_rate": 1.4455249386687933e-05, + "loss": 0.1104, + "step": 11906 + }, + { + "epoch": 4.842212281415209, + "grad_norm": 4.668850176508591, + "learning_rate": 1.445434228729896e-05, + "loss": 0.1056, + "step": 11907 + }, + { + "epoch": 4.842618950793005, + "grad_norm": 7.136877856586258, + "learning_rate": 1.4453435142183923e-05, + "loss": 0.1834, + "step": 11908 + }, + { + "epoch": 4.843025620170801, + "grad_norm": 6.292786415950042, + "learning_rate": 1.445252795135213e-05, + "loss": 0.1794, + "step": 11909 + }, + { + "epoch": 4.843432289548597, + "grad_norm": 13.536786299694546, + "learning_rate": 1.4451620714812902e-05, + "loss": 0.6772, + "step": 11910 + }, + { + "epoch": 4.843838958926392, + "grad_norm": 0.16656306362002224, + "learning_rate": 1.4450713432575541e-05, + "loss": 0.0019, + "step": 11911 + }, + { + "epoch": 4.844245628304189, + "grad_norm": 3.688103873193013, + "learning_rate": 1.444980610464937e-05, + "loss": 0.0698, + "step": 11912 + }, + { + "epoch": 4.844652297681985, + "grad_norm": 0.04655251039771781, + "learning_rate": 1.4448898731043699e-05, + "loss": 0.0007, + "step": 11913 + }, + { + "epoch": 4.845058967059781, + "grad_norm": 8.058534727952894, + "learning_rate": 1.4447991311767844e-05, + "loss": 0.1515, + "step": 11914 + }, + { + "epoch": 4.845465636437576, + "grad_norm": 31.50834964414462, + "learning_rate": 1.4447083846831117e-05, + "loss": 1.8017, + "step": 11915 + }, + { + "epoch": 4.845872305815372, + "grad_norm": 
0.11932814881371562, + "learning_rate": 1.444617633624284e-05, + "loss": 0.0014, + "step": 11916 + }, + { + "epoch": 4.846278975193168, + "grad_norm": 42.24547744985719, + "learning_rate": 1.4445268780012322e-05, + "loss": 0.7193, + "step": 11917 + }, + { + "epoch": 4.846685644570964, + "grad_norm": 9.270529316007533, + "learning_rate": 1.4444361178148886e-05, + "loss": 0.2345, + "step": 11918 + }, + { + "epoch": 4.847092313948759, + "grad_norm": 1.7107066719147066, + "learning_rate": 1.4443453530661843e-05, + "loss": 0.0584, + "step": 11919 + }, + { + "epoch": 4.847498983326555, + "grad_norm": 11.50429624275319, + "learning_rate": 1.4442545837560516e-05, + "loss": 0.57, + "step": 11920 + }, + { + "epoch": 4.847905652704352, + "grad_norm": 23.23502774081679, + "learning_rate": 1.4441638098854216e-05, + "loss": 1.3951, + "step": 11921 + }, + { + "epoch": 4.848312322082148, + "grad_norm": 11.579866219330311, + "learning_rate": 1.4440730314552269e-05, + "loss": 0.4648, + "step": 11922 + }, + { + "epoch": 4.848718991459943, + "grad_norm": 1.3035947097468248, + "learning_rate": 1.4439822484663991e-05, + "loss": 0.0173, + "step": 11923 + }, + { + "epoch": 4.849125660837739, + "grad_norm": 8.51915312280551, + "learning_rate": 1.4438914609198696e-05, + "loss": 0.3043, + "step": 11924 + }, + { + "epoch": 4.849532330215535, + "grad_norm": 3.5519168900071545, + "learning_rate": 1.443800668816571e-05, + "loss": 0.0949, + "step": 11925 + }, + { + "epoch": 4.849938999593331, + "grad_norm": 10.382805546570939, + "learning_rate": 1.4437098721574356e-05, + "loss": 0.4665, + "step": 11926 + }, + { + "epoch": 4.850345668971126, + "grad_norm": 1.2891389670707432, + "learning_rate": 1.4436190709433946e-05, + "loss": 0.0208, + "step": 11927 + }, + { + "epoch": 4.850752338348922, + "grad_norm": 0.43377498079294785, + "learning_rate": 1.4435282651753809e-05, + "loss": 0.0065, + "step": 11928 + }, + { + "epoch": 4.851159007726718, + "grad_norm": 2.1052261228195417, + "learning_rate": 1.4434374548543259e-05, + "loss": 0.0423, + "step": 11929 + }, + { + "epoch": 4.851565677104514, + "grad_norm": 5.045623379403715, + "learning_rate": 1.4433466399811625e-05, + "loss": 0.0956, + "step": 11930 + }, + { + "epoch": 4.8519723464823095, + "grad_norm": 0.15065854589252548, + "learning_rate": 1.4432558205568228e-05, + "loss": 0.0028, + "step": 11931 + }, + { + "epoch": 4.852379015860106, + "grad_norm": 6.148243692299773, + "learning_rate": 1.443164996582239e-05, + "loss": 0.1302, + "step": 11932 + }, + { + "epoch": 4.852785685237902, + "grad_norm": 2.4142091607303855, + "learning_rate": 1.4430741680583434e-05, + "loss": 0.0678, + "step": 11933 + }, + { + "epoch": 4.853192354615698, + "grad_norm": 4.698507417783633, + "learning_rate": 1.4429833349860683e-05, + "loss": 0.1073, + "step": 11934 + }, + { + "epoch": 4.853599023993493, + "grad_norm": 6.5072335638144905, + "learning_rate": 1.4428924973663466e-05, + "loss": 0.202, + "step": 11935 + }, + { + "epoch": 4.854005693371289, + "grad_norm": 5.775855150527855, + "learning_rate": 1.4428016552001106e-05, + "loss": 0.1331, + "step": 11936 + }, + { + "epoch": 4.854412362749085, + "grad_norm": 7.7917853096787075, + "learning_rate": 1.4427108084882921e-05, + "loss": 0.1904, + "step": 11937 + }, + { + "epoch": 4.854819032126881, + "grad_norm": 5.294578748458352, + "learning_rate": 1.442619957231825e-05, + "loss": 0.1475, + "step": 11938 + }, + { + "epoch": 4.8552257015046765, + "grad_norm": 3.9876220157459663, + "learning_rate": 1.442529101431641e-05, + "loss": 0.1638, + "step": 
11939 + }, + { + "epoch": 4.855632370882472, + "grad_norm": 10.740628432450068, + "learning_rate": 1.442438241088673e-05, + "loss": 0.1557, + "step": 11940 + }, + { + "epoch": 4.856039040260269, + "grad_norm": 8.963719662562916, + "learning_rate": 1.442347376203854e-05, + "loss": 0.2646, + "step": 11941 + }, + { + "epoch": 4.856445709638065, + "grad_norm": 2.2434834695492265, + "learning_rate": 1.4422565067781164e-05, + "loss": 0.0377, + "step": 11942 + }, + { + "epoch": 4.85685237901586, + "grad_norm": 3.2496178074463766, + "learning_rate": 1.4421656328123932e-05, + "loss": 0.0525, + "step": 11943 + }, + { + "epoch": 4.857259048393656, + "grad_norm": 5.6559044710967195, + "learning_rate": 1.4420747543076173e-05, + "loss": 0.1641, + "step": 11944 + }, + { + "epoch": 4.857665717771452, + "grad_norm": 3.0862190261518614, + "learning_rate": 1.4419838712647214e-05, + "loss": 0.0305, + "step": 11945 + }, + { + "epoch": 4.858072387149248, + "grad_norm": 0.2945105670920551, + "learning_rate": 1.441892983684639e-05, + "loss": 0.007, + "step": 11946 + }, + { + "epoch": 4.8584790565270435, + "grad_norm": 6.158303827481099, + "learning_rate": 1.4418020915683022e-05, + "loss": 0.1749, + "step": 11947 + }, + { + "epoch": 4.858885725904839, + "grad_norm": 1.663297032327272, + "learning_rate": 1.4417111949166448e-05, + "loss": 0.0243, + "step": 11948 + }, + { + "epoch": 4.859292395282635, + "grad_norm": 6.624046416522586, + "learning_rate": 1.4416202937305998e-05, + "loss": 0.1739, + "step": 11949 + }, + { + "epoch": 4.859699064660431, + "grad_norm": 7.436737131272036, + "learning_rate": 1.4415293880111002e-05, + "loss": 0.3449, + "step": 11950 + }, + { + "epoch": 4.8601057340382265, + "grad_norm": 1.9036937328531112, + "learning_rate": 1.441438477759079e-05, + "loss": 0.0347, + "step": 11951 + }, + { + "epoch": 4.860512403416022, + "grad_norm": 3.561328369902709, + "learning_rate": 1.4413475629754698e-05, + "loss": 0.0879, + "step": 11952 + }, + { + "epoch": 4.860919072793819, + "grad_norm": 5.4503130222535425, + "learning_rate": 1.4412566436612059e-05, + "loss": 0.1176, + "step": 11953 + }, + { + "epoch": 4.861325742171615, + "grad_norm": 4.512110353131672, + "learning_rate": 1.4411657198172205e-05, + "loss": 0.1981, + "step": 11954 + }, + { + "epoch": 4.8617324115494105, + "grad_norm": 5.404144275132101, + "learning_rate": 1.4410747914444467e-05, + "loss": 0.0598, + "step": 11955 + }, + { + "epoch": 4.862139080927206, + "grad_norm": 8.655024687111645, + "learning_rate": 1.4409838585438181e-05, + "loss": 0.427, + "step": 11956 + }, + { + "epoch": 4.862545750305002, + "grad_norm": 0.07287282428564937, + "learning_rate": 1.4408929211162686e-05, + "loss": 0.0011, + "step": 11957 + }, + { + "epoch": 4.862952419682798, + "grad_norm": 0.9076362401280265, + "learning_rate": 1.440801979162731e-05, + "loss": 0.0158, + "step": 11958 + }, + { + "epoch": 4.8633590890605936, + "grad_norm": 0.9278199108119272, + "learning_rate": 1.4407110326841393e-05, + "loss": 0.0284, + "step": 11959 + }, + { + "epoch": 4.863765758438389, + "grad_norm": 6.02652439752461, + "learning_rate": 1.440620081681427e-05, + "loss": 0.1122, + "step": 11960 + }, + { + "epoch": 4.864172427816185, + "grad_norm": 4.012878552345051, + "learning_rate": 1.4405291261555281e-05, + "loss": 0.1737, + "step": 11961 + }, + { + "epoch": 4.864579097193982, + "grad_norm": 2.2075000141692662, + "learning_rate": 1.4404381661073759e-05, + "loss": 0.0543, + "step": 11962 + }, + { + "epoch": 4.8649857665717775, + "grad_norm": 0.6887084523388113, + 
"learning_rate": 1.440347201537904e-05, + "loss": 0.01, + "step": 11963 + }, + { + "epoch": 4.865392435949573, + "grad_norm": 0.3785617292507804, + "learning_rate": 1.440256232448047e-05, + "loss": 0.0037, + "step": 11964 + }, + { + "epoch": 4.865799105327369, + "grad_norm": 16.19711049920531, + "learning_rate": 1.4401652588387378e-05, + "loss": 1.0075, + "step": 11965 + }, + { + "epoch": 4.866205774705165, + "grad_norm": 3.467904318974522, + "learning_rate": 1.4400742807109105e-05, + "loss": 0.1021, + "step": 11966 + }, + { + "epoch": 4.866612444082961, + "grad_norm": 4.928408933018187, + "learning_rate": 1.4399832980654995e-05, + "loss": 0.1008, + "step": 11967 + }, + { + "epoch": 4.867019113460756, + "grad_norm": 2.9376218145462705, + "learning_rate": 1.4398923109034385e-05, + "loss": 0.0476, + "step": 11968 + }, + { + "epoch": 4.867425782838552, + "grad_norm": 0.07476603858312013, + "learning_rate": 1.4398013192256615e-05, + "loss": 0.0009, + "step": 11969 + }, + { + "epoch": 4.867832452216348, + "grad_norm": 4.8029945560313125, + "learning_rate": 1.4397103230331026e-05, + "loss": 0.1017, + "step": 11970 + }, + { + "epoch": 4.868239121594144, + "grad_norm": 3.736448864174967, + "learning_rate": 1.439619322326696e-05, + "loss": 0.0676, + "step": 11971 + }, + { + "epoch": 4.868645790971939, + "grad_norm": 10.317967295034586, + "learning_rate": 1.4395283171073758e-05, + "loss": 0.5142, + "step": 11972 + }, + { + "epoch": 4.869052460349736, + "grad_norm": 0.03783537708745715, + "learning_rate": 1.439437307376076e-05, + "loss": 0.0008, + "step": 11973 + }, + { + "epoch": 4.869459129727532, + "grad_norm": 3.4836020705286224, + "learning_rate": 1.4393462931337315e-05, + "loss": 0.0597, + "step": 11974 + }, + { + "epoch": 4.869865799105328, + "grad_norm": 6.501336910752482, + "learning_rate": 1.4392552743812755e-05, + "loss": 0.1742, + "step": 11975 + }, + { + "epoch": 4.870272468483123, + "grad_norm": 0.03695229382619553, + "learning_rate": 1.4391642511196438e-05, + "loss": 0.0006, + "step": 11976 + }, + { + "epoch": 4.870679137860919, + "grad_norm": 2.147848728909741, + "learning_rate": 1.4390732233497695e-05, + "loss": 0.0361, + "step": 11977 + }, + { + "epoch": 4.871085807238715, + "grad_norm": 0.11206903827947173, + "learning_rate": 1.438982191072588e-05, + "loss": 0.0016, + "step": 11978 + }, + { + "epoch": 4.871492476616511, + "grad_norm": 6.556712961025145, + "learning_rate": 1.438891154289033e-05, + "loss": 0.2083, + "step": 11979 + }, + { + "epoch": 4.871899145994306, + "grad_norm": 0.08014119485780738, + "learning_rate": 1.4388001130000395e-05, + "loss": 0.0013, + "step": 11980 + }, + { + "epoch": 4.872305815372102, + "grad_norm": 2.9128076653498725, + "learning_rate": 1.438709067206542e-05, + "loss": 0.041, + "step": 11981 + }, + { + "epoch": 4.872712484749899, + "grad_norm": 7.443363682673821, + "learning_rate": 1.4386180169094752e-05, + "loss": 0.2297, + "step": 11982 + }, + { + "epoch": 4.873119154127695, + "grad_norm": 0.18634656705134536, + "learning_rate": 1.4385269621097737e-05, + "loss": 0.0035, + "step": 11983 + }, + { + "epoch": 4.87352582350549, + "grad_norm": 5.3140687764092025, + "learning_rate": 1.4384359028083723e-05, + "loss": 0.1074, + "step": 11984 + }, + { + "epoch": 4.873932492883286, + "grad_norm": 8.777532179507142, + "learning_rate": 1.4383448390062053e-05, + "loss": 0.2147, + "step": 11985 + }, + { + "epoch": 4.874339162261082, + "grad_norm": 9.229448525742546, + "learning_rate": 1.4382537707042085e-05, + "loss": 0.6146, + "step": 11986 + }, + { + "epoch": 
4.874745831638878, + "grad_norm": 5.570885457590586, + "learning_rate": 1.4381626979033158e-05, + "loss": 0.1751, + "step": 11987 + }, + { + "epoch": 4.875152501016673, + "grad_norm": 11.552341964728893, + "learning_rate": 1.4380716206044625e-05, + "loss": 0.8082, + "step": 11988 + }, + { + "epoch": 4.875559170394469, + "grad_norm": 26.123175142062433, + "learning_rate": 1.4379805388085837e-05, + "loss": 0.3193, + "step": 11989 + }, + { + "epoch": 4.875965839772265, + "grad_norm": 5.621565755461923, + "learning_rate": 1.4378894525166142e-05, + "loss": 0.2505, + "step": 11990 + }, + { + "epoch": 4.876372509150061, + "grad_norm": 7.485811872071114, + "learning_rate": 1.437798361729489e-05, + "loss": 0.1915, + "step": 11991 + }, + { + "epoch": 4.8767791785278565, + "grad_norm": 14.677055616287184, + "learning_rate": 1.4377072664481433e-05, + "loss": 0.6509, + "step": 11992 + }, + { + "epoch": 4.877185847905652, + "grad_norm": 0.9242474034696251, + "learning_rate": 1.4376161666735121e-05, + "loss": 0.0159, + "step": 11993 + }, + { + "epoch": 4.877592517283449, + "grad_norm": 1.3117549845449334, + "learning_rate": 1.4375250624065311e-05, + "loss": 0.0244, + "step": 11994 + }, + { + "epoch": 4.877999186661245, + "grad_norm": 4.443873376054611, + "learning_rate": 1.4374339536481347e-05, + "loss": 0.1654, + "step": 11995 + }, + { + "epoch": 4.87840585603904, + "grad_norm": 7.361137800117059, + "learning_rate": 1.4373428403992588e-05, + "loss": 0.1683, + "step": 11996 + }, + { + "epoch": 4.878812525416836, + "grad_norm": 7.056887296350537, + "learning_rate": 1.4372517226608386e-05, + "loss": 0.4516, + "step": 11997 + }, + { + "epoch": 4.879219194794632, + "grad_norm": 0.40719675704107855, + "learning_rate": 1.4371606004338092e-05, + "loss": 0.0068, + "step": 11998 + }, + { + "epoch": 4.879625864172428, + "grad_norm": 1.749803728036272, + "learning_rate": 1.4370694737191065e-05, + "loss": 0.0247, + "step": 11999 + }, + { + "epoch": 4.8800325335502235, + "grad_norm": 8.653853145717786, + "learning_rate": 1.4369783425176656e-05, + "loss": 0.4809, + "step": 12000 + }, + { + "epoch": 4.880439202928019, + "grad_norm": 12.696421401378197, + "learning_rate": 1.4368872068304217e-05, + "loss": 0.5859, + "step": 12001 + }, + { + "epoch": 4.880845872305815, + "grad_norm": 5.993948700953081, + "learning_rate": 1.4367960666583114e-05, + "loss": 0.1632, + "step": 12002 + }, + { + "epoch": 4.881252541683612, + "grad_norm": 15.30903465996846, + "learning_rate": 1.4367049220022694e-05, + "loss": 0.983, + "step": 12003 + }, + { + "epoch": 4.881659211061407, + "grad_norm": 12.421259387974292, + "learning_rate": 1.4366137728632313e-05, + "loss": 0.2403, + "step": 12004 + }, + { + "epoch": 4.882065880439203, + "grad_norm": 5.141938757220815, + "learning_rate": 1.4365226192421334e-05, + "loss": 0.0868, + "step": 12005 + }, + { + "epoch": 4.882472549816999, + "grad_norm": 6.836893522577445, + "learning_rate": 1.4364314611399109e-05, + "loss": 0.5231, + "step": 12006 + }, + { + "epoch": 4.882879219194795, + "grad_norm": 2.2209070236402018, + "learning_rate": 1.4363402985574997e-05, + "loss": 0.0412, + "step": 12007 + }, + { + "epoch": 4.8832858885725905, + "grad_norm": 3.197840667137531, + "learning_rate": 1.436249131495836e-05, + "loss": 0.0614, + "step": 12008 + }, + { + "epoch": 4.883692557950386, + "grad_norm": 0.5004694753114253, + "learning_rate": 1.4361579599558552e-05, + "loss": 0.0111, + "step": 12009 + }, + { + "epoch": 4.884099227328182, + "grad_norm": 7.140279589381042, + "learning_rate": 
1.4360667839384937e-05, + "loss": 0.1814, + "step": 12010 + }, + { + "epoch": 4.884505896705978, + "grad_norm": 12.610622912086903, + "learning_rate": 1.435975603444687e-05, + "loss": 0.8082, + "step": 12011 + }, + { + "epoch": 4.8849125660837736, + "grad_norm": 4.290492973676199, + "learning_rate": 1.4358844184753713e-05, + "loss": 0.0839, + "step": 12012 + }, + { + "epoch": 4.885319235461569, + "grad_norm": 5.322795307154837, + "learning_rate": 1.4357932290314826e-05, + "loss": 0.0951, + "step": 12013 + }, + { + "epoch": 4.885725904839366, + "grad_norm": 4.356358741905878, + "learning_rate": 1.4357020351139573e-05, + "loss": 0.0905, + "step": 12014 + }, + { + "epoch": 4.886132574217162, + "grad_norm": 5.231223368890262, + "learning_rate": 1.435610836723731e-05, + "loss": 0.0837, + "step": 12015 + }, + { + "epoch": 4.8865392435949575, + "grad_norm": 0.6661577908185922, + "learning_rate": 1.4355196338617408e-05, + "loss": 0.0113, + "step": 12016 + }, + { + "epoch": 4.886945912972753, + "grad_norm": 6.053406520281094, + "learning_rate": 1.4354284265289218e-05, + "loss": 0.2422, + "step": 12017 + }, + { + "epoch": 4.887352582350549, + "grad_norm": 0.2841346371619756, + "learning_rate": 1.435337214726211e-05, + "loss": 0.0073, + "step": 12018 + }, + { + "epoch": 4.887759251728345, + "grad_norm": 8.824795154633252, + "learning_rate": 1.4352459984545446e-05, + "loss": 0.2586, + "step": 12019 + }, + { + "epoch": 4.888165921106141, + "grad_norm": 0.40292535519053513, + "learning_rate": 1.4351547777148589e-05, + "loss": 0.007, + "step": 12020 + }, + { + "epoch": 4.888572590483936, + "grad_norm": 4.11353208564079, + "learning_rate": 1.4350635525080906e-05, + "loss": 0.0533, + "step": 12021 + }, + { + "epoch": 4.888979259861732, + "grad_norm": 2.631944393767909, + "learning_rate": 1.434972322835176e-05, + "loss": 0.0687, + "step": 12022 + }, + { + "epoch": 4.889385929239529, + "grad_norm": 3.819801781005867, + "learning_rate": 1.4348810886970515e-05, + "loss": 0.2119, + "step": 12023 + }, + { + "epoch": 4.8897925986173245, + "grad_norm": 1.1468747576640514, + "learning_rate": 1.4347898500946535e-05, + "loss": 0.0142, + "step": 12024 + }, + { + "epoch": 4.89019926799512, + "grad_norm": 7.9723997358536565, + "learning_rate": 1.4346986070289193e-05, + "loss": 0.5801, + "step": 12025 + }, + { + "epoch": 4.890605937372916, + "grad_norm": 6.199673999588872, + "learning_rate": 1.4346073595007848e-05, + "loss": 0.1933, + "step": 12026 + }, + { + "epoch": 4.891012606750712, + "grad_norm": 2.65164308187171, + "learning_rate": 1.4345161075111871e-05, + "loss": 0.0717, + "step": 12027 + }, + { + "epoch": 4.891419276128508, + "grad_norm": 6.126873358075621, + "learning_rate": 1.434424851061063e-05, + "loss": 0.2836, + "step": 12028 + }, + { + "epoch": 4.891825945506303, + "grad_norm": 9.693297804275275, + "learning_rate": 1.434333590151349e-05, + "loss": 0.5447, + "step": 12029 + }, + { + "epoch": 4.892232614884099, + "grad_norm": 9.405611746562307, + "learning_rate": 1.4342423247829819e-05, + "loss": 0.4482, + "step": 12030 + }, + { + "epoch": 4.892639284261895, + "grad_norm": 10.133642268195201, + "learning_rate": 1.4341510549568992e-05, + "loss": 0.4878, + "step": 12031 + }, + { + "epoch": 4.893045953639691, + "grad_norm": 5.711713813165508, + "learning_rate": 1.4340597806740372e-05, + "loss": 0.1769, + "step": 12032 + }, + { + "epoch": 4.893452623017486, + "grad_norm": 0.4685382947334016, + "learning_rate": 1.433968501935333e-05, + "loss": 0.0071, + "step": 12033 + }, + { + "epoch": 4.893859292395282, + 
"grad_norm": 7.764910231379648, + "learning_rate": 1.433877218741724e-05, + "loss": 0.2493, + "step": 12034 + }, + { + "epoch": 4.894265961773079, + "grad_norm": 10.532709006135072, + "learning_rate": 1.4337859310941472e-05, + "loss": 0.4603, + "step": 12035 + }, + { + "epoch": 4.894672631150875, + "grad_norm": 2.0912230433447845, + "learning_rate": 1.433694638993539e-05, + "loss": 0.0488, + "step": 12036 + }, + { + "epoch": 4.89507930052867, + "grad_norm": 10.261769051659055, + "learning_rate": 1.433603342440837e-05, + "loss": 0.2972, + "step": 12037 + }, + { + "epoch": 4.895485969906466, + "grad_norm": 8.661259867353126, + "learning_rate": 1.4335120414369787e-05, + "loss": 0.0907, + "step": 12038 + }, + { + "epoch": 4.895892639284262, + "grad_norm": 2.4704304851912933, + "learning_rate": 1.4334207359829013e-05, + "loss": 0.0473, + "step": 12039 + }, + { + "epoch": 4.896299308662058, + "grad_norm": 5.841838371044343, + "learning_rate": 1.4333294260795414e-05, + "loss": 0.1453, + "step": 12040 + }, + { + "epoch": 4.896705978039853, + "grad_norm": 4.502411405140124, + "learning_rate": 1.4332381117278373e-05, + "loss": 0.0831, + "step": 12041 + }, + { + "epoch": 4.897112647417649, + "grad_norm": 9.619512978768157, + "learning_rate": 1.4331467929287258e-05, + "loss": 0.2079, + "step": 12042 + }, + { + "epoch": 4.897519316795445, + "grad_norm": 4.479469351988885, + "learning_rate": 1.4330554696831443e-05, + "loss": 0.2114, + "step": 12043 + }, + { + "epoch": 4.897925986173242, + "grad_norm": 0.18646877663816702, + "learning_rate": 1.4329641419920307e-05, + "loss": 0.0037, + "step": 12044 + }, + { + "epoch": 4.898332655551037, + "grad_norm": 8.9559380820918, + "learning_rate": 1.4328728098563222e-05, + "loss": 0.3943, + "step": 12045 + }, + { + "epoch": 4.898739324928833, + "grad_norm": 23.097094914637516, + "learning_rate": 1.4327814732769563e-05, + "loss": 1.1522, + "step": 12046 + }, + { + "epoch": 4.899145994306629, + "grad_norm": 11.771274590731387, + "learning_rate": 1.4326901322548709e-05, + "loss": 0.2101, + "step": 12047 + }, + { + "epoch": 4.899552663684425, + "grad_norm": 0.5422997066091491, + "learning_rate": 1.4325987867910037e-05, + "loss": 0.0093, + "step": 12048 + }, + { + "epoch": 4.89995933306222, + "grad_norm": 12.68960795294127, + "learning_rate": 1.432507436886292e-05, + "loss": 0.5231, + "step": 12049 + }, + { + "epoch": 4.900366002440016, + "grad_norm": 5.232250749980909, + "learning_rate": 1.4324160825416737e-05, + "loss": 0.1325, + "step": 12050 + }, + { + "epoch": 4.900772671817812, + "grad_norm": 12.068919131028801, + "learning_rate": 1.4323247237580869e-05, + "loss": 0.2728, + "step": 12051 + }, + { + "epoch": 4.901179341195608, + "grad_norm": 5.302223707698848, + "learning_rate": 1.4322333605364692e-05, + "loss": 0.2805, + "step": 12052 + }, + { + "epoch": 4.9015860105734035, + "grad_norm": 0.08939623265357263, + "learning_rate": 1.4321419928777585e-05, + "loss": 0.0019, + "step": 12053 + }, + { + "epoch": 4.901992679951199, + "grad_norm": 1.0660759009080245, + "learning_rate": 1.4320506207828928e-05, + "loss": 0.0157, + "step": 12054 + }, + { + "epoch": 4.902399349328996, + "grad_norm": 14.019612016336264, + "learning_rate": 1.4319592442528097e-05, + "loss": 0.5453, + "step": 12055 + }, + { + "epoch": 4.902806018706792, + "grad_norm": 5.4870939291192125, + "learning_rate": 1.4318678632884479e-05, + "loss": 0.2524, + "step": 12056 + }, + { + "epoch": 4.903212688084587, + "grad_norm": 11.231397568813748, + "learning_rate": 1.431776477890745e-05, + "loss": 0.772, + 
"step": 12057 + }, + { + "epoch": 4.903619357462383, + "grad_norm": 5.504941580568505, + "learning_rate": 1.4316850880606394e-05, + "loss": 0.1701, + "step": 12058 + }, + { + "epoch": 4.904026026840179, + "grad_norm": 12.156097648960365, + "learning_rate": 1.4315936937990689e-05, + "loss": 0.4676, + "step": 12059 + }, + { + "epoch": 4.904432696217975, + "grad_norm": 0.2434859246682476, + "learning_rate": 1.431502295106972e-05, + "loss": 0.0033, + "step": 12060 + }, + { + "epoch": 4.9048393655957705, + "grad_norm": 3.6646967750468074, + "learning_rate": 1.4314108919852869e-05, + "loss": 0.0732, + "step": 12061 + }, + { + "epoch": 4.905246034973566, + "grad_norm": 3.9335704356954446, + "learning_rate": 1.4313194844349515e-05, + "loss": 0.0339, + "step": 12062 + }, + { + "epoch": 4.905652704351362, + "grad_norm": 6.490578818501304, + "learning_rate": 1.431228072456905e-05, + "loss": 0.141, + "step": 12063 + }, + { + "epoch": 4.906059373729159, + "grad_norm": 13.802879082949623, + "learning_rate": 1.4311366560520851e-05, + "loss": 0.7521, + "step": 12064 + }, + { + "epoch": 4.9064660431069544, + "grad_norm": 3.698995126376916, + "learning_rate": 1.4310452352214306e-05, + "loss": 0.0629, + "step": 12065 + }, + { + "epoch": 4.90687271248475, + "grad_norm": 12.770240418405331, + "learning_rate": 1.4309538099658798e-05, + "loss": 0.5272, + "step": 12066 + }, + { + "epoch": 4.907279381862546, + "grad_norm": 5.726435289427182, + "learning_rate": 1.430862380286371e-05, + "loss": 0.1702, + "step": 12067 + }, + { + "epoch": 4.907686051240342, + "grad_norm": 2.7841215229406404, + "learning_rate": 1.4307709461838433e-05, + "loss": 0.0559, + "step": 12068 + }, + { + "epoch": 4.9080927206181375, + "grad_norm": 12.973086984312864, + "learning_rate": 1.4306795076592347e-05, + "loss": 0.4324, + "step": 12069 + }, + { + "epoch": 4.908499389995933, + "grad_norm": 0.15109574438157083, + "learning_rate": 1.4305880647134847e-05, + "loss": 0.0035, + "step": 12070 + }, + { + "epoch": 4.908906059373729, + "grad_norm": 0.8164846737183381, + "learning_rate": 1.4304966173475312e-05, + "loss": 0.0109, + "step": 12071 + }, + { + "epoch": 4.909312728751525, + "grad_norm": 0.0852130816093667, + "learning_rate": 1.430405165562313e-05, + "loss": 0.0015, + "step": 12072 + }, + { + "epoch": 4.909719398129321, + "grad_norm": 7.076498998178776, + "learning_rate": 1.4303137093587696e-05, + "loss": 0.2953, + "step": 12073 + }, + { + "epoch": 4.910126067507116, + "grad_norm": 10.013702991529138, + "learning_rate": 1.4302222487378392e-05, + "loss": 0.3134, + "step": 12074 + }, + { + "epoch": 4.910532736884912, + "grad_norm": 3.184024579660041, + "learning_rate": 1.4301307837004608e-05, + "loss": 0.0777, + "step": 12075 + }, + { + "epoch": 4.910939406262709, + "grad_norm": 2.659364324940785, + "learning_rate": 1.4300393142475735e-05, + "loss": 0.0345, + "step": 12076 + }, + { + "epoch": 4.9113460756405045, + "grad_norm": 7.712096825006763, + "learning_rate": 1.4299478403801164e-05, + "loss": 0.254, + "step": 12077 + }, + { + "epoch": 4.9117527450183, + "grad_norm": 3.605305951380678, + "learning_rate": 1.429856362099028e-05, + "loss": 0.0609, + "step": 12078 + }, + { + "epoch": 4.912159414396096, + "grad_norm": 3.5040998904718736, + "learning_rate": 1.429764879405248e-05, + "loss": 0.0407, + "step": 12079 + }, + { + "epoch": 4.912566083773892, + "grad_norm": 8.952876867834508, + "learning_rate": 1.429673392299715e-05, + "loss": 0.2891, + "step": 12080 + }, + { + "epoch": 4.912972753151688, + "grad_norm": 10.6165277222877, + 
"learning_rate": 1.4295819007833688e-05, + "loss": 0.2898, + "step": 12081 + }, + { + "epoch": 4.913379422529483, + "grad_norm": 5.313253888065264, + "learning_rate": 1.4294904048571475e-05, + "loss": 0.1533, + "step": 12082 + }, + { + "epoch": 4.913786091907279, + "grad_norm": 8.326819786367455, + "learning_rate": 1.4293989045219917e-05, + "loss": 0.2327, + "step": 12083 + }, + { + "epoch": 4.914192761285075, + "grad_norm": 0.36434147794029603, + "learning_rate": 1.4293073997788396e-05, + "loss": 0.0078, + "step": 12084 + }, + { + "epoch": 4.9145994306628715, + "grad_norm": 4.006629593668693, + "learning_rate": 1.4292158906286309e-05, + "loss": 0.1163, + "step": 12085 + }, + { + "epoch": 4.915006100040667, + "grad_norm": 6.750500620473721, + "learning_rate": 1.4291243770723053e-05, + "loss": 0.1713, + "step": 12086 + }, + { + "epoch": 4.915412769418463, + "grad_norm": 3.9843178831037456, + "learning_rate": 1.429032859110802e-05, + "loss": 0.1583, + "step": 12087 + }, + { + "epoch": 4.915819438796259, + "grad_norm": 9.146877606097158, + "learning_rate": 1.4289413367450604e-05, + "loss": 0.4418, + "step": 12088 + }, + { + "epoch": 4.916226108174055, + "grad_norm": 7.903143502391366, + "learning_rate": 1.4288498099760202e-05, + "loss": 0.3638, + "step": 12089 + }, + { + "epoch": 4.91663277755185, + "grad_norm": 5.2486557998103445, + "learning_rate": 1.4287582788046207e-05, + "loss": 0.1004, + "step": 12090 + }, + { + "epoch": 4.917039446929646, + "grad_norm": 0.22196843577691258, + "learning_rate": 1.4286667432318018e-05, + "loss": 0.0056, + "step": 12091 + }, + { + "epoch": 4.917446116307442, + "grad_norm": 0.14006407962839634, + "learning_rate": 1.4285752032585029e-05, + "loss": 0.0019, + "step": 12092 + }, + { + "epoch": 4.917852785685238, + "grad_norm": 1.8220882898698763, + "learning_rate": 1.428483658885664e-05, + "loss": 0.0308, + "step": 12093 + }, + { + "epoch": 4.918259455063033, + "grad_norm": 9.543400096991846, + "learning_rate": 1.4283921101142249e-05, + "loss": 0.6746, + "step": 12094 + }, + { + "epoch": 4.918666124440829, + "grad_norm": 8.253153489968966, + "learning_rate": 1.4283005569451247e-05, + "loss": 0.4898, + "step": 12095 + }, + { + "epoch": 4.919072793818626, + "grad_norm": 9.932228372750084, + "learning_rate": 1.428208999379304e-05, + "loss": 0.238, + "step": 12096 + }, + { + "epoch": 4.919479463196422, + "grad_norm": 9.388530692494614, + "learning_rate": 1.4281174374177025e-05, + "loss": 0.1985, + "step": 12097 + }, + { + "epoch": 4.919886132574217, + "grad_norm": 14.60312385168771, + "learning_rate": 1.4280258710612595e-05, + "loss": 0.4697, + "step": 12098 + }, + { + "epoch": 4.920292801952013, + "grad_norm": 12.449190174150184, + "learning_rate": 1.4279343003109161e-05, + "loss": 0.3948, + "step": 12099 + }, + { + "epoch": 4.920699471329809, + "grad_norm": 12.254493413847623, + "learning_rate": 1.4278427251676115e-05, + "loss": 0.4637, + "step": 12100 + }, + { + "epoch": 4.921106140707605, + "grad_norm": 1.4417291735990834, + "learning_rate": 1.4277511456322861e-05, + "loss": 0.0173, + "step": 12101 + }, + { + "epoch": 4.9215128100854, + "grad_norm": 8.158414106446894, + "learning_rate": 1.4276595617058799e-05, + "loss": 0.2965, + "step": 12102 + }, + { + "epoch": 4.921919479463196, + "grad_norm": 0.34552786569978733, + "learning_rate": 1.427567973389333e-05, + "loss": 0.0095, + "step": 12103 + }, + { + "epoch": 4.922326148840992, + "grad_norm": 2.232901056323996, + "learning_rate": 1.4274763806835857e-05, + "loss": 0.0577, + "step": 12104 + }, + { + "epoch": 
4.922732818218789, + "grad_norm": 3.765632591422925, + "learning_rate": 1.4273847835895783e-05, + "loss": 0.0663, + "step": 12105 + }, + { + "epoch": 4.923139487596584, + "grad_norm": 4.466038418360664, + "learning_rate": 1.4272931821082509e-05, + "loss": 0.0847, + "step": 12106 + }, + { + "epoch": 4.92354615697438, + "grad_norm": 7.860028324152801, + "learning_rate": 1.4272015762405438e-05, + "loss": 0.331, + "step": 12107 + }, + { + "epoch": 4.923952826352176, + "grad_norm": 8.790102363742465, + "learning_rate": 1.4271099659873976e-05, + "loss": 0.4095, + "step": 12108 + }, + { + "epoch": 4.924359495729972, + "grad_norm": 9.031180480169517, + "learning_rate": 1.4270183513497527e-05, + "loss": 0.3143, + "step": 12109 + }, + { + "epoch": 4.924766165107767, + "grad_norm": 1.8790688037941037, + "learning_rate": 1.4269267323285493e-05, + "loss": 0.0346, + "step": 12110 + }, + { + "epoch": 4.925172834485563, + "grad_norm": 3.16759517458068, + "learning_rate": 1.4268351089247286e-05, + "loss": 0.1184, + "step": 12111 + }, + { + "epoch": 4.925579503863359, + "grad_norm": 10.210589033352326, + "learning_rate": 1.4267434811392303e-05, + "loss": 0.5579, + "step": 12112 + }, + { + "epoch": 4.925986173241155, + "grad_norm": 4.254423828696172, + "learning_rate": 1.4266518489729955e-05, + "loss": 0.0462, + "step": 12113 + }, + { + "epoch": 4.9263928426189505, + "grad_norm": 4.030271541430598, + "learning_rate": 1.4265602124269644e-05, + "loss": 0.0873, + "step": 12114 + }, + { + "epoch": 4.926799511996746, + "grad_norm": 0.325902076869454, + "learning_rate": 1.4264685715020785e-05, + "loss": 0.0053, + "step": 12115 + }, + { + "epoch": 4.927206181374542, + "grad_norm": 0.544328919140697, + "learning_rate": 1.4263769261992782e-05, + "loss": 0.0089, + "step": 12116 + }, + { + "epoch": 4.927612850752339, + "grad_norm": 6.663542169150171, + "learning_rate": 1.4262852765195037e-05, + "loss": 0.2397, + "step": 12117 + }, + { + "epoch": 4.9280195201301344, + "grad_norm": 10.445251525585562, + "learning_rate": 1.4261936224636963e-05, + "loss": 0.4055, + "step": 12118 + }, + { + "epoch": 4.92842618950793, + "grad_norm": 5.4401554840740465, + "learning_rate": 1.4261019640327973e-05, + "loss": 0.1987, + "step": 12119 + }, + { + "epoch": 4.928832858885726, + "grad_norm": 6.7324871035725735, + "learning_rate": 1.4260103012277467e-05, + "loss": 0.2096, + "step": 12120 + }, + { + "epoch": 4.929239528263522, + "grad_norm": 4.600787407814331, + "learning_rate": 1.4259186340494862e-05, + "loss": 0.2395, + "step": 12121 + }, + { + "epoch": 4.9296461976413175, + "grad_norm": 1.576758441375022, + "learning_rate": 1.4258269624989568e-05, + "loss": 0.0291, + "step": 12122 + }, + { + "epoch": 4.930052867019113, + "grad_norm": 6.5439684305578245, + "learning_rate": 1.4257352865770989e-05, + "loss": 0.2402, + "step": 12123 + }, + { + "epoch": 4.930459536396909, + "grad_norm": 1.1540965820198539, + "learning_rate": 1.425643606284854e-05, + "loss": 0.0197, + "step": 12124 + }, + { + "epoch": 4.930866205774705, + "grad_norm": 0.7433639621352696, + "learning_rate": 1.4255519216231638e-05, + "loss": 0.0166, + "step": 12125 + }, + { + "epoch": 4.9312728751525015, + "grad_norm": 7.804237630091807, + "learning_rate": 1.4254602325929685e-05, + "loss": 0.2681, + "step": 12126 + }, + { + "epoch": 4.931679544530297, + "grad_norm": 7.952258852614094, + "learning_rate": 1.4253685391952097e-05, + "loss": 0.219, + "step": 12127 + }, + { + "epoch": 4.932086213908093, + "grad_norm": 7.0776051689686925, + "learning_rate": 1.425276841430829e-05, + 
"loss": 0.212, + "step": 12128 + }, + { + "epoch": 4.932492883285889, + "grad_norm": 8.697264498775986, + "learning_rate": 1.4251851393007674e-05, + "loss": 0.2265, + "step": 12129 + }, + { + "epoch": 4.9328995526636845, + "grad_norm": 5.4969359035846495, + "learning_rate": 1.4250934328059663e-05, + "loss": 0.1443, + "step": 12130 + }, + { + "epoch": 4.93330622204148, + "grad_norm": 5.4644981892627085, + "learning_rate": 1.4250017219473673e-05, + "loss": 0.1441, + "step": 12131 + }, + { + "epoch": 4.933712891419276, + "grad_norm": 8.673661154583217, + "learning_rate": 1.4249100067259116e-05, + "loss": 0.5342, + "step": 12132 + }, + { + "epoch": 4.934119560797072, + "grad_norm": 9.83558178819762, + "learning_rate": 1.4248182871425408e-05, + "loss": 0.4174, + "step": 12133 + }, + { + "epoch": 4.934526230174868, + "grad_norm": 1.0447622614226377, + "learning_rate": 1.4247265631981966e-05, + "loss": 0.0166, + "step": 12134 + }, + { + "epoch": 4.934932899552663, + "grad_norm": 6.009863408520216, + "learning_rate": 1.4246348348938207e-05, + "loss": 0.1324, + "step": 12135 + }, + { + "epoch": 4.935339568930459, + "grad_norm": 8.243566235485336, + "learning_rate": 1.424543102230354e-05, + "loss": 0.1997, + "step": 12136 + }, + { + "epoch": 4.935746238308256, + "grad_norm": 12.309614225473425, + "learning_rate": 1.4244513652087389e-05, + "loss": 0.4562, + "step": 12137 + }, + { + "epoch": 4.9361529076860515, + "grad_norm": 19.472748742223487, + "learning_rate": 1.4243596238299172e-05, + "loss": 1.0942, + "step": 12138 + }, + { + "epoch": 4.936559577063847, + "grad_norm": 7.52306931147512, + "learning_rate": 1.42426787809483e-05, + "loss": 0.4558, + "step": 12139 + }, + { + "epoch": 4.936966246441643, + "grad_norm": 0.09448233560669492, + "learning_rate": 1.4241761280044195e-05, + "loss": 0.0015, + "step": 12140 + }, + { + "epoch": 4.937372915819439, + "grad_norm": 6.826035262038311, + "learning_rate": 1.4240843735596276e-05, + "loss": 0.2403, + "step": 12141 + }, + { + "epoch": 4.937779585197235, + "grad_norm": 4.892931932707186, + "learning_rate": 1.4239926147613965e-05, + "loss": 0.1434, + "step": 12142 + }, + { + "epoch": 4.93818625457503, + "grad_norm": 0.5711881136229034, + "learning_rate": 1.4239008516106674e-05, + "loss": 0.0105, + "step": 12143 + }, + { + "epoch": 4.938592923952826, + "grad_norm": 1.02521679349227, + "learning_rate": 1.423809084108383e-05, + "loss": 0.0188, + "step": 12144 + }, + { + "epoch": 4.938999593330622, + "grad_norm": 11.942161201768524, + "learning_rate": 1.423717312255485e-05, + "loss": 0.6106, + "step": 12145 + }, + { + "epoch": 4.9394062627084185, + "grad_norm": 0.39131156588232086, + "learning_rate": 1.4236255360529152e-05, + "loss": 0.0062, + "step": 12146 + }, + { + "epoch": 4.939812932086214, + "grad_norm": 1.0700487487861057, + "learning_rate": 1.4235337555016164e-05, + "loss": 0.0209, + "step": 12147 + }, + { + "epoch": 4.94021960146401, + "grad_norm": 2.6168809234348474, + "learning_rate": 1.4234419706025305e-05, + "loss": 0.0389, + "step": 12148 + }, + { + "epoch": 4.940626270841806, + "grad_norm": 4.445736318997608, + "learning_rate": 1.4233501813565992e-05, + "loss": 0.1069, + "step": 12149 + }, + { + "epoch": 4.941032940219602, + "grad_norm": 6.662505568346341, + "learning_rate": 1.4232583877647659e-05, + "loss": 0.1064, + "step": 12150 + }, + { + "epoch": 4.941439609597397, + "grad_norm": 7.742315765847511, + "learning_rate": 1.4231665898279718e-05, + "loss": 0.195, + "step": 12151 + }, + { + "epoch": 4.941846278975193, + "grad_norm": 
1.3041911943021707, + "learning_rate": 1.4230747875471598e-05, + "loss": 0.0243, + "step": 12152 + }, + { + "epoch": 4.942252948352989, + "grad_norm": 4.67847862773662, + "learning_rate": 1.4229829809232723e-05, + "loss": 0.056, + "step": 12153 + }, + { + "epoch": 4.942659617730785, + "grad_norm": 7.106717186019279, + "learning_rate": 1.4228911699572514e-05, + "loss": 0.2573, + "step": 12154 + }, + { + "epoch": 4.94306628710858, + "grad_norm": 4.940013798185328, + "learning_rate": 1.4227993546500398e-05, + "loss": 0.2471, + "step": 12155 + }, + { + "epoch": 4.943472956486376, + "grad_norm": 3.0630412389869752, + "learning_rate": 1.4227075350025803e-05, + "loss": 0.0758, + "step": 12156 + }, + { + "epoch": 4.943879625864172, + "grad_norm": 5.670858091652952, + "learning_rate": 1.422615711015815e-05, + "loss": 0.1905, + "step": 12157 + }, + { + "epoch": 4.944286295241969, + "grad_norm": 3.150270122871063, + "learning_rate": 1.4225238826906866e-05, + "loss": 0.0937, + "step": 12158 + }, + { + "epoch": 4.944692964619764, + "grad_norm": 6.682911773832398, + "learning_rate": 1.422432050028138e-05, + "loss": 0.1028, + "step": 12159 + }, + { + "epoch": 4.94509963399756, + "grad_norm": 2.435849736239898, + "learning_rate": 1.422340213029112e-05, + "loss": 0.0492, + "step": 12160 + }, + { + "epoch": 4.945506303375356, + "grad_norm": 9.579554600900906, + "learning_rate": 1.422248371694551e-05, + "loss": 0.5768, + "step": 12161 + }, + { + "epoch": 4.945912972753152, + "grad_norm": 0.2212494397473612, + "learning_rate": 1.4221565260253977e-05, + "loss": 0.0032, + "step": 12162 + }, + { + "epoch": 4.946319642130947, + "grad_norm": 3.5984352086678366, + "learning_rate": 1.4220646760225956e-05, + "loss": 0.0503, + "step": 12163 + }, + { + "epoch": 4.946726311508743, + "grad_norm": 7.446152893759979, + "learning_rate": 1.4219728216870872e-05, + "loss": 0.2995, + "step": 12164 + }, + { + "epoch": 4.947132980886539, + "grad_norm": 0.131228569786816, + "learning_rate": 1.4218809630198149e-05, + "loss": 0.0028, + "step": 12165 + }, + { + "epoch": 4.947539650264335, + "grad_norm": 0.27815861314718227, + "learning_rate": 1.4217891000217225e-05, + "loss": 0.0058, + "step": 12166 + }, + { + "epoch": 4.947946319642131, + "grad_norm": 4.205818757533956, + "learning_rate": 1.4216972326937528e-05, + "loss": 0.1945, + "step": 12167 + }, + { + "epoch": 4.948352989019927, + "grad_norm": 5.966763284654234, + "learning_rate": 1.4216053610368485e-05, + "loss": 0.2029, + "step": 12168 + }, + { + "epoch": 4.948759658397723, + "grad_norm": 6.263093584448391, + "learning_rate": 1.4215134850519535e-05, + "loss": 0.0792, + "step": 12169 + }, + { + "epoch": 4.949166327775519, + "grad_norm": 6.741167616492569, + "learning_rate": 1.42142160474001e-05, + "loss": 0.2654, + "step": 12170 + }, + { + "epoch": 4.9495729971533144, + "grad_norm": 8.908503896172634, + "learning_rate": 1.4213297201019618e-05, + "loss": 0.295, + "step": 12171 + }, + { + "epoch": 4.94997966653111, + "grad_norm": 12.825930035682502, + "learning_rate": 1.4212378311387519e-05, + "loss": 0.6848, + "step": 12172 + }, + { + "epoch": 4.950386335908906, + "grad_norm": 2.110639581781277, + "learning_rate": 1.4211459378513238e-05, + "loss": 0.0426, + "step": 12173 + }, + { + "epoch": 4.950793005286702, + "grad_norm": 2.3395794523072007, + "learning_rate": 1.4210540402406205e-05, + "loss": 0.0409, + "step": 12174 + }, + { + "epoch": 4.9511996746644975, + "grad_norm": 6.521656317263583, + "learning_rate": 1.4209621383075854e-05, + "loss": 0.3965, + "step": 12175 + }, + { + 
"epoch": 4.951606344042293, + "grad_norm": 2.3226675259985736, + "learning_rate": 1.4208702320531625e-05, + "loss": 0.0442, + "step": 12176 + }, + { + "epoch": 4.952013013420089, + "grad_norm": 13.155566879056902, + "learning_rate": 1.4207783214782946e-05, + "loss": 0.3192, + "step": 12177 + }, + { + "epoch": 4.952419682797886, + "grad_norm": 8.213811394471707, + "learning_rate": 1.4206864065839254e-05, + "loss": 0.2016, + "step": 12178 + }, + { + "epoch": 4.9528263521756815, + "grad_norm": 0.2718704145515477, + "learning_rate": 1.4205944873709987e-05, + "loss": 0.0038, + "step": 12179 + }, + { + "epoch": 4.953233021553477, + "grad_norm": 0.3172029811983023, + "learning_rate": 1.4205025638404579e-05, + "loss": 0.0027, + "step": 12180 + }, + { + "epoch": 4.953639690931273, + "grad_norm": 6.653369380703359, + "learning_rate": 1.4204106359932463e-05, + "loss": 0.3426, + "step": 12181 + }, + { + "epoch": 4.954046360309069, + "grad_norm": 4.74902204775432, + "learning_rate": 1.4203187038303083e-05, + "loss": 0.1208, + "step": 12182 + }, + { + "epoch": 4.9544530296868645, + "grad_norm": 7.818586180218814, + "learning_rate": 1.420226767352587e-05, + "loss": 0.1074, + "step": 12183 + }, + { + "epoch": 4.95485969906466, + "grad_norm": 15.897812878619177, + "learning_rate": 1.420134826561027e-05, + "loss": 0.7421, + "step": 12184 + }, + { + "epoch": 4.955266368442456, + "grad_norm": 5.394639604545132, + "learning_rate": 1.420042881456571e-05, + "loss": 0.0894, + "step": 12185 + }, + { + "epoch": 4.955673037820252, + "grad_norm": 8.082218667761628, + "learning_rate": 1.4199509320401635e-05, + "loss": 0.287, + "step": 12186 + }, + { + "epoch": 4.9560797071980485, + "grad_norm": 10.133077071086221, + "learning_rate": 1.4198589783127486e-05, + "loss": 0.2082, + "step": 12187 + }, + { + "epoch": 4.956486376575844, + "grad_norm": 0.6698852914548834, + "learning_rate": 1.4197670202752695e-05, + "loss": 0.0149, + "step": 12188 + }, + { + "epoch": 4.95689304595364, + "grad_norm": 5.029762144881369, + "learning_rate": 1.4196750579286711e-05, + "loss": 0.0964, + "step": 12189 + }, + { + "epoch": 4.957299715331436, + "grad_norm": 8.179195649316654, + "learning_rate": 1.4195830912738968e-05, + "loss": 0.1577, + "step": 12190 + }, + { + "epoch": 4.9577063847092315, + "grad_norm": 13.176666026770054, + "learning_rate": 1.4194911203118908e-05, + "loss": 0.2855, + "step": 12191 + }, + { + "epoch": 4.958113054087027, + "grad_norm": 6.55344610740688, + "learning_rate": 1.4193991450435976e-05, + "loss": 0.0808, + "step": 12192 + }, + { + "epoch": 4.958519723464823, + "grad_norm": 6.426742174399507, + "learning_rate": 1.419307165469961e-05, + "loss": 0.1902, + "step": 12193 + }, + { + "epoch": 4.958926392842619, + "grad_norm": 11.905351134597339, + "learning_rate": 1.4192151815919252e-05, + "loss": 0.4786, + "step": 12194 + }, + { + "epoch": 4.959333062220415, + "grad_norm": 18.644895912121243, + "learning_rate": 1.4191231934104348e-05, + "loss": 0.6718, + "step": 12195 + }, + { + "epoch": 4.95973973159821, + "grad_norm": 4.133708654897291, + "learning_rate": 1.4190312009264338e-05, + "loss": 0.0803, + "step": 12196 + }, + { + "epoch": 4.960146400976006, + "grad_norm": 12.951457146106103, + "learning_rate": 1.4189392041408666e-05, + "loss": 0.6773, + "step": 12197 + }, + { + "epoch": 4.960553070353802, + "grad_norm": 4.580801054672209, + "learning_rate": 1.4188472030546774e-05, + "loss": 0.1436, + "step": 12198 + }, + { + "epoch": 4.9609597397315985, + "grad_norm": 10.333619842109176, + "learning_rate": 
1.4187551976688112e-05, + "loss": 0.5308, + "step": 12199 + }, + { + "epoch": 4.961366409109394, + "grad_norm": 6.587612713776279, + "learning_rate": 1.4186631879842122e-05, + "loss": 0.2148, + "step": 12200 + }, + { + "epoch": 4.96177307848719, + "grad_norm": 0.16268569551271914, + "learning_rate": 1.4185711740018244e-05, + "loss": 0.003, + "step": 12201 + }, + { + "epoch": 4.962179747864986, + "grad_norm": 14.96558259610286, + "learning_rate": 1.4184791557225932e-05, + "loss": 0.1352, + "step": 12202 + }, + { + "epoch": 4.962586417242782, + "grad_norm": 0.5018072746690807, + "learning_rate": 1.4183871331474628e-05, + "loss": 0.0072, + "step": 12203 + }, + { + "epoch": 4.962993086620577, + "grad_norm": 9.442995852477623, + "learning_rate": 1.4182951062773778e-05, + "loss": 0.2137, + "step": 12204 + }, + { + "epoch": 4.963399755998373, + "grad_norm": 1.334645110265986, + "learning_rate": 1.4182030751132832e-05, + "loss": 0.0246, + "step": 12205 + }, + { + "epoch": 4.963806425376169, + "grad_norm": 2.961884291226837, + "learning_rate": 1.4181110396561235e-05, + "loss": 0.0804, + "step": 12206 + }, + { + "epoch": 4.964213094753965, + "grad_norm": 5.378712035493674, + "learning_rate": 1.4180189999068433e-05, + "loss": 0.1742, + "step": 12207 + }, + { + "epoch": 4.964619764131761, + "grad_norm": 10.720276771806128, + "learning_rate": 1.4179269558663879e-05, + "loss": 0.511, + "step": 12208 + }, + { + "epoch": 4.965026433509557, + "grad_norm": 7.5668922748194305, + "learning_rate": 1.4178349075357018e-05, + "loss": 0.21, + "step": 12209 + }, + { + "epoch": 4.965433102887353, + "grad_norm": 6.113456987870103, + "learning_rate": 1.4177428549157303e-05, + "loss": 0.1609, + "step": 12210 + }, + { + "epoch": 4.965839772265149, + "grad_norm": 1.4859573410576536, + "learning_rate": 1.4176507980074181e-05, + "loss": 0.0299, + "step": 12211 + }, + { + "epoch": 4.966246441642944, + "grad_norm": 1.2708587226138046, + "learning_rate": 1.4175587368117103e-05, + "loss": 0.0516, + "step": 12212 + }, + { + "epoch": 4.96665311102074, + "grad_norm": 4.289889071200298, + "learning_rate": 1.4174666713295517e-05, + "loss": 0.0861, + "step": 12213 + }, + { + "epoch": 4.967059780398536, + "grad_norm": 14.581482858306881, + "learning_rate": 1.4173746015618876e-05, + "loss": 1.0494, + "step": 12214 + }, + { + "epoch": 4.967466449776332, + "grad_norm": 0.9336892404705462, + "learning_rate": 1.4172825275096634e-05, + "loss": 0.0151, + "step": 12215 + }, + { + "epoch": 4.967873119154127, + "grad_norm": 11.548398941083668, + "learning_rate": 1.4171904491738237e-05, + "loss": 0.5237, + "step": 12216 + }, + { + "epoch": 4.968279788531923, + "grad_norm": 9.090901656180797, + "learning_rate": 1.4170983665553144e-05, + "loss": 0.2493, + "step": 12217 + }, + { + "epoch": 4.968686457909719, + "grad_norm": 0.5593356109167253, + "learning_rate": 1.4170062796550805e-05, + "loss": 0.0096, + "step": 12218 + }, + { + "epoch": 4.969093127287516, + "grad_norm": 6.942195134665271, + "learning_rate": 1.4169141884740672e-05, + "loss": 0.5262, + "step": 12219 + }, + { + "epoch": 4.969499796665311, + "grad_norm": 2.6150075755740994, + "learning_rate": 1.4168220930132198e-05, + "loss": 0.0419, + "step": 12220 + }, + { + "epoch": 4.969906466043107, + "grad_norm": 7.048697159381029, + "learning_rate": 1.4167299932734838e-05, + "loss": 0.1917, + "step": 12221 + }, + { + "epoch": 4.970313135420903, + "grad_norm": 6.307681861207172, + "learning_rate": 1.4166378892558048e-05, + "loss": 0.3252, + "step": 12222 + }, + { + "epoch": 4.970719804798699, + 
"grad_norm": 4.788169134211972, + "learning_rate": 1.416545780961128e-05, + "loss": 0.5007, + "step": 12223 + }, + { + "epoch": 4.9711264741764944, + "grad_norm": 12.492463371404893, + "learning_rate": 1.4164536683903993e-05, + "loss": 0.3091, + "step": 12224 + }, + { + "epoch": 4.97153314355429, + "grad_norm": 1.9812222865114184, + "learning_rate": 1.416361551544564e-05, + "loss": 0.0457, + "step": 12225 + }, + { + "epoch": 4.971939812932086, + "grad_norm": 7.777389325805172, + "learning_rate": 1.4162694304245679e-05, + "loss": 0.3774, + "step": 12226 + }, + { + "epoch": 4.972346482309882, + "grad_norm": 7.309653598754305, + "learning_rate": 1.4161773050313566e-05, + "loss": 0.2729, + "step": 12227 + }, + { + "epoch": 4.972753151687678, + "grad_norm": 5.76804005654708, + "learning_rate": 1.4160851753658757e-05, + "loss": 0.2718, + "step": 12228 + }, + { + "epoch": 4.973159821065474, + "grad_norm": 6.443605182937859, + "learning_rate": 1.4159930414290713e-05, + "loss": 0.1772, + "step": 12229 + }, + { + "epoch": 4.97356649044327, + "grad_norm": 13.117461685742361, + "learning_rate": 1.4159009032218886e-05, + "loss": 0.3772, + "step": 12230 + }, + { + "epoch": 4.973973159821066, + "grad_norm": 3.5489136071756837, + "learning_rate": 1.4158087607452741e-05, + "loss": 0.0625, + "step": 12231 + }, + { + "epoch": 4.9743798291988615, + "grad_norm": 11.129181805995508, + "learning_rate": 1.4157166140001733e-05, + "loss": 0.661, + "step": 12232 + }, + { + "epoch": 4.974786498576657, + "grad_norm": 4.507265078686417, + "learning_rate": 1.4156244629875321e-05, + "loss": 0.1892, + "step": 12233 + }, + { + "epoch": 4.975193167954453, + "grad_norm": 6.138278017468269, + "learning_rate": 1.4155323077082967e-05, + "loss": 0.1265, + "step": 12234 + }, + { + "epoch": 4.975599837332249, + "grad_norm": 8.79379750900137, + "learning_rate": 1.4154401481634132e-05, + "loss": 0.19, + "step": 12235 + }, + { + "epoch": 4.9760065067100445, + "grad_norm": 6.000601322603327, + "learning_rate": 1.415347984353827e-05, + "loss": 0.1958, + "step": 12236 + }, + { + "epoch": 4.97641317608784, + "grad_norm": 6.472644560284784, + "learning_rate": 1.4152558162804854e-05, + "loss": 0.2486, + "step": 12237 + }, + { + "epoch": 4.976819845465636, + "grad_norm": 25.481229717816387, + "learning_rate": 1.4151636439443338e-05, + "loss": 0.6748, + "step": 12238 + }, + { + "epoch": 4.977226514843432, + "grad_norm": 9.876181961423773, + "learning_rate": 1.4150714673463177e-05, + "loss": 0.3479, + "step": 12239 + }, + { + "epoch": 4.9776331842212285, + "grad_norm": 3.5758738790751203, + "learning_rate": 1.4149792864873847e-05, + "loss": 0.042, + "step": 12240 + }, + { + "epoch": 4.978039853599024, + "grad_norm": 9.265123117911868, + "learning_rate": 1.4148871013684803e-05, + "loss": 0.5669, + "step": 12241 + }, + { + "epoch": 4.97844652297682, + "grad_norm": 0.663784158613667, + "learning_rate": 1.4147949119905511e-05, + "loss": 0.0097, + "step": 12242 + }, + { + "epoch": 4.978853192354616, + "grad_norm": 3.4080502014638214, + "learning_rate": 1.4147027183545432e-05, + "loss": 0.0684, + "step": 12243 + }, + { + "epoch": 4.9792598617324115, + "grad_norm": 34.15323390234773, + "learning_rate": 1.4146105204614034e-05, + "loss": 1.6397, + "step": 12244 + }, + { + "epoch": 4.979666531110207, + "grad_norm": 5.673815466929053, + "learning_rate": 1.414518318312078e-05, + "loss": 0.1658, + "step": 12245 + }, + { + "epoch": 4.980073200488003, + "grad_norm": 21.34924325095994, + "learning_rate": 1.4144261119075129e-05, + "loss": 1.6686, + "step": 
12246 + }, + { + "epoch": 4.980479869865799, + "grad_norm": 4.755257747437969, + "learning_rate": 1.4143339012486554e-05, + "loss": 0.1953, + "step": 12247 + }, + { + "epoch": 4.980886539243595, + "grad_norm": 5.651947268920741, + "learning_rate": 1.4142416863364523e-05, + "loss": 0.1054, + "step": 12248 + }, + { + "epoch": 4.981293208621391, + "grad_norm": 1.552956206442286, + "learning_rate": 1.4141494671718491e-05, + "loss": 0.0531, + "step": 12249 + }, + { + "epoch": 4.981699877999187, + "grad_norm": 3.2045741189835035, + "learning_rate": 1.4140572437557939e-05, + "loss": 0.0558, + "step": 12250 + }, + { + "epoch": 4.982106547376983, + "grad_norm": 3.5577872845203684, + "learning_rate": 1.4139650160892324e-05, + "loss": 0.0624, + "step": 12251 + }, + { + "epoch": 4.9825132167547785, + "grad_norm": 1.3928341995625673, + "learning_rate": 1.4138727841731117e-05, + "loss": 0.0316, + "step": 12252 + }, + { + "epoch": 4.982919886132574, + "grad_norm": 1.4082332415135324, + "learning_rate": 1.4137805480083786e-05, + "loss": 0.0285, + "step": 12253 + }, + { + "epoch": 4.98332655551037, + "grad_norm": 3.305420264578577, + "learning_rate": 1.4136883075959799e-05, + "loss": 0.1279, + "step": 12254 + }, + { + "epoch": 4.983733224888166, + "grad_norm": 6.788495751746249, + "learning_rate": 1.4135960629368623e-05, + "loss": 0.1364, + "step": 12255 + }, + { + "epoch": 4.984139894265962, + "grad_norm": 3.959052430627953, + "learning_rate": 1.4135038140319732e-05, + "loss": 0.1553, + "step": 12256 + }, + { + "epoch": 4.984546563643757, + "grad_norm": 12.318190453579156, + "learning_rate": 1.4134115608822593e-05, + "loss": 0.3768, + "step": 12257 + }, + { + "epoch": 4.984953233021553, + "grad_norm": 16.23671775633952, + "learning_rate": 1.4133193034886674e-05, + "loss": 0.884, + "step": 12258 + }, + { + "epoch": 4.985359902399349, + "grad_norm": 11.407122677902406, + "learning_rate": 1.4132270418521452e-05, + "loss": 0.1427, + "step": 12259 + }, + { + "epoch": 4.9857665717771456, + "grad_norm": 9.28253755585266, + "learning_rate": 1.4131347759736393e-05, + "loss": 0.4414, + "step": 12260 + }, + { + "epoch": 4.986173241154941, + "grad_norm": 5.63658884552895, + "learning_rate": 1.4130425058540969e-05, + "loss": 0.155, + "step": 12261 + }, + { + "epoch": 4.986579910532737, + "grad_norm": 6.158505032762759, + "learning_rate": 1.4129502314944654e-05, + "loss": 0.2516, + "step": 12262 + }, + { + "epoch": 4.986986579910533, + "grad_norm": 5.001317001050678, + "learning_rate": 1.412857952895692e-05, + "loss": 0.1337, + "step": 12263 + }, + { + "epoch": 4.987393249288329, + "grad_norm": 1.9768805408762884, + "learning_rate": 1.412765670058724e-05, + "loss": 0.0224, + "step": 12264 + }, + { + "epoch": 4.987799918666124, + "grad_norm": 1.3459373057201203, + "learning_rate": 1.4126733829845083e-05, + "loss": 0.0283, + "step": 12265 + }, + { + "epoch": 4.98820658804392, + "grad_norm": 7.621963873577625, + "learning_rate": 1.4125810916739929e-05, + "loss": 0.2068, + "step": 12266 + }, + { + "epoch": 4.988613257421716, + "grad_norm": 0.5319164427264461, + "learning_rate": 1.4124887961281251e-05, + "loss": 0.008, + "step": 12267 + }, + { + "epoch": 4.989019926799512, + "grad_norm": 1.2640712995404413, + "learning_rate": 1.4123964963478518e-05, + "loss": 0.0307, + "step": 12268 + }, + { + "epoch": 4.989426596177308, + "grad_norm": 11.224716693712951, + "learning_rate": 1.4123041923341213e-05, + "loss": 0.5478, + "step": 12269 + }, + { + "epoch": 4.989833265555104, + "grad_norm": 12.967250086038876, + "learning_rate": 
1.4122118840878805e-05, + "loss": 0.8034, + "step": 12270 + }, + { + "epoch": 4.9902399349329, + "grad_norm": 13.808025871265155, + "learning_rate": 1.412119571610077e-05, + "loss": 0.6931, + "step": 12271 + }, + { + "epoch": 4.990646604310696, + "grad_norm": 6.918971052101731, + "learning_rate": 1.4120272549016591e-05, + "loss": 0.3112, + "step": 12272 + }, + { + "epoch": 4.991053273688491, + "grad_norm": 5.080254335548285, + "learning_rate": 1.4119349339635742e-05, + "loss": 0.1147, + "step": 12273 + }, + { + "epoch": 4.991459943066287, + "grad_norm": 0.8915730269888423, + "learning_rate": 1.4118426087967693e-05, + "loss": 0.0137, + "step": 12274 + }, + { + "epoch": 4.991866612444083, + "grad_norm": 0.23316518229996297, + "learning_rate": 1.4117502794021931e-05, + "loss": 0.0049, + "step": 12275 + }, + { + "epoch": 4.992273281821879, + "grad_norm": 0.9532152087961013, + "learning_rate": 1.4116579457807932e-05, + "loss": 0.0194, + "step": 12276 + }, + { + "epoch": 4.9926799511996744, + "grad_norm": 12.726921490309447, + "learning_rate": 1.4115656079335172e-05, + "loss": 1.0198, + "step": 12277 + }, + { + "epoch": 4.99308662057747, + "grad_norm": 4.796434455164331, + "learning_rate": 1.4114732658613128e-05, + "loss": 0.2745, + "step": 12278 + }, + { + "epoch": 4.993493289955266, + "grad_norm": 6.6718192179154805, + "learning_rate": 1.4113809195651287e-05, + "loss": 0.1886, + "step": 12279 + }, + { + "epoch": 4.993899959333062, + "grad_norm": 8.330116680856541, + "learning_rate": 1.4112885690459121e-05, + "loss": 0.2874, + "step": 12280 + }, + { + "epoch": 4.994306628710858, + "grad_norm": 0.1260619713094804, + "learning_rate": 1.4111962143046113e-05, + "loss": 0.003, + "step": 12281 + }, + { + "epoch": 4.994713298088654, + "grad_norm": 5.661671160977864, + "learning_rate": 1.4111038553421748e-05, + "loss": 0.1152, + "step": 12282 + }, + { + "epoch": 4.99511996746645, + "grad_norm": 10.736985671459472, + "learning_rate": 1.41101149215955e-05, + "loss": 0.2969, + "step": 12283 + }, + { + "epoch": 4.995526636844246, + "grad_norm": 4.483267060238845, + "learning_rate": 1.4109191247576852e-05, + "loss": 0.1083, + "step": 12284 + }, + { + "epoch": 4.9959333062220415, + "grad_norm": 7.658931475419352, + "learning_rate": 1.410826753137529e-05, + "loss": 0.4444, + "step": 12285 + }, + { + "epoch": 4.996339975599837, + "grad_norm": 0.32231493719871146, + "learning_rate": 1.4107343773000296e-05, + "loss": 0.0081, + "step": 12286 + }, + { + "epoch": 4.996746644977633, + "grad_norm": 7.3752511037796005, + "learning_rate": 1.4106419972461349e-05, + "loss": 0.2017, + "step": 12287 + }, + { + "epoch": 4.997153314355429, + "grad_norm": 10.323312947573214, + "learning_rate": 1.4105496129767934e-05, + "loss": 0.3472, + "step": 12288 + }, + { + "epoch": 4.9975599837332245, + "grad_norm": 3.127844041319637, + "learning_rate": 1.4104572244929535e-05, + "loss": 0.0683, + "step": 12289 + }, + { + "epoch": 4.997966653111021, + "grad_norm": 4.631377467701493, + "learning_rate": 1.4103648317955638e-05, + "loss": 0.0846, + "step": 12290 + }, + { + "epoch": 4.998373322488817, + "grad_norm": 12.083965092482226, + "learning_rate": 1.4102724348855724e-05, + "loss": 0.4431, + "step": 12291 + }, + { + "epoch": 4.998779991866613, + "grad_norm": 4.172602625823963, + "learning_rate": 1.410180033763928e-05, + "loss": 0.0856, + "step": 12292 + }, + { + "epoch": 4.9991866612444085, + "grad_norm": 1.568848444893642, + "learning_rate": 1.4100876284315792e-05, + "loss": 0.0378, + "step": 12293 + }, + { + "epoch": 4.999593330622204, 
+ "grad_norm": 0.5657852667079181, + "learning_rate": 1.4099952188894742e-05, + "loss": 0.0166, + "step": 12294 + }, + { + "epoch": 5.0, + "grad_norm": 5.077752146317735, + "learning_rate": 1.4099028051385621e-05, + "loss": 0.1235, + "step": 12295 + }, + { + "epoch": 5.000406669377796, + "grad_norm": 2.7248595639543556, + "learning_rate": 1.4098103871797915e-05, + "loss": 0.0595, + "step": 12296 + }, + { + "epoch": 5.0008133387555915, + "grad_norm": 13.92992266324352, + "learning_rate": 1.4097179650141108e-05, + "loss": 0.326, + "step": 12297 + }, + { + "epoch": 5.001220008133387, + "grad_norm": 1.056872521102284, + "learning_rate": 1.4096255386424694e-05, + "loss": 0.0188, + "step": 12298 + }, + { + "epoch": 5.001626677511183, + "grad_norm": 0.1756045688924431, + "learning_rate": 1.4095331080658155e-05, + "loss": 0.004, + "step": 12299 + }, + { + "epoch": 5.002033346888979, + "grad_norm": 2.679871247616172, + "learning_rate": 1.409440673285098e-05, + "loss": 0.0433, + "step": 12300 + }, + { + "epoch": 5.0024400162667755, + "grad_norm": 3.9589079576821615, + "learning_rate": 1.409348234301266e-05, + "loss": 0.1867, + "step": 12301 + }, + { + "epoch": 5.002846685644571, + "grad_norm": 7.569566335205782, + "learning_rate": 1.4092557911152684e-05, + "loss": 0.148, + "step": 12302 + }, + { + "epoch": 5.003253355022367, + "grad_norm": 3.2092037564900897, + "learning_rate": 1.4091633437280542e-05, + "loss": 0.0576, + "step": 12303 + }, + { + "epoch": 5.003660024400163, + "grad_norm": 3.9615586616590313, + "learning_rate": 1.409070892140572e-05, + "loss": 0.1834, + "step": 12304 + }, + { + "epoch": 5.0040666937779585, + "grad_norm": 3.357149339869578, + "learning_rate": 1.4089784363537715e-05, + "loss": 0.1575, + "step": 12305 + }, + { + "epoch": 5.004473363155754, + "grad_norm": 3.6587770099834778, + "learning_rate": 1.4088859763686014e-05, + "loss": 0.0699, + "step": 12306 + }, + { + "epoch": 5.00488003253355, + "grad_norm": 6.048167525215196, + "learning_rate": 1.408793512186011e-05, + "loss": 0.4037, + "step": 12307 + }, + { + "epoch": 5.005286701911346, + "grad_norm": 12.557162891526923, + "learning_rate": 1.4087010438069496e-05, + "loss": 0.4236, + "step": 12308 + }, + { + "epoch": 5.005693371289142, + "grad_norm": 1.2197102005752478, + "learning_rate": 1.4086085712323661e-05, + "loss": 0.039, + "step": 12309 + }, + { + "epoch": 5.006100040666937, + "grad_norm": 3.6069484885060437, + "learning_rate": 1.40851609446321e-05, + "loss": 0.0546, + "step": 12310 + }, + { + "epoch": 5.006506710044734, + "grad_norm": 10.513777783471824, + "learning_rate": 1.4084236135004306e-05, + "loss": 0.52, + "step": 12311 + }, + { + "epoch": 5.00691337942253, + "grad_norm": 1.3400638996221697, + "learning_rate": 1.4083311283449774e-05, + "loss": 0.0209, + "step": 12312 + }, + { + "epoch": 5.0073200488003256, + "grad_norm": 1.850969581369625, + "learning_rate": 1.4082386389977993e-05, + "loss": 0.0738, + "step": 12313 + }, + { + "epoch": 5.007726718178121, + "grad_norm": 12.760665830305054, + "learning_rate": 1.4081461454598463e-05, + "loss": 0.8331, + "step": 12314 + }, + { + "epoch": 5.008133387555917, + "grad_norm": 0.646008868728196, + "learning_rate": 1.408053647732068e-05, + "loss": 0.0097, + "step": 12315 + }, + { + "epoch": 5.008540056933713, + "grad_norm": 6.285010370217605, + "learning_rate": 1.4079611458154133e-05, + "loss": 0.2494, + "step": 12316 + }, + { + "epoch": 5.008946726311509, + "grad_norm": 3.025205889250879, + "learning_rate": 1.407868639710832e-05, + "loss": 0.0532, + "step": 12317 + }, + 
{ + "epoch": 5.009353395689304, + "grad_norm": 3.618149453886878, + "learning_rate": 1.4077761294192743e-05, + "loss": 0.0574, + "step": 12318 + }, + { + "epoch": 5.0097600650671, + "grad_norm": 4.5403068647195255, + "learning_rate": 1.4076836149416889e-05, + "loss": 0.1519, + "step": 12319 + }, + { + "epoch": 5.010166734444896, + "grad_norm": 20.012247448292037, + "learning_rate": 1.4075910962790263e-05, + "loss": 0.3592, + "step": 12320 + }, + { + "epoch": 5.010573403822693, + "grad_norm": 9.573122707185787, + "learning_rate": 1.407498573432236e-05, + "loss": 0.8018, + "step": 12321 + }, + { + "epoch": 5.010980073200488, + "grad_norm": 2.3684155770332427, + "learning_rate": 1.4074060464022677e-05, + "loss": 0.0551, + "step": 12322 + }, + { + "epoch": 5.011386742578284, + "grad_norm": 5.311581977113808, + "learning_rate": 1.407313515190071e-05, + "loss": 0.2407, + "step": 12323 + }, + { + "epoch": 5.01179341195608, + "grad_norm": 0.26098811604564404, + "learning_rate": 1.4072209797965965e-05, + "loss": 0.0039, + "step": 12324 + }, + { + "epoch": 5.012200081333876, + "grad_norm": 2.2374687821492096, + "learning_rate": 1.4071284402227937e-05, + "loss": 0.0427, + "step": 12325 + }, + { + "epoch": 5.012606750711671, + "grad_norm": 5.474407147228924, + "learning_rate": 1.407035896469612e-05, + "loss": 0.0933, + "step": 12326 + }, + { + "epoch": 5.013013420089467, + "grad_norm": 0.5946277730294383, + "learning_rate": 1.4069433485380024e-05, + "loss": 0.0082, + "step": 12327 + }, + { + "epoch": 5.013420089467263, + "grad_norm": 1.867841998071515, + "learning_rate": 1.4068507964289145e-05, + "loss": 0.0426, + "step": 12328 + }, + { + "epoch": 5.013826758845059, + "grad_norm": 1.0293160719311056, + "learning_rate": 1.4067582401432983e-05, + "loss": 0.0175, + "step": 12329 + }, + { + "epoch": 5.0142334282228544, + "grad_norm": 0.352840260679191, + "learning_rate": 1.406665679682104e-05, + "loss": 0.0073, + "step": 12330 + }, + { + "epoch": 5.014640097600651, + "grad_norm": 7.698154755526167, + "learning_rate": 1.4065731150462823e-05, + "loss": 0.1405, + "step": 12331 + }, + { + "epoch": 5.015046766978447, + "grad_norm": 5.008922592579296, + "learning_rate": 1.4064805462367823e-05, + "loss": 0.1657, + "step": 12332 + }, + { + "epoch": 5.015453436356243, + "grad_norm": 0.6065284433260003, + "learning_rate": 1.4063879732545553e-05, + "loss": 0.0083, + "step": 12333 + }, + { + "epoch": 5.015860105734038, + "grad_norm": 11.775257748107043, + "learning_rate": 1.4062953961005512e-05, + "loss": 0.6577, + "step": 12334 + }, + { + "epoch": 5.016266775111834, + "grad_norm": 0.11598764621034277, + "learning_rate": 1.4062028147757203e-05, + "loss": 0.0025, + "step": 12335 + }, + { + "epoch": 5.01667344448963, + "grad_norm": 5.340375124738369, + "learning_rate": 1.4061102292810129e-05, + "loss": 0.159, + "step": 12336 + }, + { + "epoch": 5.017080113867426, + "grad_norm": 7.524403459936318, + "learning_rate": 1.4060176396173798e-05, + "loss": 0.3318, + "step": 12337 + }, + { + "epoch": 5.0174867832452215, + "grad_norm": 3.1767755564215485, + "learning_rate": 1.4059250457857714e-05, + "loss": 0.0886, + "step": 12338 + }, + { + "epoch": 5.017893452623017, + "grad_norm": 0.8015373364033404, + "learning_rate": 1.4058324477871377e-05, + "loss": 0.0148, + "step": 12339 + }, + { + "epoch": 5.018300122000813, + "grad_norm": 6.468739706816754, + "learning_rate": 1.40573984562243e-05, + "loss": 0.1317, + "step": 12340 + }, + { + "epoch": 5.018706791378609, + "grad_norm": 4.355099613216408, + "learning_rate": 
1.4056472392925986e-05, + "loss": 0.1426, + "step": 12341 + }, + { + "epoch": 5.019113460756405, + "grad_norm": 8.624308746304425, + "learning_rate": 1.4055546287985939e-05, + "loss": 0.1694, + "step": 12342 + }, + { + "epoch": 5.019520130134201, + "grad_norm": 0.10542727987692309, + "learning_rate": 1.405462014141367e-05, + "loss": 0.0018, + "step": 12343 + }, + { + "epoch": 5.019926799511997, + "grad_norm": 9.187307131942177, + "learning_rate": 1.4053693953218686e-05, + "loss": 0.4338, + "step": 12344 + }, + { + "epoch": 5.020333468889793, + "grad_norm": 3.7273442497544926, + "learning_rate": 1.4052767723410493e-05, + "loss": 0.151, + "step": 12345 + }, + { + "epoch": 5.0207401382675885, + "grad_norm": 6.143614027555037, + "learning_rate": 1.4051841451998596e-05, + "loss": 0.1, + "step": 12346 + }, + { + "epoch": 5.021146807645384, + "grad_norm": 6.485138420901452, + "learning_rate": 1.4050915138992508e-05, + "loss": 0.447, + "step": 12347 + }, + { + "epoch": 5.02155347702318, + "grad_norm": 2.9719064910575197, + "learning_rate": 1.404998878440174e-05, + "loss": 0.0476, + "step": 12348 + }, + { + "epoch": 5.021960146400976, + "grad_norm": 13.284788385579416, + "learning_rate": 1.4049062388235796e-05, + "loss": 0.5058, + "step": 12349 + }, + { + "epoch": 5.0223668157787715, + "grad_norm": 8.080869872408668, + "learning_rate": 1.404813595050419e-05, + "loss": 0.9706, + "step": 12350 + }, + { + "epoch": 5.022773485156567, + "grad_norm": 8.681754180800617, + "learning_rate": 1.4047209471216434e-05, + "loss": 0.1941, + "step": 12351 + }, + { + "epoch": 5.023180154534364, + "grad_norm": 0.059149189565003424, + "learning_rate": 1.4046282950382035e-05, + "loss": 0.0013, + "step": 12352 + }, + { + "epoch": 5.02358682391216, + "grad_norm": 1.402464360192492, + "learning_rate": 1.4045356388010505e-05, + "loss": 0.0302, + "step": 12353 + }, + { + "epoch": 5.0239934932899555, + "grad_norm": 2.0294256703114764, + "learning_rate": 1.4044429784111354e-05, + "loss": 0.0325, + "step": 12354 + }, + { + "epoch": 5.024400162667751, + "grad_norm": 1.1443336418144778, + "learning_rate": 1.4043503138694093e-05, + "loss": 0.0191, + "step": 12355 + }, + { + "epoch": 5.024806832045547, + "grad_norm": 5.935637770941098, + "learning_rate": 1.4042576451768242e-05, + "loss": 0.1364, + "step": 12356 + }, + { + "epoch": 5.025213501423343, + "grad_norm": 3.562077635590821, + "learning_rate": 1.4041649723343306e-05, + "loss": 0.0827, + "step": 12357 + }, + { + "epoch": 5.0256201708011385, + "grad_norm": 2.2442531973157616, + "learning_rate": 1.4040722953428803e-05, + "loss": 0.0436, + "step": 12358 + }, + { + "epoch": 5.026026840178934, + "grad_norm": 5.3862067956739645, + "learning_rate": 1.4039796142034245e-05, + "loss": 0.1037, + "step": 12359 + }, + { + "epoch": 5.02643350955673, + "grad_norm": 5.139732230143249, + "learning_rate": 1.4038869289169146e-05, + "loss": 0.1891, + "step": 12360 + }, + { + "epoch": 5.026840178934526, + "grad_norm": 0.052972503952649186, + "learning_rate": 1.403794239484302e-05, + "loss": 0.001, + "step": 12361 + }, + { + "epoch": 5.0272468483123225, + "grad_norm": 7.229332684000878, + "learning_rate": 1.4037015459065387e-05, + "loss": 0.1591, + "step": 12362 + }, + { + "epoch": 5.027653517690118, + "grad_norm": 7.844309070693972, + "learning_rate": 1.4036088481845756e-05, + "loss": 0.1815, + "step": 12363 + }, + { + "epoch": 5.028060187067914, + "grad_norm": 8.466954210275881, + "learning_rate": 1.4035161463193642e-05, + "loss": 0.4168, + "step": 12364 + }, + { + "epoch": 5.02846685644571, + 
"grad_norm": 0.02641455457473786, + "learning_rate": 1.403423440311857e-05, + "loss": 0.0005, + "step": 12365 + }, + { + "epoch": 5.0288735258235056, + "grad_norm": 5.754373987846621, + "learning_rate": 1.403330730163005e-05, + "loss": 0.1463, + "step": 12366 + }, + { + "epoch": 5.029280195201301, + "grad_norm": 2.0131640196803247, + "learning_rate": 1.4032380158737601e-05, + "loss": 0.0267, + "step": 12367 + }, + { + "epoch": 5.029686864579097, + "grad_norm": 18.774076631090733, + "learning_rate": 1.403145297445074e-05, + "loss": 1.2198, + "step": 12368 + }, + { + "epoch": 5.030093533956893, + "grad_norm": 6.631203134587674, + "learning_rate": 1.4030525748778983e-05, + "loss": 0.1557, + "step": 12369 + }, + { + "epoch": 5.030500203334689, + "grad_norm": 0.7478906240426864, + "learning_rate": 1.4029598481731852e-05, + "loss": 0.0144, + "step": 12370 + }, + { + "epoch": 5.030906872712484, + "grad_norm": 0.36176012971195026, + "learning_rate": 1.4028671173318864e-05, + "loss": 0.0078, + "step": 12371 + }, + { + "epoch": 5.031313542090281, + "grad_norm": 0.19808050770894256, + "learning_rate": 1.402774382354954e-05, + "loss": 0.0036, + "step": 12372 + }, + { + "epoch": 5.031720211468077, + "grad_norm": 1.1622996857005181, + "learning_rate": 1.40268164324334e-05, + "loss": 0.0205, + "step": 12373 + }, + { + "epoch": 5.032126880845873, + "grad_norm": 18.584851632401392, + "learning_rate": 1.402588899997996e-05, + "loss": 1.0238, + "step": 12374 + }, + { + "epoch": 5.032533550223668, + "grad_norm": 5.678971719043009, + "learning_rate": 1.4024961526198744e-05, + "loss": 0.2617, + "step": 12375 + }, + { + "epoch": 5.032940219601464, + "grad_norm": 10.380800325256232, + "learning_rate": 1.4024034011099272e-05, + "loss": 0.2349, + "step": 12376 + }, + { + "epoch": 5.03334688897926, + "grad_norm": 0.9205705700895294, + "learning_rate": 1.4023106454691065e-05, + "loss": 0.0143, + "step": 12377 + }, + { + "epoch": 5.033753558357056, + "grad_norm": 7.633257670723983, + "learning_rate": 1.4022178856983646e-05, + "loss": 0.3612, + "step": 12378 + }, + { + "epoch": 5.034160227734851, + "grad_norm": 6.979330100464787, + "learning_rate": 1.4021251217986538e-05, + "loss": 0.2983, + "step": 12379 + }, + { + "epoch": 5.034566897112647, + "grad_norm": 1.1222717606579755, + "learning_rate": 1.4020323537709261e-05, + "loss": 0.0176, + "step": 12380 + }, + { + "epoch": 5.034973566490443, + "grad_norm": 9.048599621437331, + "learning_rate": 1.401939581616134e-05, + "loss": 0.6311, + "step": 12381 + }, + { + "epoch": 5.035380235868239, + "grad_norm": 14.251676695695197, + "learning_rate": 1.4018468053352299e-05, + "loss": 1.1031, + "step": 12382 + }, + { + "epoch": 5.035786905246035, + "grad_norm": 6.558357563567087, + "learning_rate": 1.401754024929166e-05, + "loss": 0.256, + "step": 12383 + }, + { + "epoch": 5.036193574623831, + "grad_norm": 1.9297691121452, + "learning_rate": 1.4016612403988947e-05, + "loss": 0.0506, + "step": 12384 + }, + { + "epoch": 5.036600244001627, + "grad_norm": 0.21001416412507598, + "learning_rate": 1.401568451745369e-05, + "loss": 0.0039, + "step": 12385 + }, + { + "epoch": 5.037006913379423, + "grad_norm": 3.6126106111402647, + "learning_rate": 1.4014756589695406e-05, + "loss": 0.0855, + "step": 12386 + }, + { + "epoch": 5.037413582757218, + "grad_norm": 8.061498730495698, + "learning_rate": 1.4013828620723626e-05, + "loss": 0.2274, + "step": 12387 + }, + { + "epoch": 5.037820252135014, + "grad_norm": 2.4637788876607756, + "learning_rate": 1.4012900610547877e-05, + "loss": 0.056, + 
"step": 12388 + }, + { + "epoch": 5.03822692151281, + "grad_norm": 4.916040766989495, + "learning_rate": 1.4011972559177683e-05, + "loss": 0.2035, + "step": 12389 + }, + { + "epoch": 5.038633590890606, + "grad_norm": 11.219165464698623, + "learning_rate": 1.4011044466622572e-05, + "loss": 0.6394, + "step": 12390 + }, + { + "epoch": 5.0390402602684015, + "grad_norm": 6.881504120668955, + "learning_rate": 1.4010116332892068e-05, + "loss": 0.3285, + "step": 12391 + }, + { + "epoch": 5.039446929646197, + "grad_norm": 7.0943250298340725, + "learning_rate": 1.4009188157995704e-05, + "loss": 0.2173, + "step": 12392 + }, + { + "epoch": 5.039853599023994, + "grad_norm": 5.489076850791184, + "learning_rate": 1.4008259941943005e-05, + "loss": 0.2511, + "step": 12393 + }, + { + "epoch": 5.04026026840179, + "grad_norm": 10.177204134441483, + "learning_rate": 1.40073316847435e-05, + "loss": 0.246, + "step": 12394 + }, + { + "epoch": 5.040666937779585, + "grad_norm": 3.3059058578325975, + "learning_rate": 1.400640338640672e-05, + "loss": 0.0628, + "step": 12395 + }, + { + "epoch": 5.041073607157381, + "grad_norm": 7.076742874083338, + "learning_rate": 1.4005475046942191e-05, + "loss": 0.2633, + "step": 12396 + }, + { + "epoch": 5.041480276535177, + "grad_norm": 8.233654190632159, + "learning_rate": 1.4004546666359444e-05, + "loss": 0.1453, + "step": 12397 + }, + { + "epoch": 5.041886945912973, + "grad_norm": 6.375334466999832, + "learning_rate": 1.4003618244668013e-05, + "loss": 0.354, + "step": 12398 + }, + { + "epoch": 5.0422936152907685, + "grad_norm": 8.368640980597903, + "learning_rate": 1.4002689781877425e-05, + "loss": 0.2706, + "step": 12399 + }, + { + "epoch": 5.042700284668564, + "grad_norm": 14.079100873885873, + "learning_rate": 1.400176127799721e-05, + "loss": 0.5285, + "step": 12400 + }, + { + "epoch": 5.04310695404636, + "grad_norm": 5.949438594265389, + "learning_rate": 1.4000832733036904e-05, + "loss": 0.1166, + "step": 12401 + }, + { + "epoch": 5.043513623424156, + "grad_norm": 2.829977835796122, + "learning_rate": 1.3999904147006036e-05, + "loss": 0.0433, + "step": 12402 + }, + { + "epoch": 5.043920292801952, + "grad_norm": 0.47821839538379685, + "learning_rate": 1.3998975519914138e-05, + "loss": 0.0057, + "step": 12403 + }, + { + "epoch": 5.044326962179748, + "grad_norm": 11.26657057792025, + "learning_rate": 1.3998046851770743e-05, + "loss": 0.5994, + "step": 12404 + }, + { + "epoch": 5.044733631557544, + "grad_norm": 2.4127650169099506, + "learning_rate": 1.3997118142585387e-05, + "loss": 0.06, + "step": 12405 + }, + { + "epoch": 5.04514030093534, + "grad_norm": 1.324793180558502, + "learning_rate": 1.39961893923676e-05, + "loss": 0.0169, + "step": 12406 + }, + { + "epoch": 5.0455469703131355, + "grad_norm": 5.677971523457917, + "learning_rate": 1.3995260601126916e-05, + "loss": 0.276, + "step": 12407 + }, + { + "epoch": 5.045953639690931, + "grad_norm": 0.7772536429220609, + "learning_rate": 1.3994331768872873e-05, + "loss": 0.0132, + "step": 12408 + }, + { + "epoch": 5.046360309068727, + "grad_norm": 4.902307288349626, + "learning_rate": 1.3993402895615002e-05, + "loss": 0.1894, + "step": 12409 + }, + { + "epoch": 5.046766978446523, + "grad_norm": 4.911259159305011, + "learning_rate": 1.3992473981362843e-05, + "loss": 0.0984, + "step": 12410 + }, + { + "epoch": 5.0471736478243185, + "grad_norm": 5.6971889748728195, + "learning_rate": 1.399154502612593e-05, + "loss": 0.1729, + "step": 12411 + }, + { + "epoch": 5.047580317202114, + "grad_norm": 0.1333286938296992, + "learning_rate": 
1.3990616029913797e-05, + "loss": 0.0025, + "step": 12412 + }, + { + "epoch": 5.047986986579911, + "grad_norm": 6.737335793639981, + "learning_rate": 1.3989686992735983e-05, + "loss": 0.34, + "step": 12413 + }, + { + "epoch": 5.048393655957707, + "grad_norm": 4.260904146812923, + "learning_rate": 1.3988757914602022e-05, + "loss": 0.1364, + "step": 12414 + }, + { + "epoch": 5.0488003253355025, + "grad_norm": 0.644478975792022, + "learning_rate": 1.3987828795521457e-05, + "loss": 0.0123, + "step": 12415 + }, + { + "epoch": 5.049206994713298, + "grad_norm": 6.352602115819573, + "learning_rate": 1.3986899635503816e-05, + "loss": 0.1969, + "step": 12416 + }, + { + "epoch": 5.049613664091094, + "grad_norm": 0.8997192460002152, + "learning_rate": 1.3985970434558648e-05, + "loss": 0.0163, + "step": 12417 + }, + { + "epoch": 5.05002033346889, + "grad_norm": 0.09668569474584256, + "learning_rate": 1.398504119269549e-05, + "loss": 0.0013, + "step": 12418 + }, + { + "epoch": 5.0504270028466856, + "grad_norm": 0.03644180640685103, + "learning_rate": 1.3984111909923872e-05, + "loss": 0.0008, + "step": 12419 + }, + { + "epoch": 5.050833672224481, + "grad_norm": 4.001779210168738, + "learning_rate": 1.3983182586253345e-05, + "loss": 0.1289, + "step": 12420 + }, + { + "epoch": 5.051240341602277, + "grad_norm": 0.4505126998969509, + "learning_rate": 1.3982253221693443e-05, + "loss": 0.0092, + "step": 12421 + }, + { + "epoch": 5.051647010980073, + "grad_norm": 0.7227292148713151, + "learning_rate": 1.3981323816253707e-05, + "loss": 0.0158, + "step": 12422 + }, + { + "epoch": 5.052053680357869, + "grad_norm": 1.5992404595421092, + "learning_rate": 1.3980394369943677e-05, + "loss": 0.0332, + "step": 12423 + }, + { + "epoch": 5.052460349735665, + "grad_norm": 4.77791718449485, + "learning_rate": 1.39794648827729e-05, + "loss": 0.1939, + "step": 12424 + }, + { + "epoch": 5.052867019113461, + "grad_norm": 0.5392946198409972, + "learning_rate": 1.3978535354750911e-05, + "loss": 0.0107, + "step": 12425 + }, + { + "epoch": 5.053273688491257, + "grad_norm": 1.266788208235598, + "learning_rate": 1.397760578588725e-05, + "loss": 0.0198, + "step": 12426 + }, + { + "epoch": 5.053680357869053, + "grad_norm": 3.257344302714829, + "learning_rate": 1.3976676176191468e-05, + "loss": 0.1348, + "step": 12427 + }, + { + "epoch": 5.054087027246848, + "grad_norm": 3.0015658609367963, + "learning_rate": 1.3975746525673102e-05, + "loss": 0.1083, + "step": 12428 + }, + { + "epoch": 5.054493696624644, + "grad_norm": 7.074615725082687, + "learning_rate": 1.3974816834341698e-05, + "loss": 0.2327, + "step": 12429 + }, + { + "epoch": 5.05490036600244, + "grad_norm": 8.436064717314464, + "learning_rate": 1.3973887102206798e-05, + "loss": 0.2207, + "step": 12430 + }, + { + "epoch": 5.055307035380236, + "grad_norm": 14.49122014843311, + "learning_rate": 1.3972957329277947e-05, + "loss": 0.8592, + "step": 12431 + }, + { + "epoch": 5.055713704758031, + "grad_norm": 1.990925796292441, + "learning_rate": 1.3972027515564687e-05, + "loss": 0.0294, + "step": 12432 + }, + { + "epoch": 5.056120374135827, + "grad_norm": 8.182111378560077, + "learning_rate": 1.3971097661076572e-05, + "loss": 0.217, + "step": 12433 + }, + { + "epoch": 5.056527043513624, + "grad_norm": 3.817236737247198, + "learning_rate": 1.3970167765823137e-05, + "loss": 0.0714, + "step": 12434 + }, + { + "epoch": 5.05693371289142, + "grad_norm": 0.6370824353584097, + "learning_rate": 1.3969237829813929e-05, + "loss": 0.0101, + "step": 12435 + }, + { + "epoch": 5.057340382269215, + 
"grad_norm": 2.3521304881750034, + "learning_rate": 1.3968307853058503e-05, + "loss": 0.0459, + "step": 12436 + }, + { + "epoch": 5.057747051647011, + "grad_norm": 1.8123404213212644, + "learning_rate": 1.3967377835566396e-05, + "loss": 0.0303, + "step": 12437 + }, + { + "epoch": 5.058153721024807, + "grad_norm": 4.040970427717613, + "learning_rate": 1.3966447777347158e-05, + "loss": 0.0904, + "step": 12438 + }, + { + "epoch": 5.058560390402603, + "grad_norm": 6.6371617914466885, + "learning_rate": 1.3965517678410338e-05, + "loss": 0.2256, + "step": 12439 + }, + { + "epoch": 5.058967059780398, + "grad_norm": 0.05954172891664403, + "learning_rate": 1.3964587538765484e-05, + "loss": 0.001, + "step": 12440 + }, + { + "epoch": 5.059373729158194, + "grad_norm": 3.1560349509922077, + "learning_rate": 1.3963657358422145e-05, + "loss": 0.0473, + "step": 12441 + }, + { + "epoch": 5.05978039853599, + "grad_norm": 0.4103384419509712, + "learning_rate": 1.3962727137389864e-05, + "loss": 0.0056, + "step": 12442 + }, + { + "epoch": 5.060187067913786, + "grad_norm": 4.620451065814656, + "learning_rate": 1.3961796875678197e-05, + "loss": 0.1135, + "step": 12443 + }, + { + "epoch": 5.060593737291582, + "grad_norm": 9.533351441792497, + "learning_rate": 1.3960866573296694e-05, + "loss": 0.2687, + "step": 12444 + }, + { + "epoch": 5.061000406669378, + "grad_norm": 0.25520586045732874, + "learning_rate": 1.3959936230254899e-05, + "loss": 0.0041, + "step": 12445 + }, + { + "epoch": 5.061407076047174, + "grad_norm": 6.484513338032355, + "learning_rate": 1.3959005846562366e-05, + "loss": 0.2396, + "step": 12446 + }, + { + "epoch": 5.06181374542497, + "grad_norm": 0.8292615290587858, + "learning_rate": 1.3958075422228646e-05, + "loss": 0.0148, + "step": 12447 + }, + { + "epoch": 5.062220414802765, + "grad_norm": 4.728622709563653, + "learning_rate": 1.3957144957263288e-05, + "loss": 0.1239, + "step": 12448 + }, + { + "epoch": 5.062627084180561, + "grad_norm": 7.490152325236744, + "learning_rate": 1.3956214451675847e-05, + "loss": 0.1549, + "step": 12449 + }, + { + "epoch": 5.063033753558357, + "grad_norm": 2.6265746858656716, + "learning_rate": 1.3955283905475875e-05, + "loss": 0.0576, + "step": 12450 + }, + { + "epoch": 5.063440422936153, + "grad_norm": 9.50891982575615, + "learning_rate": 1.395435331867292e-05, + "loss": 0.4069, + "step": 12451 + }, + { + "epoch": 5.0638470923139485, + "grad_norm": 2.2076975093192575, + "learning_rate": 1.3953422691276539e-05, + "loss": 0.0396, + "step": 12452 + }, + { + "epoch": 5.064253761691744, + "grad_norm": 1.3939559384919529, + "learning_rate": 1.3952492023296286e-05, + "loss": 0.0219, + "step": 12453 + }, + { + "epoch": 5.064660431069541, + "grad_norm": 3.2721428184340806, + "learning_rate": 1.395156131474171e-05, + "loss": 0.0535, + "step": 12454 + }, + { + "epoch": 5.065067100447337, + "grad_norm": 3.033815658744528, + "learning_rate": 1.3950630565622372e-05, + "loss": 0.0573, + "step": 12455 + }, + { + "epoch": 5.065473769825132, + "grad_norm": 5.5141997748447835, + "learning_rate": 1.3949699775947823e-05, + "loss": 0.0979, + "step": 12456 + }, + { + "epoch": 5.065880439202928, + "grad_norm": 2.2983438389482354, + "learning_rate": 1.3948768945727615e-05, + "loss": 0.0911, + "step": 12457 + }, + { + "epoch": 5.066287108580724, + "grad_norm": 0.07776119485153148, + "learning_rate": 1.3947838074971306e-05, + "loss": 0.0013, + "step": 12458 + }, + { + "epoch": 5.06669377795852, + "grad_norm": 1.478124775219668, + "learning_rate": 1.3946907163688454e-05, + "loss": 0.0207, 
+ "step": 12459 + }, + { + "epoch": 5.0671004473363155, + "grad_norm": 7.42424842034384, + "learning_rate": 1.3945976211888615e-05, + "loss": 0.2578, + "step": 12460 + }, + { + "epoch": 5.067507116714111, + "grad_norm": 12.704019012699066, + "learning_rate": 1.3945045219581341e-05, + "loss": 0.3988, + "step": 12461 + }, + { + "epoch": 5.067913786091907, + "grad_norm": 3.9365930587469333, + "learning_rate": 1.3944114186776194e-05, + "loss": 0.057, + "step": 12462 + }, + { + "epoch": 5.068320455469703, + "grad_norm": 2.9677265559877832, + "learning_rate": 1.394318311348273e-05, + "loss": 0.0556, + "step": 12463 + }, + { + "epoch": 5.0687271248474985, + "grad_norm": 0.9134565437656696, + "learning_rate": 1.3942251999710505e-05, + "loss": 0.0076, + "step": 12464 + }, + { + "epoch": 5.069133794225295, + "grad_norm": 0.3216330883948109, + "learning_rate": 1.3941320845469082e-05, + "loss": 0.0036, + "step": 12465 + }, + { + "epoch": 5.069540463603091, + "grad_norm": 2.2657840555774245, + "learning_rate": 1.3940389650768013e-05, + "loss": 0.0295, + "step": 12466 + }, + { + "epoch": 5.069947132980887, + "grad_norm": 6.686234589851147, + "learning_rate": 1.3939458415616864e-05, + "loss": 0.4268, + "step": 12467 + }, + { + "epoch": 5.0703538023586825, + "grad_norm": 1.9700924334852838, + "learning_rate": 1.3938527140025191e-05, + "loss": 0.0391, + "step": 12468 + }, + { + "epoch": 5.070760471736478, + "grad_norm": 5.204135463328225, + "learning_rate": 1.3937595824002555e-05, + "loss": 0.0997, + "step": 12469 + }, + { + "epoch": 5.071167141114274, + "grad_norm": 0.13326918984281014, + "learning_rate": 1.3936664467558516e-05, + "loss": 0.0023, + "step": 12470 + }, + { + "epoch": 5.07157381049207, + "grad_norm": 5.256147917085488, + "learning_rate": 1.3935733070702634e-05, + "loss": 0.2472, + "step": 12471 + }, + { + "epoch": 5.0719804798698656, + "grad_norm": 0.33522122546134697, + "learning_rate": 1.3934801633444472e-05, + "loss": 0.0053, + "step": 12472 + }, + { + "epoch": 5.072387149247661, + "grad_norm": 2.7412117671659524, + "learning_rate": 1.3933870155793594e-05, + "loss": 0.036, + "step": 12473 + }, + { + "epoch": 5.072793818625457, + "grad_norm": 0.16886938484521763, + "learning_rate": 1.3932938637759555e-05, + "loss": 0.0029, + "step": 12474 + }, + { + "epoch": 5.073200488003254, + "grad_norm": 8.66196428229041, + "learning_rate": 1.393200707935192e-05, + "loss": 0.3914, + "step": 12475 + }, + { + "epoch": 5.0736071573810495, + "grad_norm": 17.58184392287293, + "learning_rate": 1.3931075480580258e-05, + "loss": 0.1895, + "step": 12476 + }, + { + "epoch": 5.074013826758845, + "grad_norm": 8.029816440186142, + "learning_rate": 1.3930143841454124e-05, + "loss": 0.2346, + "step": 12477 + }, + { + "epoch": 5.074420496136641, + "grad_norm": 6.520221980744322, + "learning_rate": 1.3929212161983088e-05, + "loss": 0.1914, + "step": 12478 + }, + { + "epoch": 5.074827165514437, + "grad_norm": 0.315004617507825, + "learning_rate": 1.3928280442176712e-05, + "loss": 0.0045, + "step": 12479 + }, + { + "epoch": 5.075233834892233, + "grad_norm": 0.36334747118391186, + "learning_rate": 1.3927348682044557e-05, + "loss": 0.0043, + "step": 12480 + }, + { + "epoch": 5.075640504270028, + "grad_norm": 0.9203565506034733, + "learning_rate": 1.3926416881596194e-05, + "loss": 0.0162, + "step": 12481 + }, + { + "epoch": 5.076047173647824, + "grad_norm": 0.31504077908830697, + "learning_rate": 1.3925485040841184e-05, + "loss": 0.0048, + "step": 12482 + }, + { + "epoch": 5.07645384302562, + "grad_norm": 
0.5249690791212579, + "learning_rate": 1.3924553159789096e-05, + "loss": 0.0077, + "step": 12483 + }, + { + "epoch": 5.076860512403416, + "grad_norm": 0.31846407949755784, + "learning_rate": 1.3923621238449493e-05, + "loss": 0.002, + "step": 12484 + }, + { + "epoch": 5.077267181781212, + "grad_norm": 15.186544675274352, + "learning_rate": 1.3922689276831944e-05, + "loss": 0.6553, + "step": 12485 + }, + { + "epoch": 5.077673851159008, + "grad_norm": 3.4866451861890555, + "learning_rate": 1.3921757274946016e-05, + "loss": 0.0859, + "step": 12486 + }, + { + "epoch": 5.078080520536804, + "grad_norm": 0.030043125740900536, + "learning_rate": 1.3920825232801274e-05, + "loss": 0.0003, + "step": 12487 + }, + { + "epoch": 5.0784871899146, + "grad_norm": 5.812156443415233, + "learning_rate": 1.391989315040729e-05, + "loss": 0.0966, + "step": 12488 + }, + { + "epoch": 5.078893859292395, + "grad_norm": 4.424677447631394, + "learning_rate": 1.3918961027773628e-05, + "loss": 0.1641, + "step": 12489 + }, + { + "epoch": 5.079300528670191, + "grad_norm": 17.904852920808043, + "learning_rate": 1.3918028864909858e-05, + "loss": 0.6365, + "step": 12490 + }, + { + "epoch": 5.079707198047987, + "grad_norm": 0.13627141150922362, + "learning_rate": 1.3917096661825551e-05, + "loss": 0.0023, + "step": 12491 + }, + { + "epoch": 5.080113867425783, + "grad_norm": 0.5545648673824016, + "learning_rate": 1.3916164418530276e-05, + "loss": 0.0073, + "step": 12492 + }, + { + "epoch": 5.080520536803578, + "grad_norm": 0.19784855952270056, + "learning_rate": 1.39152321350336e-05, + "loss": 0.0022, + "step": 12493 + }, + { + "epoch": 5.080927206181374, + "grad_norm": 3.9295795699109237, + "learning_rate": 1.39142998113451e-05, + "loss": 0.07, + "step": 12494 + }, + { + "epoch": 5.081333875559171, + "grad_norm": 11.953549399352665, + "learning_rate": 1.3913367447474339e-05, + "loss": 0.4032, + "step": 12495 + }, + { + "epoch": 5.081740544936967, + "grad_norm": 9.492058106411763, + "learning_rate": 1.3912435043430894e-05, + "loss": 0.4171, + "step": 12496 + }, + { + "epoch": 5.082147214314762, + "grad_norm": 9.31604056740283, + "learning_rate": 1.3911502599224332e-05, + "loss": 0.2701, + "step": 12497 + }, + { + "epoch": 5.082553883692558, + "grad_norm": 2.1707348061530976, + "learning_rate": 1.3910570114864229e-05, + "loss": 0.0311, + "step": 12498 + }, + { + "epoch": 5.082960553070354, + "grad_norm": 3.2129431928954717, + "learning_rate": 1.3909637590360152e-05, + "loss": 0.0274, + "step": 12499 + }, + { + "epoch": 5.08336722244815, + "grad_norm": 10.528475482244097, + "learning_rate": 1.390870502572168e-05, + "loss": 0.2814, + "step": 12500 + }, + { + "epoch": 5.083773891825945, + "grad_norm": 5.145518992210597, + "learning_rate": 1.3907772420958386e-05, + "loss": 0.0971, + "step": 12501 + }, + { + "epoch": 5.084180561203741, + "grad_norm": 3.657508044430691, + "learning_rate": 1.3906839776079839e-05, + "loss": 0.0828, + "step": 12502 + }, + { + "epoch": 5.084587230581537, + "grad_norm": 4.9856721183734365, + "learning_rate": 1.3905907091095614e-05, + "loss": 0.1153, + "step": 12503 + }, + { + "epoch": 5.084993899959333, + "grad_norm": 1.3241392742498512, + "learning_rate": 1.3904974366015288e-05, + "loss": 0.0213, + "step": 12504 + }, + { + "epoch": 5.0854005693371285, + "grad_norm": 9.855568078383836, + "learning_rate": 1.3904041600848437e-05, + "loss": 0.3731, + "step": 12505 + }, + { + "epoch": 5.085807238714925, + "grad_norm": 21.200948751111, + "learning_rate": 1.3903108795604631e-05, + "loss": 0.3687, + "step": 12506 + 
}, + { + "epoch": 5.086213908092721, + "grad_norm": 0.0888114835516485, + "learning_rate": 1.390217595029345e-05, + "loss": 0.0011, + "step": 12507 + }, + { + "epoch": 5.086620577470517, + "grad_norm": 11.200466494977961, + "learning_rate": 1.390124306492447e-05, + "loss": 0.4691, + "step": 12508 + }, + { + "epoch": 5.087027246848312, + "grad_norm": 9.560980675702195, + "learning_rate": 1.3900310139507265e-05, + "loss": 0.4054, + "step": 12509 + }, + { + "epoch": 5.087433916226108, + "grad_norm": 1.2613718730851684, + "learning_rate": 1.3899377174051413e-05, + "loss": 0.0238, + "step": 12510 + }, + { + "epoch": 5.087840585603904, + "grad_norm": 0.7826552112515907, + "learning_rate": 1.3898444168566495e-05, + "loss": 0.0106, + "step": 12511 + }, + { + "epoch": 5.0882472549817, + "grad_norm": 4.172254614505886, + "learning_rate": 1.389751112306208e-05, + "loss": 0.1202, + "step": 12512 + }, + { + "epoch": 5.0886539243594955, + "grad_norm": 7.608873399978533, + "learning_rate": 1.3896578037547756e-05, + "loss": 0.0476, + "step": 12513 + }, + { + "epoch": 5.089060593737291, + "grad_norm": 5.444059452682892, + "learning_rate": 1.3895644912033097e-05, + "loss": 0.2351, + "step": 12514 + }, + { + "epoch": 5.089467263115087, + "grad_norm": 2.3425596805230113, + "learning_rate": 1.3894711746527683e-05, + "loss": 0.0596, + "step": 12515 + }, + { + "epoch": 5.089873932492884, + "grad_norm": 4.425972183889759, + "learning_rate": 1.389377854104109e-05, + "loss": 0.068, + "step": 12516 + }, + { + "epoch": 5.090280601870679, + "grad_norm": 6.807430753315197, + "learning_rate": 1.3892845295582903e-05, + "loss": 0.1538, + "step": 12517 + }, + { + "epoch": 5.090687271248475, + "grad_norm": 1.7692960564808358, + "learning_rate": 1.3891912010162697e-05, + "loss": 0.0247, + "step": 12518 + }, + { + "epoch": 5.091093940626271, + "grad_norm": 11.316963479837291, + "learning_rate": 1.3890978684790057e-05, + "loss": 0.4128, + "step": 12519 + }, + { + "epoch": 5.091500610004067, + "grad_norm": 2.1388198236209806, + "learning_rate": 1.3890045319474564e-05, + "loss": 0.0452, + "step": 12520 + }, + { + "epoch": 5.0919072793818625, + "grad_norm": 0.17028761432310652, + "learning_rate": 1.3889111914225796e-05, + "loss": 0.0022, + "step": 12521 + }, + { + "epoch": 5.092313948759658, + "grad_norm": 0.10725182095418338, + "learning_rate": 1.3888178469053338e-05, + "loss": 0.0022, + "step": 12522 + }, + { + "epoch": 5.092720618137454, + "grad_norm": 2.4850121695668936, + "learning_rate": 1.388724498396677e-05, + "loss": 0.0574, + "step": 12523 + }, + { + "epoch": 5.09312728751525, + "grad_norm": 3.8369070383387713, + "learning_rate": 1.3886311458975678e-05, + "loss": 0.0823, + "step": 12524 + }, + { + "epoch": 5.0935339568930456, + "grad_norm": 1.742343314007176, + "learning_rate": 1.388537789408964e-05, + "loss": 0.012, + "step": 12525 + }, + { + "epoch": 5.093940626270842, + "grad_norm": 0.2022825392138274, + "learning_rate": 1.3884444289318244e-05, + "loss": 0.0024, + "step": 12526 + }, + { + "epoch": 5.094347295648638, + "grad_norm": 1.2209652686396033, + "learning_rate": 1.3883510644671075e-05, + "loss": 0.0197, + "step": 12527 + }, + { + "epoch": 5.094753965026434, + "grad_norm": 0.3917742581564213, + "learning_rate": 1.3882576960157712e-05, + "loss": 0.0052, + "step": 12528 + }, + { + "epoch": 5.0951606344042295, + "grad_norm": 2.2567899984353916, + "learning_rate": 1.388164323578774e-05, + "loss": 0.0392, + "step": 12529 + }, + { + "epoch": 5.095567303782025, + "grad_norm": 5.430332250948129, + "learning_rate": 
1.388070947157075e-05, + "loss": 0.1213, + "step": 12530 + }, + { + "epoch": 5.095973973159821, + "grad_norm": 4.344122572817107, + "learning_rate": 1.3879775667516324e-05, + "loss": 0.1208, + "step": 12531 + }, + { + "epoch": 5.096380642537617, + "grad_norm": 5.919906062069171, + "learning_rate": 1.3878841823634045e-05, + "loss": 0.1999, + "step": 12532 + }, + { + "epoch": 5.096787311915413, + "grad_norm": 9.119262297425857, + "learning_rate": 1.3877907939933506e-05, + "loss": 0.4375, + "step": 12533 + }, + { + "epoch": 5.097193981293208, + "grad_norm": 1.0877363146453174, + "learning_rate": 1.3876974016424289e-05, + "loss": 0.0156, + "step": 12534 + }, + { + "epoch": 5.097600650671004, + "grad_norm": 8.8174178939985, + "learning_rate": 1.3876040053115983e-05, + "loss": 0.2535, + "step": 12535 + }, + { + "epoch": 5.098007320048801, + "grad_norm": 13.945387336786604, + "learning_rate": 1.3875106050018175e-05, + "loss": 0.8272, + "step": 12536 + }, + { + "epoch": 5.0984139894265965, + "grad_norm": 0.10678119663367369, + "learning_rate": 1.3874172007140452e-05, + "loss": 0.0015, + "step": 12537 + }, + { + "epoch": 5.098820658804392, + "grad_norm": 4.64068712252521, + "learning_rate": 1.3873237924492403e-05, + "loss": 0.0935, + "step": 12538 + }, + { + "epoch": 5.099227328182188, + "grad_norm": 1.4736153023564587, + "learning_rate": 1.387230380208362e-05, + "loss": 0.0265, + "step": 12539 + }, + { + "epoch": 5.099633997559984, + "grad_norm": 5.583964167008288, + "learning_rate": 1.3871369639923689e-05, + "loss": 0.1502, + "step": 12540 + }, + { + "epoch": 5.10004066693778, + "grad_norm": 5.502371138563159, + "learning_rate": 1.38704354380222e-05, + "loss": 0.1499, + "step": 12541 + }, + { + "epoch": 5.100447336315575, + "grad_norm": 0.09114611909868457, + "learning_rate": 1.3869501196388742e-05, + "loss": 0.0014, + "step": 12542 + }, + { + "epoch": 5.100854005693371, + "grad_norm": 0.14547775982360747, + "learning_rate": 1.3868566915032906e-05, + "loss": 0.0021, + "step": 12543 + }, + { + "epoch": 5.101260675071167, + "grad_norm": 8.355699196937325, + "learning_rate": 1.3867632593964287e-05, + "loss": 0.1641, + "step": 12544 + }, + { + "epoch": 5.101667344448963, + "grad_norm": 8.136380942891746, + "learning_rate": 1.386669823319247e-05, + "loss": 0.5483, + "step": 12545 + }, + { + "epoch": 5.102074013826758, + "grad_norm": 10.999751231444773, + "learning_rate": 1.3865763832727051e-05, + "loss": 0.4317, + "step": 12546 + }, + { + "epoch": 5.102480683204555, + "grad_norm": 11.268250229948578, + "learning_rate": 1.3864829392577619e-05, + "loss": 0.4425, + "step": 12547 + }, + { + "epoch": 5.102887352582351, + "grad_norm": 1.5714415907917667, + "learning_rate": 1.3863894912753768e-05, + "loss": 0.0331, + "step": 12548 + }, + { + "epoch": 5.103294021960147, + "grad_norm": 4.353066265683153, + "learning_rate": 1.3862960393265095e-05, + "loss": 0.0632, + "step": 12549 + }, + { + "epoch": 5.103700691337942, + "grad_norm": 0.022509683491643617, + "learning_rate": 1.3862025834121187e-05, + "loss": 0.0003, + "step": 12550 + }, + { + "epoch": 5.104107360715738, + "grad_norm": 3.3350439553431586, + "learning_rate": 1.386109123533164e-05, + "loss": 0.0406, + "step": 12551 + }, + { + "epoch": 5.104514030093534, + "grad_norm": 6.4494371624409705, + "learning_rate": 1.3860156596906049e-05, + "loss": 0.1619, + "step": 12552 + }, + { + "epoch": 5.10492069947133, + "grad_norm": 1.5331812368412925, + "learning_rate": 1.3859221918854006e-05, + "loss": 0.0334, + "step": 12553 + }, + { + "epoch": 5.105327368849125, 
+ "grad_norm": 2.9120474387467095, + "learning_rate": 1.3858287201185108e-05, + "loss": 0.1513, + "step": 12554 + }, + { + "epoch": 5.105734038226921, + "grad_norm": 7.557672930476282, + "learning_rate": 1.3857352443908952e-05, + "loss": 0.0857, + "step": 12555 + }, + { + "epoch": 5.106140707604717, + "grad_norm": 9.564517829098051, + "learning_rate": 1.385641764703513e-05, + "loss": 0.3541, + "step": 12556 + }, + { + "epoch": 5.106547376982514, + "grad_norm": 5.220331072438875, + "learning_rate": 1.385548281057324e-05, + "loss": 0.1842, + "step": 12557 + }, + { + "epoch": 5.106954046360309, + "grad_norm": 3.783935270230063, + "learning_rate": 1.3854547934532878e-05, + "loss": 0.0542, + "step": 12558 + }, + { + "epoch": 5.107360715738105, + "grad_norm": 5.051827148230473, + "learning_rate": 1.3853613018923644e-05, + "loss": 0.2347, + "step": 12559 + }, + { + "epoch": 5.107767385115901, + "grad_norm": 2.8078534932413226, + "learning_rate": 1.3852678063755132e-05, + "loss": 0.0429, + "step": 12560 + }, + { + "epoch": 5.108174054493697, + "grad_norm": 2.4863554186738805, + "learning_rate": 1.3851743069036938e-05, + "loss": 0.0517, + "step": 12561 + }, + { + "epoch": 5.108580723871492, + "grad_norm": 11.252012451629957, + "learning_rate": 1.3850808034778664e-05, + "loss": 0.2634, + "step": 12562 + }, + { + "epoch": 5.108987393249288, + "grad_norm": 3.635272420274173, + "learning_rate": 1.384987296098991e-05, + "loss": 0.1208, + "step": 12563 + }, + { + "epoch": 5.109394062627084, + "grad_norm": 2.0660224597625345, + "learning_rate": 1.384893784768027e-05, + "loss": 0.0286, + "step": 12564 + }, + { + "epoch": 5.10980073200488, + "grad_norm": 6.758420115835249, + "learning_rate": 1.3848002694859349e-05, + "loss": 0.2284, + "step": 12565 + }, + { + "epoch": 5.1102074013826755, + "grad_norm": 1.3983810465813673, + "learning_rate": 1.384706750253674e-05, + "loss": 0.0176, + "step": 12566 + }, + { + "epoch": 5.110614070760472, + "grad_norm": 3.9913432333948506, + "learning_rate": 1.3846132270722045e-05, + "loss": 0.061, + "step": 12567 + }, + { + "epoch": 5.111020740138268, + "grad_norm": 5.614357064086944, + "learning_rate": 1.3845196999424872e-05, + "loss": 0.0636, + "step": 12568 + }, + { + "epoch": 5.111427409516064, + "grad_norm": 1.4291580073934496, + "learning_rate": 1.3844261688654814e-05, + "loss": 0.0174, + "step": 12569 + }, + { + "epoch": 5.111834078893859, + "grad_norm": 4.245216881823328, + "learning_rate": 1.3843326338421475e-05, + "loss": 0.1323, + "step": 12570 + }, + { + "epoch": 5.112240748271655, + "grad_norm": 0.037729808243091946, + "learning_rate": 1.3842390948734456e-05, + "loss": 0.0005, + "step": 12571 + }, + { + "epoch": 5.112647417649451, + "grad_norm": 2.1303042179829053, + "learning_rate": 1.3841455519603364e-05, + "loss": 0.0417, + "step": 12572 + }, + { + "epoch": 5.113054087027247, + "grad_norm": 2.3351717125462703, + "learning_rate": 1.3840520051037793e-05, + "loss": 0.0381, + "step": 12573 + }, + { + "epoch": 5.1134607564050425, + "grad_norm": 11.967859984123471, + "learning_rate": 1.3839584543047352e-05, + "loss": 0.406, + "step": 12574 + }, + { + "epoch": 5.113867425782838, + "grad_norm": 4.020779492077601, + "learning_rate": 1.3838648995641645e-05, + "loss": 0.1094, + "step": 12575 + }, + { + "epoch": 5.114274095160634, + "grad_norm": 2.9941291962832857, + "learning_rate": 1.3837713408830273e-05, + "loss": 0.0743, + "step": 12576 + }, + { + "epoch": 5.114680764538431, + "grad_norm": 6.382302735468865, + "learning_rate": 1.383677778262284e-05, + "loss": 0.0636, + 
"step": 12577 + }, + { + "epoch": 5.1150874339162264, + "grad_norm": 1.2792383013576514, + "learning_rate": 1.3835842117028954e-05, + "loss": 0.0163, + "step": 12578 + }, + { + "epoch": 5.115494103294022, + "grad_norm": 3.0365441013229812, + "learning_rate": 1.3834906412058219e-05, + "loss": 0.0939, + "step": 12579 + }, + { + "epoch": 5.115900772671818, + "grad_norm": 0.08051046735975557, + "learning_rate": 1.3833970667720236e-05, + "loss": 0.0014, + "step": 12580 + }, + { + "epoch": 5.116307442049614, + "grad_norm": 8.192654421042098, + "learning_rate": 1.3833034884024617e-05, + "loss": 0.1885, + "step": 12581 + }, + { + "epoch": 5.1167141114274095, + "grad_norm": 5.77093178367657, + "learning_rate": 1.3832099060980966e-05, + "loss": 0.1926, + "step": 12582 + }, + { + "epoch": 5.117120780805205, + "grad_norm": 8.69267173777517, + "learning_rate": 1.3831163198598887e-05, + "loss": 0.2901, + "step": 12583 + }, + { + "epoch": 5.117527450183001, + "grad_norm": 5.489743200153938, + "learning_rate": 1.3830227296887993e-05, + "loss": 0.1247, + "step": 12584 + }, + { + "epoch": 5.117934119560797, + "grad_norm": 3.3689534446819795, + "learning_rate": 1.3829291355857888e-05, + "loss": 0.1195, + "step": 12585 + }, + { + "epoch": 5.118340788938593, + "grad_norm": 5.494095981130382, + "learning_rate": 1.382835537551818e-05, + "loss": 0.0885, + "step": 12586 + }, + { + "epoch": 5.118747458316388, + "grad_norm": 20.181063355179084, + "learning_rate": 1.3827419355878473e-05, + "loss": 0.5319, + "step": 12587 + }, + { + "epoch": 5.119154127694185, + "grad_norm": 0.4880927508251928, + "learning_rate": 1.3826483296948383e-05, + "loss": 0.0073, + "step": 12588 + }, + { + "epoch": 5.119560797071981, + "grad_norm": 6.195258653997232, + "learning_rate": 1.3825547198737516e-05, + "loss": 0.2169, + "step": 12589 + }, + { + "epoch": 5.1199674664497765, + "grad_norm": 0.28598345218490256, + "learning_rate": 1.3824611061255481e-05, + "loss": 0.004, + "step": 12590 + }, + { + "epoch": 5.120374135827572, + "grad_norm": 0.05186226169609431, + "learning_rate": 1.3823674884511889e-05, + "loss": 0.0008, + "step": 12591 + }, + { + "epoch": 5.120780805205368, + "grad_norm": 10.745021312424225, + "learning_rate": 1.382273866851635e-05, + "loss": 0.3546, + "step": 12592 + }, + { + "epoch": 5.121187474583164, + "grad_norm": 0.8732490021526186, + "learning_rate": 1.3821802413278475e-05, + "loss": 0.0118, + "step": 12593 + }, + { + "epoch": 5.12159414396096, + "grad_norm": 0.8462638378235723, + "learning_rate": 1.3820866118807874e-05, + "loss": 0.0089, + "step": 12594 + }, + { + "epoch": 5.122000813338755, + "grad_norm": 4.660343384442796, + "learning_rate": 1.3819929785114162e-05, + "loss": 0.1473, + "step": 12595 + }, + { + "epoch": 5.122407482716551, + "grad_norm": 2.5274170177633235, + "learning_rate": 1.3818993412206942e-05, + "loss": 0.06, + "step": 12596 + }, + { + "epoch": 5.122814152094347, + "grad_norm": 6.869637639373465, + "learning_rate": 1.381805700009584e-05, + "loss": 0.2112, + "step": 12597 + }, + { + "epoch": 5.1232208214721435, + "grad_norm": 6.3340106154494, + "learning_rate": 1.3817120548790458e-05, + "loss": 0.1177, + "step": 12598 + }, + { + "epoch": 5.123627490849939, + "grad_norm": 1.9261539034714847, + "learning_rate": 1.3816184058300413e-05, + "loss": 0.0335, + "step": 12599 + }, + { + "epoch": 5.124034160227735, + "grad_norm": 0.41379345450388233, + "learning_rate": 1.3815247528635314e-05, + "loss": 0.0038, + "step": 12600 + }, + { + "epoch": 5.124440829605531, + "grad_norm": 5.431469649353945, + 
"learning_rate": 1.3814310959804786e-05, + "loss": 0.0909, + "step": 12601 + }, + { + "epoch": 5.124847498983327, + "grad_norm": 0.07022966640899048, + "learning_rate": 1.381337435181843e-05, + "loss": 0.0013, + "step": 12602 + }, + { + "epoch": 5.125254168361122, + "grad_norm": 8.348798401471386, + "learning_rate": 1.3812437704685872e-05, + "loss": 0.2139, + "step": 12603 + }, + { + "epoch": 5.125660837738918, + "grad_norm": 2.3692949485362176, + "learning_rate": 1.381150101841672e-05, + "loss": 0.0231, + "step": 12604 + }, + { + "epoch": 5.126067507116714, + "grad_norm": 9.706542536744497, + "learning_rate": 1.3810564293020592e-05, + "loss": 0.2055, + "step": 12605 + }, + { + "epoch": 5.12647417649451, + "grad_norm": 4.955118822652904, + "learning_rate": 1.3809627528507101e-05, + "loss": 0.2072, + "step": 12606 + }, + { + "epoch": 5.126880845872305, + "grad_norm": 4.5748650041792835, + "learning_rate": 1.380869072488587e-05, + "loss": 0.26, + "step": 12607 + }, + { + "epoch": 5.127287515250102, + "grad_norm": 6.2502066141464825, + "learning_rate": 1.380775388216651e-05, + "loss": 0.138, + "step": 12608 + }, + { + "epoch": 5.127694184627898, + "grad_norm": 0.00917022674737957, + "learning_rate": 1.3806817000358638e-05, + "loss": 0.0001, + "step": 12609 + }, + { + "epoch": 5.128100854005694, + "grad_norm": 3.346152473488076, + "learning_rate": 1.3805880079471878e-05, + "loss": 0.0629, + "step": 12610 + }, + { + "epoch": 5.128507523383489, + "grad_norm": 10.156223147796933, + "learning_rate": 1.380494311951584e-05, + "loss": 0.4332, + "step": 12611 + }, + { + "epoch": 5.128914192761285, + "grad_norm": 7.344498199029008, + "learning_rate": 1.3804006120500145e-05, + "loss": 0.0765, + "step": 12612 + }, + { + "epoch": 5.129320862139081, + "grad_norm": 0.5333649902795474, + "learning_rate": 1.3803069082434417e-05, + "loss": 0.0091, + "step": 12613 + }, + { + "epoch": 5.129727531516877, + "grad_norm": 9.351311463751673, + "learning_rate": 1.380213200532827e-05, + "loss": 0.4667, + "step": 12614 + }, + { + "epoch": 5.130134200894672, + "grad_norm": 11.543951622028851, + "learning_rate": 1.380119488919132e-05, + "loss": 0.3931, + "step": 12615 + }, + { + "epoch": 5.130540870272468, + "grad_norm": 0.5639330683960257, + "learning_rate": 1.3800257734033195e-05, + "loss": 0.0082, + "step": 12616 + }, + { + "epoch": 5.130947539650264, + "grad_norm": 0.04088358411003731, + "learning_rate": 1.3799320539863511e-05, + "loss": 0.0008, + "step": 12617 + }, + { + "epoch": 5.131354209028061, + "grad_norm": 11.872337697033482, + "learning_rate": 1.3798383306691894e-05, + "loss": 0.5376, + "step": 12618 + }, + { + "epoch": 5.131760878405856, + "grad_norm": 1.2407410023348864, + "learning_rate": 1.3797446034527955e-05, + "loss": 0.0157, + "step": 12619 + }, + { + "epoch": 5.132167547783652, + "grad_norm": 0.11707927611450698, + "learning_rate": 1.3796508723381322e-05, + "loss": 0.0014, + "step": 12620 + }, + { + "epoch": 5.132574217161448, + "grad_norm": 5.6429430548816875, + "learning_rate": 1.3795571373261619e-05, + "loss": 0.0918, + "step": 12621 + }, + { + "epoch": 5.132980886539244, + "grad_norm": 4.042758227334734, + "learning_rate": 1.3794633984178461e-05, + "loss": 0.0735, + "step": 12622 + }, + { + "epoch": 5.133387555917039, + "grad_norm": 11.98814886240048, + "learning_rate": 1.3793696556141476e-05, + "loss": 0.4601, + "step": 12623 + }, + { + "epoch": 5.133794225294835, + "grad_norm": 1.4783821847292753, + "learning_rate": 1.379275908916029e-05, + "loss": 0.0258, + "step": 12624 + }, + { + "epoch": 
5.134200894672631, + "grad_norm": 7.360871635996229, + "learning_rate": 1.379182158324452e-05, + "loss": 0.1388, + "step": 12625 + }, + { + "epoch": 5.134607564050427, + "grad_norm": 7.197779268339515, + "learning_rate": 1.3790884038403796e-05, + "loss": 0.2283, + "step": 12626 + }, + { + "epoch": 5.1350142334282225, + "grad_norm": 2.0676509956225932, + "learning_rate": 1.3789946454647737e-05, + "loss": 0.0246, + "step": 12627 + }, + { + "epoch": 5.135420902806018, + "grad_norm": 3.618384792632307, + "learning_rate": 1.3789008831985971e-05, + "loss": 0.0568, + "step": 12628 + }, + { + "epoch": 5.135827572183815, + "grad_norm": 0.4571001659461575, + "learning_rate": 1.3788071170428121e-05, + "loss": 0.0051, + "step": 12629 + }, + { + "epoch": 5.136234241561611, + "grad_norm": 13.136517499215309, + "learning_rate": 1.3787133469983816e-05, + "loss": 0.3814, + "step": 12630 + }, + { + "epoch": 5.1366409109394064, + "grad_norm": 6.8454585065684705, + "learning_rate": 1.3786195730662679e-05, + "loss": 0.1528, + "step": 12631 + }, + { + "epoch": 5.137047580317202, + "grad_norm": 3.0826557345506864, + "learning_rate": 1.3785257952474338e-05, + "loss": 0.0445, + "step": 12632 + }, + { + "epoch": 5.137454249694998, + "grad_norm": 10.579626133048803, + "learning_rate": 1.378432013542842e-05, + "loss": 0.4108, + "step": 12633 + }, + { + "epoch": 5.137860919072794, + "grad_norm": 5.007122481926065, + "learning_rate": 1.3783382279534549e-05, + "loss": 0.1333, + "step": 12634 + }, + { + "epoch": 5.1382675884505895, + "grad_norm": 0.2626542399301091, + "learning_rate": 1.3782444384802354e-05, + "loss": 0.0043, + "step": 12635 + }, + { + "epoch": 5.138674257828385, + "grad_norm": 14.362316342981522, + "learning_rate": 1.3781506451241466e-05, + "loss": 0.3628, + "step": 12636 + }, + { + "epoch": 5.139080927206181, + "grad_norm": 5.078279677382857, + "learning_rate": 1.378056847886151e-05, + "loss": 0.0999, + "step": 12637 + }, + { + "epoch": 5.139487596583977, + "grad_norm": 0.1407013694392839, + "learning_rate": 1.3779630467672116e-05, + "loss": 0.0026, + "step": 12638 + }, + { + "epoch": 5.1398942659617735, + "grad_norm": 0.07625042247780703, + "learning_rate": 1.3778692417682912e-05, + "loss": 0.0015, + "step": 12639 + }, + { + "epoch": 5.140300935339569, + "grad_norm": 13.278976308637034, + "learning_rate": 1.3777754328903532e-05, + "loss": 0.3008, + "step": 12640 + }, + { + "epoch": 5.140707604717365, + "grad_norm": 4.462390912374524, + "learning_rate": 1.37768162013436e-05, + "loss": 0.0882, + "step": 12641 + }, + { + "epoch": 5.141114274095161, + "grad_norm": 7.210391089137329, + "learning_rate": 1.3775878035012748e-05, + "loss": 0.1433, + "step": 12642 + }, + { + "epoch": 5.1415209434729565, + "grad_norm": 6.44808518335253, + "learning_rate": 1.3774939829920611e-05, + "loss": 0.2459, + "step": 12643 + }, + { + "epoch": 5.141927612850752, + "grad_norm": 4.957572937851212, + "learning_rate": 1.3774001586076813e-05, + "loss": 0.0981, + "step": 12644 + }, + { + "epoch": 5.142334282228548, + "grad_norm": 8.143331418863411, + "learning_rate": 1.377306330349099e-05, + "loss": 0.2424, + "step": 12645 + }, + { + "epoch": 5.142740951606344, + "grad_norm": 5.683155297133758, + "learning_rate": 1.3772124982172774e-05, + "loss": 0.1827, + "step": 12646 + }, + { + "epoch": 5.14314762098414, + "grad_norm": 0.4366776243794547, + "learning_rate": 1.3771186622131798e-05, + "loss": 0.0059, + "step": 12647 + }, + { + "epoch": 5.143554290361935, + "grad_norm": 2.3997239286166008, + "learning_rate": 
1.3770248223377691e-05, + "loss": 0.0617, + "step": 12648 + }, + { + "epoch": 5.143960959739732, + "grad_norm": 2.6592624671910485, + "learning_rate": 1.376930978592009e-05, + "loss": 0.0171, + "step": 12649 + }, + { + "epoch": 5.144367629117528, + "grad_norm": 1.953374123778517, + "learning_rate": 1.3768371309768628e-05, + "loss": 0.0254, + "step": 12650 + }, + { + "epoch": 5.1447742984953235, + "grad_norm": 8.630318221249956, + "learning_rate": 1.3767432794932935e-05, + "loss": 0.2425, + "step": 12651 + }, + { + "epoch": 5.145180967873119, + "grad_norm": 3.372008855736369, + "learning_rate": 1.376649424142265e-05, + "loss": 0.0406, + "step": 12652 + }, + { + "epoch": 5.145587637250915, + "grad_norm": 8.315998656003684, + "learning_rate": 1.3765555649247406e-05, + "loss": 0.2787, + "step": 12653 + }, + { + "epoch": 5.145994306628711, + "grad_norm": 9.071004269462234, + "learning_rate": 1.3764617018416838e-05, + "loss": 0.2371, + "step": 12654 + }, + { + "epoch": 5.146400976006507, + "grad_norm": 12.119911557405663, + "learning_rate": 1.376367834894058e-05, + "loss": 0.3657, + "step": 12655 + }, + { + "epoch": 5.146807645384302, + "grad_norm": 0.09594036060813672, + "learning_rate": 1.3762739640828271e-05, + "loss": 0.0013, + "step": 12656 + }, + { + "epoch": 5.147214314762098, + "grad_norm": 0.07600476690395203, + "learning_rate": 1.3761800894089545e-05, + "loss": 0.0008, + "step": 12657 + }, + { + "epoch": 5.147620984139894, + "grad_norm": 9.108900051572393, + "learning_rate": 1.3760862108734041e-05, + "loss": 0.2315, + "step": 12658 + }, + { + "epoch": 5.1480276535176905, + "grad_norm": 17.251752995829147, + "learning_rate": 1.3759923284771394e-05, + "loss": 0.7632, + "step": 12659 + }, + { + "epoch": 5.148434322895486, + "grad_norm": 0.4853494354468677, + "learning_rate": 1.3758984422211241e-05, + "loss": 0.0073, + "step": 12660 + }, + { + "epoch": 5.148840992273282, + "grad_norm": 2.4270160628902415, + "learning_rate": 1.3758045521063221e-05, + "loss": 0.0851, + "step": 12661 + }, + { + "epoch": 5.149247661651078, + "grad_norm": 2.0574623161381553, + "learning_rate": 1.3757106581336974e-05, + "loss": 0.0415, + "step": 12662 + }, + { + "epoch": 5.149654331028874, + "grad_norm": 13.608078068143453, + "learning_rate": 1.3756167603042139e-05, + "loss": 0.7079, + "step": 12663 + }, + { + "epoch": 5.150061000406669, + "grad_norm": 3.5978170560873313, + "learning_rate": 1.3755228586188346e-05, + "loss": 0.0681, + "step": 12664 + }, + { + "epoch": 5.150467669784465, + "grad_norm": 3.726365140921053, + "learning_rate": 1.375428953078525e-05, + "loss": 0.069, + "step": 12665 + }, + { + "epoch": 5.150874339162261, + "grad_norm": 1.9545338979895497, + "learning_rate": 1.3753350436842476e-05, + "loss": 0.038, + "step": 12666 + }, + { + "epoch": 5.151281008540057, + "grad_norm": 5.2090892456237725, + "learning_rate": 1.3752411304369672e-05, + "loss": 0.0947, + "step": 12667 + }, + { + "epoch": 5.151687677917852, + "grad_norm": 11.24504381686751, + "learning_rate": 1.375147213337648e-05, + "loss": 0.2137, + "step": 12668 + }, + { + "epoch": 5.152094347295648, + "grad_norm": 2.0382740405073267, + "learning_rate": 1.375053292387254e-05, + "loss": 0.0356, + "step": 12669 + }, + { + "epoch": 5.152501016673445, + "grad_norm": 12.20083531232582, + "learning_rate": 1.3749593675867488e-05, + "loss": 0.505, + "step": 12670 + }, + { + "epoch": 5.152907686051241, + "grad_norm": 6.602234680614154, + "learning_rate": 1.374865438937097e-05, + "loss": 0.1436, + "step": 12671 + }, + { + "epoch": 5.153314355429036, 
+ "grad_norm": 2.565207233956301, + "learning_rate": 1.374771506439263e-05, + "loss": 0.0431, + "step": 12672 + }, + { + "epoch": 5.153721024806832, + "grad_norm": 5.255568008566854, + "learning_rate": 1.3746775700942105e-05, + "loss": 0.1703, + "step": 12673 + }, + { + "epoch": 5.154127694184628, + "grad_norm": 14.889372269571295, + "learning_rate": 1.3745836299029046e-05, + "loss": 0.6215, + "step": 12674 + }, + { + "epoch": 5.154534363562424, + "grad_norm": 1.0917170758290873, + "learning_rate": 1.3744896858663091e-05, + "loss": 0.0237, + "step": 12675 + }, + { + "epoch": 5.154941032940219, + "grad_norm": 2.3775254864366975, + "learning_rate": 1.3743957379853885e-05, + "loss": 0.043, + "step": 12676 + }, + { + "epoch": 5.155347702318015, + "grad_norm": 4.611007229288414, + "learning_rate": 1.374301786261107e-05, + "loss": 0.0864, + "step": 12677 + }, + { + "epoch": 5.155754371695811, + "grad_norm": 0.12173777664415271, + "learning_rate": 1.3742078306944296e-05, + "loss": 0.0022, + "step": 12678 + }, + { + "epoch": 5.156161041073607, + "grad_norm": 0.0989570445141447, + "learning_rate": 1.3741138712863203e-05, + "loss": 0.0017, + "step": 12679 + }, + { + "epoch": 5.156567710451403, + "grad_norm": 0.09480629664441398, + "learning_rate": 1.3740199080377436e-05, + "loss": 0.0013, + "step": 12680 + }, + { + "epoch": 5.156974379829199, + "grad_norm": 5.057731038962167, + "learning_rate": 1.3739259409496649e-05, + "loss": 0.0876, + "step": 12681 + }, + { + "epoch": 5.157381049206995, + "grad_norm": 17.54026497348728, + "learning_rate": 1.3738319700230478e-05, + "loss": 0.3791, + "step": 12682 + }, + { + "epoch": 5.157787718584791, + "grad_norm": 1.2811642749768597, + "learning_rate": 1.3737379952588573e-05, + "loss": 0.0237, + "step": 12683 + }, + { + "epoch": 5.1581943879625864, + "grad_norm": 4.174077196983454, + "learning_rate": 1.3736440166580583e-05, + "loss": 0.1295, + "step": 12684 + }, + { + "epoch": 5.158601057340382, + "grad_norm": 0.7940160885413551, + "learning_rate": 1.3735500342216157e-05, + "loss": 0.0129, + "step": 12685 + }, + { + "epoch": 5.159007726718178, + "grad_norm": 7.120554098099482, + "learning_rate": 1.3734560479504934e-05, + "loss": 0.2212, + "step": 12686 + }, + { + "epoch": 5.159414396095974, + "grad_norm": 4.565364328824337, + "learning_rate": 1.3733620578456575e-05, + "loss": 0.0844, + "step": 12687 + }, + { + "epoch": 5.1598210654737695, + "grad_norm": 5.119867283738416, + "learning_rate": 1.3732680639080718e-05, + "loss": 0.1417, + "step": 12688 + }, + { + "epoch": 5.160227734851565, + "grad_norm": 18.419367226311525, + "learning_rate": 1.3731740661387017e-05, + "loss": 0.3993, + "step": 12689 + }, + { + "epoch": 5.160634404229362, + "grad_norm": 1.614220832244199, + "learning_rate": 1.3730800645385117e-05, + "loss": 0.0278, + "step": 12690 + }, + { + "epoch": 5.161041073607158, + "grad_norm": 3.6435809184336403, + "learning_rate": 1.3729860591084672e-05, + "loss": 0.0596, + "step": 12691 + }, + { + "epoch": 5.1614477429849535, + "grad_norm": 2.225183584895034, + "learning_rate": 1.3728920498495333e-05, + "loss": 0.0323, + "step": 12692 + }, + { + "epoch": 5.161854412362749, + "grad_norm": 7.854808861935257, + "learning_rate": 1.3727980367626746e-05, + "loss": 0.2228, + "step": 12693 + }, + { + "epoch": 5.162261081740545, + "grad_norm": 7.073506080837647, + "learning_rate": 1.3727040198488566e-05, + "loss": 0.1354, + "step": 12694 + }, + { + "epoch": 5.162667751118341, + "grad_norm": 22.146978551591047, + "learning_rate": 1.372609999109044e-05, + "loss": 
0.6027, + "step": 12695 + }, + { + "epoch": 5.1630744204961365, + "grad_norm": 12.325416237438152, + "learning_rate": 1.3725159745442023e-05, + "loss": 0.7975, + "step": 12696 + }, + { + "epoch": 5.163481089873932, + "grad_norm": 17.54083906491972, + "learning_rate": 1.3724219461552968e-05, + "loss": 0.8309, + "step": 12697 + }, + { + "epoch": 5.163887759251728, + "grad_norm": 0.3141346965117412, + "learning_rate": 1.3723279139432926e-05, + "loss": 0.0042, + "step": 12698 + }, + { + "epoch": 5.164294428629524, + "grad_norm": 12.293576270832487, + "learning_rate": 1.3722338779091548e-05, + "loss": 0.5215, + "step": 12699 + }, + { + "epoch": 5.1647010980073205, + "grad_norm": 2.794623444560484, + "learning_rate": 1.3721398380538493e-05, + "loss": 0.0184, + "step": 12700 + }, + { + "epoch": 5.165107767385116, + "grad_norm": 0.19184120900824933, + "learning_rate": 1.3720457943783408e-05, + "loss": 0.0031, + "step": 12701 + }, + { + "epoch": 5.165514436762912, + "grad_norm": 3.9242203977969363, + "learning_rate": 1.3719517468835948e-05, + "loss": 0.1032, + "step": 12702 + }, + { + "epoch": 5.165921106140708, + "grad_norm": 0.5778007586420877, + "learning_rate": 1.371857695570577e-05, + "loss": 0.0086, + "step": 12703 + }, + { + "epoch": 5.1663277755185035, + "grad_norm": 0.8180939199317249, + "learning_rate": 1.371763640440253e-05, + "loss": 0.0103, + "step": 12704 + }, + { + "epoch": 5.166734444896299, + "grad_norm": 0.03814312403159297, + "learning_rate": 1.3716695814935879e-05, + "loss": 0.0006, + "step": 12705 + }, + { + "epoch": 5.167141114274095, + "grad_norm": 8.798309228628844, + "learning_rate": 1.3715755187315477e-05, + "loss": 0.1429, + "step": 12706 + }, + { + "epoch": 5.167547783651891, + "grad_norm": 4.4903458415097175, + "learning_rate": 1.3714814521550977e-05, + "loss": 0.1206, + "step": 12707 + }, + { + "epoch": 5.167954453029687, + "grad_norm": 16.046273290798965, + "learning_rate": 1.3713873817652034e-05, + "loss": 0.1629, + "step": 12708 + }, + { + "epoch": 5.168361122407482, + "grad_norm": 0.022316705013922807, + "learning_rate": 1.371293307562831e-05, + "loss": 0.0004, + "step": 12709 + }, + { + "epoch": 5.168767791785278, + "grad_norm": 14.849904741166737, + "learning_rate": 1.3711992295489458e-05, + "loss": 1.1112, + "step": 12710 + }, + { + "epoch": 5.169174461163075, + "grad_norm": 5.251831108419377, + "learning_rate": 1.3711051477245138e-05, + "loss": 0.1423, + "step": 12711 + }, + { + "epoch": 5.1695811305408705, + "grad_norm": 2.015134479198466, + "learning_rate": 1.3710110620905007e-05, + "loss": 0.0303, + "step": 12712 + }, + { + "epoch": 5.169987799918666, + "grad_norm": 18.72270182335996, + "learning_rate": 1.370916972647872e-05, + "loss": 0.3197, + "step": 12713 + }, + { + "epoch": 5.170394469296462, + "grad_norm": 5.919704715819719, + "learning_rate": 1.3708228793975942e-05, + "loss": 0.1669, + "step": 12714 + }, + { + "epoch": 5.170801138674258, + "grad_norm": 2.6875660752248822, + "learning_rate": 1.3707287823406328e-05, + "loss": 0.0493, + "step": 12715 + }, + { + "epoch": 5.171207808052054, + "grad_norm": 9.324835030573652, + "learning_rate": 1.3706346814779539e-05, + "loss": 0.3394, + "step": 12716 + }, + { + "epoch": 5.171614477429849, + "grad_norm": 8.028318265655502, + "learning_rate": 1.3705405768105236e-05, + "loss": 0.3963, + "step": 12717 + }, + { + "epoch": 5.172021146807645, + "grad_norm": 6.712919136171822, + "learning_rate": 1.3704464683393076e-05, + "loss": 0.2626, + "step": 12718 + }, + { + "epoch": 5.172427816185441, + "grad_norm": 
3.585705014870443, + "learning_rate": 1.370352356065272e-05, + "loss": 0.0441, + "step": 12719 + }, + { + "epoch": 5.172834485563237, + "grad_norm": 5.398681376556444, + "learning_rate": 1.3702582399893833e-05, + "loss": 0.1448, + "step": 12720 + }, + { + "epoch": 5.173241154941033, + "grad_norm": 6.203857487618142, + "learning_rate": 1.3701641201126078e-05, + "loss": 0.1566, + "step": 12721 + }, + { + "epoch": 5.173647824318829, + "grad_norm": 1.8158012843913651, + "learning_rate": 1.3700699964359106e-05, + "loss": 0.045, + "step": 12722 + }, + { + "epoch": 5.174054493696625, + "grad_norm": 11.13593901458846, + "learning_rate": 1.3699758689602591e-05, + "loss": 0.7024, + "step": 12723 + }, + { + "epoch": 5.174461163074421, + "grad_norm": 6.957907035549048, + "learning_rate": 1.369881737686619e-05, + "loss": 0.2529, + "step": 12724 + }, + { + "epoch": 5.174867832452216, + "grad_norm": 5.1529480240511525, + "learning_rate": 1.3697876026159565e-05, + "loss": 0.0815, + "step": 12725 + }, + { + "epoch": 5.175274501830012, + "grad_norm": 11.589342048192053, + "learning_rate": 1.3696934637492382e-05, + "loss": 0.271, + "step": 12726 + }, + { + "epoch": 5.175681171207808, + "grad_norm": 5.717225679291345, + "learning_rate": 1.3695993210874307e-05, + "loss": 0.1037, + "step": 12727 + }, + { + "epoch": 5.176087840585604, + "grad_norm": 1.8417893969879815, + "learning_rate": 1.3695051746314998e-05, + "loss": 0.0363, + "step": 12728 + }, + { + "epoch": 5.176494509963399, + "grad_norm": 6.802236239053351, + "learning_rate": 1.3694110243824127e-05, + "loss": 0.1542, + "step": 12729 + }, + { + "epoch": 5.176901179341195, + "grad_norm": 4.285208621053885, + "learning_rate": 1.3693168703411352e-05, + "loss": 0.1008, + "step": 12730 + }, + { + "epoch": 5.177307848718992, + "grad_norm": 6.837652272682248, + "learning_rate": 1.369222712508634e-05, + "loss": 0.146, + "step": 12731 + }, + { + "epoch": 5.177714518096788, + "grad_norm": 4.458185174722183, + "learning_rate": 1.3691285508858764e-05, + "loss": 0.1102, + "step": 12732 + }, + { + "epoch": 5.178121187474583, + "grad_norm": 9.63712005645567, + "learning_rate": 1.3690343854738282e-05, + "loss": 0.3673, + "step": 12733 + }, + { + "epoch": 5.178527856852379, + "grad_norm": 2.1064537436596096, + "learning_rate": 1.3689402162734562e-05, + "loss": 0.0521, + "step": 12734 + }, + { + "epoch": 5.178934526230175, + "grad_norm": 5.650873827950525, + "learning_rate": 1.3688460432857272e-05, + "loss": 0.2844, + "step": 12735 + }, + { + "epoch": 5.179341195607971, + "grad_norm": 3.005371226100392, + "learning_rate": 1.368751866511608e-05, + "loss": 0.1002, + "step": 12736 + }, + { + "epoch": 5.1797478649857664, + "grad_norm": 10.687471653008547, + "learning_rate": 1.3686576859520652e-05, + "loss": 0.4084, + "step": 12737 + }, + { + "epoch": 5.180154534363562, + "grad_norm": 5.6159954668976635, + "learning_rate": 1.3685635016080659e-05, + "loss": 0.1033, + "step": 12738 + }, + { + "epoch": 5.180561203741358, + "grad_norm": 2.359208013010554, + "learning_rate": 1.3684693134805767e-05, + "loss": 0.0451, + "step": 12739 + }, + { + "epoch": 5.180967873119154, + "grad_norm": 4.503972197946853, + "learning_rate": 1.3683751215705644e-05, + "loss": 0.0852, + "step": 12740 + }, + { + "epoch": 5.18137454249695, + "grad_norm": 0.37481920604218427, + "learning_rate": 1.3682809258789962e-05, + "loss": 0.0052, + "step": 12741 + }, + { + "epoch": 5.181781211874746, + "grad_norm": 4.125129912768587, + "learning_rate": 1.368186726406839e-05, + "loss": 0.0717, + "step": 12742 + }, + { 
+ "epoch": 5.182187881252542, + "grad_norm": 4.651383558463152, + "learning_rate": 1.3680925231550596e-05, + "loss": 0.0909, + "step": 12743 + }, + { + "epoch": 5.182594550630338, + "grad_norm": 7.6356702095171585, + "learning_rate": 1.3679983161246252e-05, + "loss": 0.171, + "step": 12744 + }, + { + "epoch": 5.1830012200081335, + "grad_norm": 0.6302838752364646, + "learning_rate": 1.367904105316503e-05, + "loss": 0.0095, + "step": 12745 + }, + { + "epoch": 5.183407889385929, + "grad_norm": 0.20966916379983327, + "learning_rate": 1.3678098907316599e-05, + "loss": 0.004, + "step": 12746 + }, + { + "epoch": 5.183814558763725, + "grad_norm": 7.68644186332104, + "learning_rate": 1.3677156723710632e-05, + "loss": 0.1621, + "step": 12747 + }, + { + "epoch": 5.184221228141521, + "grad_norm": 8.71700081403987, + "learning_rate": 1.3676214502356799e-05, + "loss": 0.1357, + "step": 12748 + }, + { + "epoch": 5.1846278975193165, + "grad_norm": 6.812120939833423, + "learning_rate": 1.3675272243264777e-05, + "loss": 0.179, + "step": 12749 + }, + { + "epoch": 5.185034566897112, + "grad_norm": 0.4304416188388021, + "learning_rate": 1.3674329946444234e-05, + "loss": 0.0071, + "step": 12750 + }, + { + "epoch": 5.185441236274908, + "grad_norm": 10.869445251555653, + "learning_rate": 1.3673387611904844e-05, + "loss": 0.4441, + "step": 12751 + }, + { + "epoch": 5.185847905652705, + "grad_norm": 0.16527601599232414, + "learning_rate": 1.3672445239656285e-05, + "loss": 0.004, + "step": 12752 + }, + { + "epoch": 5.1862545750305005, + "grad_norm": 5.111113441825182, + "learning_rate": 1.3671502829708223e-05, + "loss": 0.0914, + "step": 12753 + }, + { + "epoch": 5.186661244408296, + "grad_norm": 3.933897683872277, + "learning_rate": 1.367056038207034e-05, + "loss": 0.0917, + "step": 12754 + }, + { + "epoch": 5.187067913786092, + "grad_norm": 5.32275242094535, + "learning_rate": 1.3669617896752306e-05, + "loss": 0.1028, + "step": 12755 + }, + { + "epoch": 5.187474583163888, + "grad_norm": 0.355523856445127, + "learning_rate": 1.3668675373763796e-05, + "loss": 0.0058, + "step": 12756 + }, + { + "epoch": 5.1878812525416835, + "grad_norm": 11.068711960802299, + "learning_rate": 1.3667732813114487e-05, + "loss": 0.4794, + "step": 12757 + }, + { + "epoch": 5.188287921919479, + "grad_norm": 4.905711731472597, + "learning_rate": 1.3666790214814056e-05, + "loss": 0.1637, + "step": 12758 + }, + { + "epoch": 5.188694591297275, + "grad_norm": 9.45503765747196, + "learning_rate": 1.3665847578872178e-05, + "loss": 0.3626, + "step": 12759 + }, + { + "epoch": 5.189101260675071, + "grad_norm": 23.89519796771179, + "learning_rate": 1.3664904905298529e-05, + "loss": 1.4763, + "step": 12760 + }, + { + "epoch": 5.189507930052867, + "grad_norm": 5.152559868068268, + "learning_rate": 1.3663962194102786e-05, + "loss": 0.0976, + "step": 12761 + }, + { + "epoch": 5.189914599430663, + "grad_norm": 2.69454563420761, + "learning_rate": 1.3663019445294629e-05, + "loss": 0.0534, + "step": 12762 + }, + { + "epoch": 5.190321268808459, + "grad_norm": 5.7162461306329, + "learning_rate": 1.3662076658883731e-05, + "loss": 0.1155, + "step": 12763 + }, + { + "epoch": 5.190727938186255, + "grad_norm": 4.956992145441204, + "learning_rate": 1.3661133834879777e-05, + "loss": 0.2034, + "step": 12764 + }, + { + "epoch": 5.1911346075640505, + "grad_norm": 0.8095727393432434, + "learning_rate": 1.3660190973292438e-05, + "loss": 0.0154, + "step": 12765 + }, + { + "epoch": 5.191541276941846, + "grad_norm": 7.459087765414864, + "learning_rate": 
1.36592480741314e-05, + "loss": 0.6486, + "step": 12766 + }, + { + "epoch": 5.191947946319642, + "grad_norm": 7.281078760748948, + "learning_rate": 1.3658305137406333e-05, + "loss": 0.239, + "step": 12767 + }, + { + "epoch": 5.192354615697438, + "grad_norm": 7.8910552507127605, + "learning_rate": 1.3657362163126928e-05, + "loss": 0.1907, + "step": 12768 + }, + { + "epoch": 5.192761285075234, + "grad_norm": 1.0010462283687003, + "learning_rate": 1.365641915130286e-05, + "loss": 0.0156, + "step": 12769 + }, + { + "epoch": 5.193167954453029, + "grad_norm": 0.938341861345566, + "learning_rate": 1.3655476101943803e-05, + "loss": 0.0101, + "step": 12770 + }, + { + "epoch": 5.193574623830825, + "grad_norm": 4.272381949321287, + "learning_rate": 1.365453301505945e-05, + "loss": 0.1659, + "step": 12771 + }, + { + "epoch": 5.193981293208622, + "grad_norm": 4.8229565490708115, + "learning_rate": 1.3653589890659475e-05, + "loss": 0.1374, + "step": 12772 + }, + { + "epoch": 5.1943879625864176, + "grad_norm": 10.26157483410921, + "learning_rate": 1.3652646728753557e-05, + "loss": 0.2877, + "step": 12773 + }, + { + "epoch": 5.194794631964213, + "grad_norm": 2.3410962639059267, + "learning_rate": 1.3651703529351387e-05, + "loss": 0.0356, + "step": 12774 + }, + { + "epoch": 5.195201301342009, + "grad_norm": 4.414948789373572, + "learning_rate": 1.3650760292462644e-05, + "loss": 0.1497, + "step": 12775 + }, + { + "epoch": 5.195607970719805, + "grad_norm": 6.081871533486465, + "learning_rate": 1.3649817018097005e-05, + "loss": 0.2808, + "step": 12776 + }, + { + "epoch": 5.196014640097601, + "grad_norm": 0.013776477275477046, + "learning_rate": 1.3648873706264159e-05, + "loss": 0.0003, + "step": 12777 + }, + { + "epoch": 5.196421309475396, + "grad_norm": 5.57169990997223, + "learning_rate": 1.3647930356973786e-05, + "loss": 0.119, + "step": 12778 + }, + { + "epoch": 5.196827978853192, + "grad_norm": 0.01113967512482282, + "learning_rate": 1.3646986970235573e-05, + "loss": 0.0002, + "step": 12779 + }, + { + "epoch": 5.197234648230988, + "grad_norm": 8.556569140596503, + "learning_rate": 1.3646043546059202e-05, + "loss": 0.4738, + "step": 12780 + }, + { + "epoch": 5.197641317608784, + "grad_norm": 1.7833188188566678, + "learning_rate": 1.3645100084454363e-05, + "loss": 0.0262, + "step": 12781 + }, + { + "epoch": 5.19804798698658, + "grad_norm": 0.179700147005969, + "learning_rate": 1.3644156585430735e-05, + "loss": 0.0027, + "step": 12782 + }, + { + "epoch": 5.198454656364376, + "grad_norm": 1.487717365213126, + "learning_rate": 1.3643213048998006e-05, + "loss": 0.0188, + "step": 12783 + }, + { + "epoch": 5.198861325742172, + "grad_norm": 0.5453423631993317, + "learning_rate": 1.364226947516586e-05, + "loss": 0.0125, + "step": 12784 + }, + { + "epoch": 5.199267995119968, + "grad_norm": 14.572419909476114, + "learning_rate": 1.3641325863943987e-05, + "loss": 0.5552, + "step": 12785 + }, + { + "epoch": 5.199674664497763, + "grad_norm": 3.933444234877884, + "learning_rate": 1.3640382215342067e-05, + "loss": 0.0655, + "step": 12786 + }, + { + "epoch": 5.200081333875559, + "grad_norm": 5.720769081707408, + "learning_rate": 1.3639438529369796e-05, + "loss": 0.0807, + "step": 12787 + }, + { + "epoch": 5.200488003253355, + "grad_norm": 3.853254079643464, + "learning_rate": 1.3638494806036854e-05, + "loss": 0.0779, + "step": 12788 + }, + { + "epoch": 5.200894672631151, + "grad_norm": 4.256430641489796, + "learning_rate": 1.3637551045352933e-05, + "loss": 0.0824, + "step": 12789 + }, + { + "epoch": 5.2013013420089464, + 
"grad_norm": 1.7918550831917495, + "learning_rate": 1.3636607247327718e-05, + "loss": 0.0105, + "step": 12790 + }, + { + "epoch": 5.201708011386742, + "grad_norm": 10.571651324739683, + "learning_rate": 1.36356634119709e-05, + "loss": 0.3503, + "step": 12791 + }, + { + "epoch": 5.202114680764538, + "grad_norm": 3.5511927076126133, + "learning_rate": 1.3634719539292171e-05, + "loss": 0.0823, + "step": 12792 + }, + { + "epoch": 5.202521350142335, + "grad_norm": 6.297521066119788, + "learning_rate": 1.3633775629301212e-05, + "loss": 0.1362, + "step": 12793 + }, + { + "epoch": 5.20292801952013, + "grad_norm": 0.7603348000728781, + "learning_rate": 1.363283168200772e-05, + "loss": 0.0168, + "step": 12794 + }, + { + "epoch": 5.203334688897926, + "grad_norm": 6.529203576678865, + "learning_rate": 1.3631887697421382e-05, + "loss": 0.1895, + "step": 12795 + }, + { + "epoch": 5.203741358275722, + "grad_norm": 2.9367070753534494, + "learning_rate": 1.3630943675551887e-05, + "loss": 0.0536, + "step": 12796 + }, + { + "epoch": 5.204148027653518, + "grad_norm": 1.5758315025218899, + "learning_rate": 1.3629999616408929e-05, + "loss": 0.0305, + "step": 12797 + }, + { + "epoch": 5.2045546970313135, + "grad_norm": 0.16659546886570895, + "learning_rate": 1.36290555200022e-05, + "loss": 0.0032, + "step": 12798 + }, + { + "epoch": 5.204961366409109, + "grad_norm": 4.172657577774857, + "learning_rate": 1.3628111386341384e-05, + "loss": 0.0789, + "step": 12799 + }, + { + "epoch": 5.205368035786905, + "grad_norm": 2.0912815846694826, + "learning_rate": 1.3627167215436186e-05, + "loss": 0.0451, + "step": 12800 + }, + { + "epoch": 5.205774705164701, + "grad_norm": 8.586256377517952, + "learning_rate": 1.3626223007296287e-05, + "loss": 0.1967, + "step": 12801 + }, + { + "epoch": 5.2061813745424965, + "grad_norm": 10.763514324980953, + "learning_rate": 1.3625278761931382e-05, + "loss": 0.619, + "step": 12802 + }, + { + "epoch": 5.206588043920293, + "grad_norm": 7.669574289141879, + "learning_rate": 1.3624334479351168e-05, + "loss": 0.4826, + "step": 12803 + }, + { + "epoch": 5.206994713298089, + "grad_norm": 7.916118548802056, + "learning_rate": 1.3623390159565339e-05, + "loss": 0.1495, + "step": 12804 + }, + { + "epoch": 5.207401382675885, + "grad_norm": 13.225345209969149, + "learning_rate": 1.3622445802583582e-05, + "loss": 0.1971, + "step": 12805 + }, + { + "epoch": 5.2078080520536805, + "grad_norm": 0.1757475889758268, + "learning_rate": 1.3621501408415598e-05, + "loss": 0.0027, + "step": 12806 + }, + { + "epoch": 5.208214721431476, + "grad_norm": 1.4835961341101152, + "learning_rate": 1.362055697707108e-05, + "loss": 0.0233, + "step": 12807 + }, + { + "epoch": 5.208621390809272, + "grad_norm": 9.72037864612577, + "learning_rate": 1.361961250855972e-05, + "loss": 0.327, + "step": 12808 + }, + { + "epoch": 5.209028060187068, + "grad_norm": 0.9853081832292735, + "learning_rate": 1.3618668002891218e-05, + "loss": 0.0182, + "step": 12809 + }, + { + "epoch": 5.2094347295648635, + "grad_norm": 4.120489138956751, + "learning_rate": 1.3617723460075267e-05, + "loss": 0.0566, + "step": 12810 + }, + { + "epoch": 5.209841398942659, + "grad_norm": 3.7816580537938758, + "learning_rate": 1.3616778880121562e-05, + "loss": 0.1922, + "step": 12811 + }, + { + "epoch": 5.210248068320455, + "grad_norm": 8.045885314368153, + "learning_rate": 1.3615834263039804e-05, + "loss": 0.3628, + "step": 12812 + }, + { + "epoch": 5.210654737698252, + "grad_norm": 8.36701961468226, + "learning_rate": 1.3614889608839687e-05, + "loss": 0.217, + 
"step": 12813 + }, + { + "epoch": 5.2110614070760475, + "grad_norm": 6.5891663127375875, + "learning_rate": 1.361394491753091e-05, + "loss": 0.1055, + "step": 12814 + }, + { + "epoch": 5.211468076453843, + "grad_norm": 6.908748446374726, + "learning_rate": 1.3613000189123167e-05, + "loss": 0.2974, + "step": 12815 + }, + { + "epoch": 5.211874745831639, + "grad_norm": 6.578178881811888, + "learning_rate": 1.3612055423626161e-05, + "loss": 0.2288, + "step": 12816 + }, + { + "epoch": 5.212281415209435, + "grad_norm": 3.585860618688039, + "learning_rate": 1.3611110621049588e-05, + "loss": 0.1149, + "step": 12817 + }, + { + "epoch": 5.2126880845872305, + "grad_norm": 2.212555690018159, + "learning_rate": 1.3610165781403144e-05, + "loss": 0.0448, + "step": 12818 + }, + { + "epoch": 5.213094753965026, + "grad_norm": 19.273214890949898, + "learning_rate": 1.3609220904696534e-05, + "loss": 1.0833, + "step": 12819 + }, + { + "epoch": 5.213501423342822, + "grad_norm": 0.6559915198535978, + "learning_rate": 1.3608275990939457e-05, + "loss": 0.0129, + "step": 12820 + }, + { + "epoch": 5.213908092720618, + "grad_norm": 5.444141831979168, + "learning_rate": 1.3607331040141607e-05, + "loss": 0.0919, + "step": 12821 + }, + { + "epoch": 5.214314762098414, + "grad_norm": 3.772701074147209, + "learning_rate": 1.360638605231269e-05, + "loss": 0.0603, + "step": 12822 + }, + { + "epoch": 5.21472143147621, + "grad_norm": 14.85239451113123, + "learning_rate": 1.3605441027462405e-05, + "loss": 0.2817, + "step": 12823 + }, + { + "epoch": 5.215128100854006, + "grad_norm": 7.7566078424565434, + "learning_rate": 1.3604495965600453e-05, + "loss": 0.2579, + "step": 12824 + }, + { + "epoch": 5.215534770231802, + "grad_norm": 10.103148629598657, + "learning_rate": 1.3603550866736534e-05, + "loss": 0.4087, + "step": 12825 + }, + { + "epoch": 5.2159414396095976, + "grad_norm": 1.2123186547160556, + "learning_rate": 1.3602605730880355e-05, + "loss": 0.0128, + "step": 12826 + }, + { + "epoch": 5.216348108987393, + "grad_norm": 2.742973921634315, + "learning_rate": 1.3601660558041614e-05, + "loss": 0.0415, + "step": 12827 + }, + { + "epoch": 5.216754778365189, + "grad_norm": 0.12043518989702767, + "learning_rate": 1.3600715348230013e-05, + "loss": 0.0018, + "step": 12828 + }, + { + "epoch": 5.217161447742985, + "grad_norm": 6.361196585394131, + "learning_rate": 1.3599770101455258e-05, + "loss": 0.3178, + "step": 12829 + }, + { + "epoch": 5.217568117120781, + "grad_norm": 4.002850329796022, + "learning_rate": 1.3598824817727052e-05, + "loss": 0.0598, + "step": 12830 + }, + { + "epoch": 5.217974786498576, + "grad_norm": 2.19182348024651, + "learning_rate": 1.3597879497055098e-05, + "loss": 0.0233, + "step": 12831 + }, + { + "epoch": 5.218381455876372, + "grad_norm": 4.736810122524003, + "learning_rate": 1.3596934139449099e-05, + "loss": 0.1037, + "step": 12832 + }, + { + "epoch": 5.218788125254168, + "grad_norm": 7.084393170806288, + "learning_rate": 1.3595988744918762e-05, + "loss": 0.4691, + "step": 12833 + }, + { + "epoch": 5.219194794631965, + "grad_norm": 0.10762992974407071, + "learning_rate": 1.3595043313473787e-05, + "loss": 0.0014, + "step": 12834 + }, + { + "epoch": 5.21960146400976, + "grad_norm": 10.359265266998083, + "learning_rate": 1.3594097845123888e-05, + "loss": 0.4032, + "step": 12835 + }, + { + "epoch": 5.220008133387556, + "grad_norm": 0.2296516899164167, + "learning_rate": 1.3593152339878764e-05, + "loss": 0.0046, + "step": 12836 + }, + { + "epoch": 5.220414802765352, + "grad_norm": 7.4095681769801365, + 
"learning_rate": 1.3592206797748122e-05, + "loss": 0.3173, + "step": 12837 + }, + { + "epoch": 5.220821472143148, + "grad_norm": 9.415311236261283, + "learning_rate": 1.359126121874167e-05, + "loss": 0.5962, + "step": 12838 + }, + { + "epoch": 5.221228141520943, + "grad_norm": 2.986822827622878, + "learning_rate": 1.3590315602869116e-05, + "loss": 0.0535, + "step": 12839 + }, + { + "epoch": 5.221634810898739, + "grad_norm": 11.237351730472312, + "learning_rate": 1.3589369950140161e-05, + "loss": 0.6162, + "step": 12840 + }, + { + "epoch": 5.222041480276535, + "grad_norm": 5.605941506173171, + "learning_rate": 1.3588424260564518e-05, + "loss": 0.1417, + "step": 12841 + }, + { + "epoch": 5.222448149654331, + "grad_norm": 7.061395641083688, + "learning_rate": 1.3587478534151897e-05, + "loss": 0.1572, + "step": 12842 + }, + { + "epoch": 5.222854819032127, + "grad_norm": 6.313033865670322, + "learning_rate": 1.3586532770912002e-05, + "loss": 0.1838, + "step": 12843 + }, + { + "epoch": 5.223261488409923, + "grad_norm": 2.687713981707485, + "learning_rate": 1.358558697085454e-05, + "loss": 0.046, + "step": 12844 + }, + { + "epoch": 5.223668157787719, + "grad_norm": 7.052978434548322, + "learning_rate": 1.3584641133989227e-05, + "loss": 0.2158, + "step": 12845 + }, + { + "epoch": 5.224074827165515, + "grad_norm": 4.816227236957275, + "learning_rate": 1.3583695260325769e-05, + "loss": 0.1009, + "step": 12846 + }, + { + "epoch": 5.22448149654331, + "grad_norm": 8.049541138765035, + "learning_rate": 1.3582749349873871e-05, + "loss": 0.2506, + "step": 12847 + }, + { + "epoch": 5.224888165921106, + "grad_norm": 1.0193737184682001, + "learning_rate": 1.3581803402643252e-05, + "loss": 0.0109, + "step": 12848 + }, + { + "epoch": 5.225294835298902, + "grad_norm": 8.936381082002276, + "learning_rate": 1.3580857418643618e-05, + "loss": 0.3573, + "step": 12849 + }, + { + "epoch": 5.225701504676698, + "grad_norm": 14.398298884866582, + "learning_rate": 1.357991139788468e-05, + "loss": 0.5262, + "step": 12850 + }, + { + "epoch": 5.2261081740544935, + "grad_norm": 3.298229680155797, + "learning_rate": 1.3578965340376149e-05, + "loss": 0.0674, + "step": 12851 + }, + { + "epoch": 5.226514843432289, + "grad_norm": 4.362325572373128, + "learning_rate": 1.3578019246127738e-05, + "loss": 0.1447, + "step": 12852 + }, + { + "epoch": 5.226921512810085, + "grad_norm": 23.204268436964476, + "learning_rate": 1.3577073115149157e-05, + "loss": 0.5974, + "step": 12853 + }, + { + "epoch": 5.227328182187882, + "grad_norm": 0.23025765507874651, + "learning_rate": 1.3576126947450123e-05, + "loss": 0.0039, + "step": 12854 + }, + { + "epoch": 5.227734851565677, + "grad_norm": 0.05737705064260382, + "learning_rate": 1.3575180743040346e-05, + "loss": 0.001, + "step": 12855 + }, + { + "epoch": 5.228141520943473, + "grad_norm": 5.922013186330878, + "learning_rate": 1.3574234501929535e-05, + "loss": 0.1081, + "step": 12856 + }, + { + "epoch": 5.228548190321269, + "grad_norm": 5.856107498119022, + "learning_rate": 1.3573288224127413e-05, + "loss": 0.1161, + "step": 12857 + }, + { + "epoch": 5.228954859699065, + "grad_norm": 14.007730134848426, + "learning_rate": 1.3572341909643687e-05, + "loss": 0.4757, + "step": 12858 + }, + { + "epoch": 5.2293615290768605, + "grad_norm": 9.185617806585382, + "learning_rate": 1.3571395558488073e-05, + "loss": 0.2614, + "step": 12859 + }, + { + "epoch": 5.229768198454656, + "grad_norm": 1.5293028001317284, + "learning_rate": 1.3570449170670288e-05, + "loss": 0.0282, + "step": 12860 + }, + { + "epoch": 
5.230174867832452, + "grad_norm": 15.538115075996146, + "learning_rate": 1.3569502746200043e-05, + "loss": 0.3462, + "step": 12861 + }, + { + "epoch": 5.230581537210248, + "grad_norm": 0.2061635532581028, + "learning_rate": 1.3568556285087059e-05, + "loss": 0.0032, + "step": 12862 + }, + { + "epoch": 5.2309882065880435, + "grad_norm": 9.324731978661577, + "learning_rate": 1.3567609787341043e-05, + "loss": 0.1752, + "step": 12863 + }, + { + "epoch": 5.23139487596584, + "grad_norm": 8.75161995229114, + "learning_rate": 1.3566663252971721e-05, + "loss": 0.5114, + "step": 12864 + }, + { + "epoch": 5.231801545343636, + "grad_norm": 8.89596399414211, + "learning_rate": 1.3565716681988804e-05, + "loss": 0.5038, + "step": 12865 + }, + { + "epoch": 5.232208214721432, + "grad_norm": 8.76140872765685, + "learning_rate": 1.3564770074402011e-05, + "loss": 0.6405, + "step": 12866 + }, + { + "epoch": 5.2326148840992275, + "grad_norm": 11.916750146673756, + "learning_rate": 1.3563823430221061e-05, + "loss": 0.3879, + "step": 12867 + }, + { + "epoch": 5.233021553477023, + "grad_norm": 10.741371934101931, + "learning_rate": 1.356287674945567e-05, + "loss": 0.7378, + "step": 12868 + }, + { + "epoch": 5.233428222854819, + "grad_norm": 13.592253176724192, + "learning_rate": 1.356193003211555e-05, + "loss": 0.668, + "step": 12869 + }, + { + "epoch": 5.233834892232615, + "grad_norm": 4.594840277583168, + "learning_rate": 1.3560983278210433e-05, + "loss": 0.0811, + "step": 12870 + }, + { + "epoch": 5.2342415616104105, + "grad_norm": 7.887667439940357, + "learning_rate": 1.3560036487750029e-05, + "loss": 0.2586, + "step": 12871 + }, + { + "epoch": 5.234648230988206, + "grad_norm": 0.6385408378572027, + "learning_rate": 1.3559089660744056e-05, + "loss": 0.0095, + "step": 12872 + }, + { + "epoch": 5.235054900366002, + "grad_norm": 0.43959583673899577, + "learning_rate": 1.3558142797202236e-05, + "loss": 0.0093, + "step": 12873 + }, + { + "epoch": 5.235461569743798, + "grad_norm": 0.5207559803036795, + "learning_rate": 1.3557195897134291e-05, + "loss": 0.0079, + "step": 12874 + }, + { + "epoch": 5.2358682391215945, + "grad_norm": 6.557910983760848, + "learning_rate": 1.3556248960549943e-05, + "loss": 0.1236, + "step": 12875 + }, + { + "epoch": 5.23627490849939, + "grad_norm": 15.29787900004771, + "learning_rate": 1.3555301987458904e-05, + "loss": 0.4489, + "step": 12876 + }, + { + "epoch": 5.236681577877186, + "grad_norm": 10.33656105083819, + "learning_rate": 1.3554354977870902e-05, + "loss": 0.1097, + "step": 12877 + }, + { + "epoch": 5.237088247254982, + "grad_norm": 2.5997751638617377, + "learning_rate": 1.3553407931795662e-05, + "loss": 0.0595, + "step": 12878 + }, + { + "epoch": 5.2374949166327776, + "grad_norm": 0.7580902345557382, + "learning_rate": 1.3552460849242895e-05, + "loss": 0.0126, + "step": 12879 + }, + { + "epoch": 5.237901586010573, + "grad_norm": 7.37409486408926, + "learning_rate": 1.3551513730222333e-05, + "loss": 0.2756, + "step": 12880 + }, + { + "epoch": 5.238308255388369, + "grad_norm": 13.069042019224506, + "learning_rate": 1.3550566574743695e-05, + "loss": 0.6603, + "step": 12881 + }, + { + "epoch": 5.238714924766165, + "grad_norm": 4.8876823032598535, + "learning_rate": 1.3549619382816703e-05, + "loss": 0.1124, + "step": 12882 + }, + { + "epoch": 5.239121594143961, + "grad_norm": 0.7527236259911172, + "learning_rate": 1.354867215445108e-05, + "loss": 0.0113, + "step": 12883 + }, + { + "epoch": 5.239528263521757, + "grad_norm": 1.28089571376455, + "learning_rate": 1.3547724889656555e-05, 
+ "loss": 0.0232, + "step": 12884 + }, + { + "epoch": 5.239934932899553, + "grad_norm": 0.5315786372607, + "learning_rate": 1.3546777588442848e-05, + "loss": 0.0105, + "step": 12885 + }, + { + "epoch": 5.240341602277349, + "grad_norm": 7.368104954015414, + "learning_rate": 1.354583025081968e-05, + "loss": 0.3989, + "step": 12886 + }, + { + "epoch": 5.240748271655145, + "grad_norm": 2.5852357241085526, + "learning_rate": 1.3544882876796783e-05, + "loss": 0.043, + "step": 12887 + }, + { + "epoch": 5.24115494103294, + "grad_norm": 4.628261077922299, + "learning_rate": 1.354393546638388e-05, + "loss": 0.1362, + "step": 12888 + }, + { + "epoch": 5.241561610410736, + "grad_norm": 14.452952568211257, + "learning_rate": 1.3542988019590694e-05, + "loss": 0.6368, + "step": 12889 + }, + { + "epoch": 5.241968279788532, + "grad_norm": 9.923070385497406, + "learning_rate": 1.3542040536426954e-05, + "loss": 0.3732, + "step": 12890 + }, + { + "epoch": 5.242374949166328, + "grad_norm": 13.409039672571739, + "learning_rate": 1.3541093016902384e-05, + "loss": 0.5144, + "step": 12891 + }, + { + "epoch": 5.242781618544123, + "grad_norm": 6.662471563516631, + "learning_rate": 1.3540145461026712e-05, + "loss": 0.5393, + "step": 12892 + }, + { + "epoch": 5.243188287921919, + "grad_norm": 11.51999920436311, + "learning_rate": 1.3539197868809665e-05, + "loss": 0.3689, + "step": 12893 + }, + { + "epoch": 5.243594957299715, + "grad_norm": 7.335104963159324, + "learning_rate": 1.3538250240260972e-05, + "loss": 0.1596, + "step": 12894 + }, + { + "epoch": 5.244001626677512, + "grad_norm": 7.797853051115597, + "learning_rate": 1.353730257539036e-05, + "loss": 0.1489, + "step": 12895 + }, + { + "epoch": 5.244408296055307, + "grad_norm": 7.531353807327498, + "learning_rate": 1.3536354874207555e-05, + "loss": 0.1101, + "step": 12896 + }, + { + "epoch": 5.244814965433103, + "grad_norm": 6.403210811549929, + "learning_rate": 1.353540713672229e-05, + "loss": 0.3154, + "step": 12897 + }, + { + "epoch": 5.245221634810899, + "grad_norm": 1.5569185403890389, + "learning_rate": 1.353445936294429e-05, + "loss": 0.0126, + "step": 12898 + }, + { + "epoch": 5.245628304188695, + "grad_norm": 0.18115232777944884, + "learning_rate": 1.3533511552883282e-05, + "loss": 0.0037, + "step": 12899 + }, + { + "epoch": 5.24603497356649, + "grad_norm": 6.96242407791295, + "learning_rate": 1.3532563706549006e-05, + "loss": 0.2421, + "step": 12900 + }, + { + "epoch": 5.246441642944286, + "grad_norm": 3.366186281101958, + "learning_rate": 1.3531615823951181e-05, + "loss": 0.0761, + "step": 12901 + }, + { + "epoch": 5.246848312322082, + "grad_norm": 4.926044185516738, + "learning_rate": 1.3530667905099547e-05, + "loss": 0.1379, + "step": 12902 + }, + { + "epoch": 5.247254981699878, + "grad_norm": 5.066831088879739, + "learning_rate": 1.3529719950003826e-05, + "loss": 0.0951, + "step": 12903 + }, + { + "epoch": 5.2476616510776735, + "grad_norm": 0.560472022916955, + "learning_rate": 1.3528771958673754e-05, + "loss": 0.0088, + "step": 12904 + }, + { + "epoch": 5.24806832045547, + "grad_norm": 7.132203756855809, + "learning_rate": 1.3527823931119059e-05, + "loss": 0.1603, + "step": 12905 + }, + { + "epoch": 5.248474989833266, + "grad_norm": 5.5912450787713865, + "learning_rate": 1.352687586734948e-05, + "loss": 0.2898, + "step": 12906 + }, + { + "epoch": 5.248881659211062, + "grad_norm": 7.223423108631777, + "learning_rate": 1.3525927767374744e-05, + "loss": 0.0662, + "step": 12907 + }, + { + "epoch": 5.249288328588857, + "grad_norm": 3.9159856550456364, + 
"learning_rate": 1.3524979631204584e-05, + "loss": 0.0536, + "step": 12908 + }, + { + "epoch": 5.249694997966653, + "grad_norm": 2.02873079375832, + "learning_rate": 1.3524031458848735e-05, + "loss": 0.0325, + "step": 12909 + }, + { + "epoch": 5.250101667344449, + "grad_norm": 28.969673556098172, + "learning_rate": 1.3523083250316931e-05, + "loss": 0.127, + "step": 12910 + }, + { + "epoch": 5.250508336722245, + "grad_norm": 9.853983304481526, + "learning_rate": 1.35221350056189e-05, + "loss": 0.4373, + "step": 12911 + }, + { + "epoch": 5.2509150061000405, + "grad_norm": 2.4750697953136185, + "learning_rate": 1.3521186724764384e-05, + "loss": 0.0296, + "step": 12912 + }, + { + "epoch": 5.251321675477836, + "grad_norm": 4.926353042953185, + "learning_rate": 1.3520238407763113e-05, + "loss": 0.088, + "step": 12913 + }, + { + "epoch": 5.251728344855632, + "grad_norm": 0.1019428300269809, + "learning_rate": 1.3519290054624821e-05, + "loss": 0.001, + "step": 12914 + }, + { + "epoch": 5.252135014233428, + "grad_norm": 4.652267580946231, + "learning_rate": 1.3518341665359248e-05, + "loss": 0.1088, + "step": 12915 + }, + { + "epoch": 5.252541683611224, + "grad_norm": 3.0939546880529725, + "learning_rate": 1.3517393239976126e-05, + "loss": 0.1467, + "step": 12916 + }, + { + "epoch": 5.25294835298902, + "grad_norm": 8.48516000437918, + "learning_rate": 1.3516444778485193e-05, + "loss": 0.2206, + "step": 12917 + }, + { + "epoch": 5.253355022366816, + "grad_norm": 1.7804481619364692, + "learning_rate": 1.3515496280896185e-05, + "loss": 0.0243, + "step": 12918 + }, + { + "epoch": 5.253761691744612, + "grad_norm": 18.22211813847692, + "learning_rate": 1.3514547747218838e-05, + "loss": 0.8106, + "step": 12919 + }, + { + "epoch": 5.2541683611224075, + "grad_norm": 11.398680853688933, + "learning_rate": 1.3513599177462889e-05, + "loss": 0.5702, + "step": 12920 + }, + { + "epoch": 5.254575030500203, + "grad_norm": 7.782407654726682, + "learning_rate": 1.3512650571638076e-05, + "loss": 0.5251, + "step": 12921 + }, + { + "epoch": 5.254981699877999, + "grad_norm": 14.551137210153051, + "learning_rate": 1.3511701929754137e-05, + "loss": 0.6408, + "step": 12922 + }, + { + "epoch": 5.255388369255795, + "grad_norm": 0.45492732521721513, + "learning_rate": 1.3510753251820812e-05, + "loss": 0.0056, + "step": 12923 + }, + { + "epoch": 5.2557950386335905, + "grad_norm": 4.713840018619474, + "learning_rate": 1.3509804537847836e-05, + "loss": 0.087, + "step": 12924 + }, + { + "epoch": 5.256201708011387, + "grad_norm": 0.039915792616031065, + "learning_rate": 1.350885578784495e-05, + "loss": 0.0008, + "step": 12925 + }, + { + "epoch": 5.256608377389183, + "grad_norm": 1.1012543011395621, + "learning_rate": 1.3507907001821894e-05, + "loss": 0.019, + "step": 12926 + }, + { + "epoch": 5.257015046766979, + "grad_norm": 8.852811094629207, + "learning_rate": 1.3506958179788408e-05, + "loss": 0.2647, + "step": 12927 + }, + { + "epoch": 5.2574217161447745, + "grad_norm": 2.1655036273418835, + "learning_rate": 1.3506009321754227e-05, + "loss": 0.0329, + "step": 12928 + }, + { + "epoch": 5.25782838552257, + "grad_norm": 6.488329823880623, + "learning_rate": 1.35050604277291e-05, + "loss": 0.2579, + "step": 12929 + }, + { + "epoch": 5.258235054900366, + "grad_norm": 2.792881498743327, + "learning_rate": 1.3504111497722764e-05, + "loss": 0.0665, + "step": 12930 + }, + { + "epoch": 5.258641724278162, + "grad_norm": 8.250185443431297, + "learning_rate": 1.3503162531744956e-05, + "loss": 0.1914, + "step": 12931 + }, + { + "epoch": 
5.2590483936559576, + "grad_norm": 8.459736860368778, + "learning_rate": 1.3502213529805425e-05, + "loss": 0.1043, + "step": 12932 + }, + { + "epoch": 5.259455063033753, + "grad_norm": 8.433793838569237, + "learning_rate": 1.3501264491913909e-05, + "loss": 0.3197, + "step": 12933 + }, + { + "epoch": 5.259861732411549, + "grad_norm": 3.4884023503858397, + "learning_rate": 1.350031541808015e-05, + "loss": 0.0561, + "step": 12934 + }, + { + "epoch": 5.260268401789345, + "grad_norm": 5.300321401522443, + "learning_rate": 1.3499366308313891e-05, + "loss": 0.0756, + "step": 12935 + }, + { + "epoch": 5.2606750711671415, + "grad_norm": 10.106550618274667, + "learning_rate": 1.3498417162624877e-05, + "loss": 0.4767, + "step": 12936 + }, + { + "epoch": 5.261081740544937, + "grad_norm": 5.736333992594314, + "learning_rate": 1.3497467981022848e-05, + "loss": 0.241, + "step": 12937 + }, + { + "epoch": 5.261488409922733, + "grad_norm": 0.25938565082361253, + "learning_rate": 1.3496518763517551e-05, + "loss": 0.0037, + "step": 12938 + }, + { + "epoch": 5.261895079300529, + "grad_norm": 6.747591164696166, + "learning_rate": 1.3495569510118732e-05, + "loss": 0.1695, + "step": 12939 + }, + { + "epoch": 5.262301748678325, + "grad_norm": 2.218228172877957, + "learning_rate": 1.3494620220836132e-05, + "loss": 0.0596, + "step": 12940 + }, + { + "epoch": 5.26270841805612, + "grad_norm": 3.0122349539715114, + "learning_rate": 1.3493670895679492e-05, + "loss": 0.0855, + "step": 12941 + }, + { + "epoch": 5.263115087433916, + "grad_norm": 1.642810730362881, + "learning_rate": 1.3492721534658566e-05, + "loss": 0.0259, + "step": 12942 + }, + { + "epoch": 5.263521756811712, + "grad_norm": 0.47073757935860894, + "learning_rate": 1.3491772137783094e-05, + "loss": 0.0068, + "step": 12943 + }, + { + "epoch": 5.263928426189508, + "grad_norm": 8.146303018525844, + "learning_rate": 1.3490822705062825e-05, + "loss": 0.2811, + "step": 12944 + }, + { + "epoch": 5.264335095567303, + "grad_norm": 5.717320885083593, + "learning_rate": 1.3489873236507503e-05, + "loss": 0.2133, + "step": 12945 + }, + { + "epoch": 5.2647417649451, + "grad_norm": 7.534284724877307, + "learning_rate": 1.3488923732126877e-05, + "loss": 0.4078, + "step": 12946 + }, + { + "epoch": 5.265148434322896, + "grad_norm": 10.635276959520672, + "learning_rate": 1.348797419193069e-05, + "loss": 0.6615, + "step": 12947 + }, + { + "epoch": 5.265555103700692, + "grad_norm": 1.015175975920767, + "learning_rate": 1.3487024615928697e-05, + "loss": 0.014, + "step": 12948 + }, + { + "epoch": 5.265961773078487, + "grad_norm": 12.049143646389298, + "learning_rate": 1.3486075004130639e-05, + "loss": 0.7913, + "step": 12949 + }, + { + "epoch": 5.266368442456283, + "grad_norm": 8.82362093640329, + "learning_rate": 1.3485125356546265e-05, + "loss": 0.1949, + "step": 12950 + }, + { + "epoch": 5.266775111834079, + "grad_norm": 3.7153239862357674, + "learning_rate": 1.3484175673185327e-05, + "loss": 0.148, + "step": 12951 + }, + { + "epoch": 5.267181781211875, + "grad_norm": 7.005145814798644, + "learning_rate": 1.3483225954057573e-05, + "loss": 0.1627, + "step": 12952 + }, + { + "epoch": 5.26758845058967, + "grad_norm": 6.854153650301286, + "learning_rate": 1.3482276199172751e-05, + "loss": 0.2354, + "step": 12953 + }, + { + "epoch": 5.267995119967466, + "grad_norm": 4.96215836293876, + "learning_rate": 1.348132640854061e-05, + "loss": 0.1463, + "step": 12954 + }, + { + "epoch": 5.268401789345262, + "grad_norm": 3.4947623536588868, + "learning_rate": 1.3480376582170901e-05, + 
"loss": 0.0998, + "step": 12955 + }, + { + "epoch": 5.268808458723058, + "grad_norm": 1.2585756871619165, + "learning_rate": 1.3479426720073374e-05, + "loss": 0.0312, + "step": 12956 + }, + { + "epoch": 5.269215128100854, + "grad_norm": 14.149365263567843, + "learning_rate": 1.3478476822257782e-05, + "loss": 0.3812, + "step": 12957 + }, + { + "epoch": 5.26962179747865, + "grad_norm": 6.379600478657202, + "learning_rate": 1.3477526888733876e-05, + "loss": 0.2214, + "step": 12958 + }, + { + "epoch": 5.270028466856446, + "grad_norm": 4.582687848474745, + "learning_rate": 1.3476576919511403e-05, + "loss": 0.27, + "step": 12959 + }, + { + "epoch": 5.270435136234242, + "grad_norm": 1.55347109008962, + "learning_rate": 1.3475626914600121e-05, + "loss": 0.034, + "step": 12960 + }, + { + "epoch": 5.270841805612037, + "grad_norm": 9.159128865943849, + "learning_rate": 1.347467687400978e-05, + "loss": 0.3024, + "step": 12961 + }, + { + "epoch": 5.271248474989833, + "grad_norm": 0.39873943189497474, + "learning_rate": 1.3473726797750133e-05, + "loss": 0.0071, + "step": 12962 + }, + { + "epoch": 5.271655144367629, + "grad_norm": 0.8734699246559513, + "learning_rate": 1.3472776685830926e-05, + "loss": 0.0136, + "step": 12963 + }, + { + "epoch": 5.272061813745425, + "grad_norm": 8.904282077216186, + "learning_rate": 1.3471826538261924e-05, + "loss": 0.3676, + "step": 12964 + }, + { + "epoch": 5.2724684831232205, + "grad_norm": 7.12241185453532, + "learning_rate": 1.3470876355052875e-05, + "loss": 0.16, + "step": 12965 + }, + { + "epoch": 5.272875152501017, + "grad_norm": 0.2432942237066163, + "learning_rate": 1.346992613621353e-05, + "loss": 0.004, + "step": 12966 + }, + { + "epoch": 5.273281821878813, + "grad_norm": 7.554389667748588, + "learning_rate": 1.346897588175365e-05, + "loss": 0.3019, + "step": 12967 + }, + { + "epoch": 5.273688491256609, + "grad_norm": 0.9486467361818156, + "learning_rate": 1.3468025591682987e-05, + "loss": 0.0218, + "step": 12968 + }, + { + "epoch": 5.274095160634404, + "grad_norm": 0.30560686671675574, + "learning_rate": 1.3467075266011294e-05, + "loss": 0.0041, + "step": 12969 + }, + { + "epoch": 5.2745018300122, + "grad_norm": 8.111045478495395, + "learning_rate": 1.3466124904748328e-05, + "loss": 0.4191, + "step": 12970 + }, + { + "epoch": 5.274908499389996, + "grad_norm": 0.20420180731025991, + "learning_rate": 1.3465174507903847e-05, + "loss": 0.0048, + "step": 12971 + }, + { + "epoch": 5.275315168767792, + "grad_norm": 10.046386781045907, + "learning_rate": 1.3464224075487605e-05, + "loss": 0.4277, + "step": 12972 + }, + { + "epoch": 5.2757218381455875, + "grad_norm": 3.920138705726212, + "learning_rate": 1.346327360750936e-05, + "loss": 0.1457, + "step": 12973 + }, + { + "epoch": 5.276128507523383, + "grad_norm": 4.66530485158999, + "learning_rate": 1.3462323103978867e-05, + "loss": 0.1038, + "step": 12974 + }, + { + "epoch": 5.276535176901179, + "grad_norm": 2.654731568403769, + "learning_rate": 1.3461372564905886e-05, + "loss": 0.0753, + "step": 12975 + }, + { + "epoch": 5.276941846278975, + "grad_norm": 8.636512645678115, + "learning_rate": 1.3460421990300173e-05, + "loss": 0.3164, + "step": 12976 + }, + { + "epoch": 5.277348515656771, + "grad_norm": 1.2468992254217734, + "learning_rate": 1.3459471380171486e-05, + "loss": 0.0187, + "step": 12977 + }, + { + "epoch": 5.277755185034567, + "grad_norm": 1.1303240076746472, + "learning_rate": 1.3458520734529585e-05, + "loss": 0.0154, + "step": 12978 + }, + { + "epoch": 5.278161854412363, + "grad_norm": 3.314052007199708, 
+ "learning_rate": 1.3457570053384225e-05, + "loss": 0.0942, + "step": 12979 + }, + { + "epoch": 5.278568523790159, + "grad_norm": 7.115071779655633, + "learning_rate": 1.3456619336745173e-05, + "loss": 0.5101, + "step": 12980 + }, + { + "epoch": 5.2789751931679545, + "grad_norm": 0.3332881082855345, + "learning_rate": 1.3455668584622184e-05, + "loss": 0.0061, + "step": 12981 + }, + { + "epoch": 5.27938186254575, + "grad_norm": 5.809545399288112, + "learning_rate": 1.3454717797025015e-05, + "loss": 0.0983, + "step": 12982 + }, + { + "epoch": 5.279788531923546, + "grad_norm": 4.9706554534214975, + "learning_rate": 1.345376697396343e-05, + "loss": 0.108, + "step": 12983 + }, + { + "epoch": 5.280195201301342, + "grad_norm": 3.8171384969381275, + "learning_rate": 1.3452816115447189e-05, + "loss": 0.1017, + "step": 12984 + }, + { + "epoch": 5.2806018706791376, + "grad_norm": 4.673968391666481, + "learning_rate": 1.3451865221486053e-05, + "loss": 0.0846, + "step": 12985 + }, + { + "epoch": 5.281008540056933, + "grad_norm": 4.995565816175946, + "learning_rate": 1.3450914292089781e-05, + "loss": 0.2022, + "step": 12986 + }, + { + "epoch": 5.28141520943473, + "grad_norm": 5.292364677440433, + "learning_rate": 1.3449963327268139e-05, + "loss": 0.2626, + "step": 12987 + }, + { + "epoch": 5.281821878812526, + "grad_norm": 5.657772936678706, + "learning_rate": 1.3449012327030889e-05, + "loss": 0.1218, + "step": 12988 + }, + { + "epoch": 5.2822285481903215, + "grad_norm": 0.1258701199170493, + "learning_rate": 1.3448061291387786e-05, + "loss": 0.0022, + "step": 12989 + }, + { + "epoch": 5.282635217568117, + "grad_norm": 0.06954091974215987, + "learning_rate": 1.3447110220348602e-05, + "loss": 0.0014, + "step": 12990 + }, + { + "epoch": 5.283041886945913, + "grad_norm": 7.972005381486975, + "learning_rate": 1.3446159113923099e-05, + "loss": 0.4495, + "step": 12991 + }, + { + "epoch": 5.283448556323709, + "grad_norm": 2.846873340677522, + "learning_rate": 1.3445207972121032e-05, + "loss": 0.0352, + "step": 12992 + }, + { + "epoch": 5.283855225701505, + "grad_norm": 5.006909854526323, + "learning_rate": 1.3444256794952177e-05, + "loss": 0.1113, + "step": 12993 + }, + { + "epoch": 5.2842618950793, + "grad_norm": 14.155312255248223, + "learning_rate": 1.3443305582426291e-05, + "loss": 0.7362, + "step": 12994 + }, + { + "epoch": 5.284668564457096, + "grad_norm": 0.11159978662474414, + "learning_rate": 1.3442354334553138e-05, + "loss": 0.002, + "step": 12995 + }, + { + "epoch": 5.285075233834892, + "grad_norm": 0.41588894125234693, + "learning_rate": 1.344140305134249e-05, + "loss": 0.0049, + "step": 12996 + }, + { + "epoch": 5.285481903212688, + "grad_norm": 0.06244485685614258, + "learning_rate": 1.3440451732804105e-05, + "loss": 0.0017, + "step": 12997 + }, + { + "epoch": 5.285888572590484, + "grad_norm": 0.12595081869862412, + "learning_rate": 1.3439500378947752e-05, + "loss": 0.0013, + "step": 12998 + }, + { + "epoch": 5.28629524196828, + "grad_norm": 2.9581491865375846, + "learning_rate": 1.3438548989783196e-05, + "loss": 0.0467, + "step": 12999 + }, + { + "epoch": 5.286701911346076, + "grad_norm": 6.61433787193942, + "learning_rate": 1.3437597565320204e-05, + "loss": 0.1327, + "step": 13000 + }, + { + "epoch": 5.287108580723872, + "grad_norm": 7.598174559766322, + "learning_rate": 1.343664610556854e-05, + "loss": 0.1497, + "step": 13001 + }, + { + "epoch": 5.287515250101667, + "grad_norm": 3.874806920814101, + "learning_rate": 1.343569461053798e-05, + "loss": 0.1945, + "step": 13002 + }, + { + "epoch": 
5.287921919479463, + "grad_norm": 7.727594772462493, + "learning_rate": 1.3434743080238282e-05, + "loss": 0.1916, + "step": 13003 + }, + { + "epoch": 5.288328588857259, + "grad_norm": 5.9374040291388095, + "learning_rate": 1.3433791514679215e-05, + "loss": 0.3113, + "step": 13004 + }, + { + "epoch": 5.288735258235055, + "grad_norm": 9.069053980286142, + "learning_rate": 1.3432839913870555e-05, + "loss": 0.4178, + "step": 13005 + }, + { + "epoch": 5.28914192761285, + "grad_norm": 5.546231182456995, + "learning_rate": 1.3431888277822065e-05, + "loss": 0.2505, + "step": 13006 + }, + { + "epoch": 5.289548596990647, + "grad_norm": 1.264742570695918, + "learning_rate": 1.3430936606543513e-05, + "loss": 0.0217, + "step": 13007 + }, + { + "epoch": 5.289955266368443, + "grad_norm": 0.6977423523320196, + "learning_rate": 1.3429984900044668e-05, + "loss": 0.0065, + "step": 13008 + }, + { + "epoch": 5.290361935746239, + "grad_norm": 6.309091855689445, + "learning_rate": 1.3429033158335305e-05, + "loss": 0.4194, + "step": 13009 + }, + { + "epoch": 5.290768605124034, + "grad_norm": 0.22667491599203052, + "learning_rate": 1.3428081381425191e-05, + "loss": 0.0021, + "step": 13010 + }, + { + "epoch": 5.29117527450183, + "grad_norm": 8.62749306006278, + "learning_rate": 1.3427129569324091e-05, + "loss": 0.2233, + "step": 13011 + }, + { + "epoch": 5.291581943879626, + "grad_norm": 12.906927542416728, + "learning_rate": 1.3426177722041786e-05, + "loss": 0.2528, + "step": 13012 + }, + { + "epoch": 5.291988613257422, + "grad_norm": 4.203034307729398, + "learning_rate": 1.3425225839588043e-05, + "loss": 0.2005, + "step": 13013 + }, + { + "epoch": 5.292395282635217, + "grad_norm": 6.175371993286095, + "learning_rate": 1.3424273921972631e-05, + "loss": 0.2024, + "step": 13014 + }, + { + "epoch": 5.292801952013013, + "grad_norm": 1.4318358991553746, + "learning_rate": 1.3423321969205324e-05, + "loss": 0.0225, + "step": 13015 + }, + { + "epoch": 5.293208621390809, + "grad_norm": 0.9032861970406919, + "learning_rate": 1.3422369981295895e-05, + "loss": 0.0116, + "step": 13016 + }, + { + "epoch": 5.293615290768605, + "grad_norm": 0.15099986738260254, + "learning_rate": 1.3421417958254113e-05, + "loss": 0.0037, + "step": 13017 + }, + { + "epoch": 5.294021960146401, + "grad_norm": 2.304456220959315, + "learning_rate": 1.3420465900089757e-05, + "loss": 0.0386, + "step": 13018 + }, + { + "epoch": 5.294428629524197, + "grad_norm": 10.540089344599942, + "learning_rate": 1.3419513806812594e-05, + "loss": 0.5115, + "step": 13019 + }, + { + "epoch": 5.294835298901993, + "grad_norm": 4.214903165413506, + "learning_rate": 1.3418561678432402e-05, + "loss": 0.1343, + "step": 13020 + }, + { + "epoch": 5.295241968279789, + "grad_norm": 0.2577245324777159, + "learning_rate": 1.3417609514958954e-05, + "loss": 0.0051, + "step": 13021 + }, + { + "epoch": 5.295648637657584, + "grad_norm": 8.137370785296714, + "learning_rate": 1.3416657316402026e-05, + "loss": 0.2251, + "step": 13022 + }, + { + "epoch": 5.29605530703538, + "grad_norm": 0.19146835003143942, + "learning_rate": 1.341570508277139e-05, + "loss": 0.0028, + "step": 13023 + }, + { + "epoch": 5.296461976413176, + "grad_norm": 9.914858041288907, + "learning_rate": 1.341475281407682e-05, + "loss": 0.4163, + "step": 13024 + }, + { + "epoch": 5.296868645790972, + "grad_norm": 11.74430159549349, + "learning_rate": 1.3413800510328096e-05, + "loss": 0.1733, + "step": 13025 + }, + { + "epoch": 5.2972753151687675, + "grad_norm": 0.041649571608811366, + "learning_rate": 
1.3412848171534991e-05, + "loss": 0.0006, + "step": 13026 + }, + { + "epoch": 5.297681984546563, + "grad_norm": 9.906195763761383, + "learning_rate": 1.3411895797707282e-05, + "loss": 0.264, + "step": 13027 + }, + { + "epoch": 5.29808865392436, + "grad_norm": 3.216405080793946, + "learning_rate": 1.3410943388854745e-05, + "loss": 0.0411, + "step": 13028 + }, + { + "epoch": 5.298495323302156, + "grad_norm": 2.181602655218515, + "learning_rate": 1.3409990944987159e-05, + "loss": 0.038, + "step": 13029 + }, + { + "epoch": 5.298901992679951, + "grad_norm": 0.638612277531457, + "learning_rate": 1.3409038466114296e-05, + "loss": 0.0124, + "step": 13030 + }, + { + "epoch": 5.299308662057747, + "grad_norm": 0.5448852663280904, + "learning_rate": 1.3408085952245942e-05, + "loss": 0.008, + "step": 13031 + }, + { + "epoch": 5.299715331435543, + "grad_norm": 17.503227396517204, + "learning_rate": 1.340713340339187e-05, + "loss": 0.7256, + "step": 13032 + }, + { + "epoch": 5.300122000813339, + "grad_norm": 0.43532860251112265, + "learning_rate": 1.3406180819561859e-05, + "loss": 0.0066, + "step": 13033 + }, + { + "epoch": 5.3005286701911345, + "grad_norm": 7.669467420212574, + "learning_rate": 1.3405228200765684e-05, + "loss": 0.2148, + "step": 13034 + }, + { + "epoch": 5.30093533956893, + "grad_norm": 6.588724979937031, + "learning_rate": 1.340427554701313e-05, + "loss": 0.2011, + "step": 13035 + }, + { + "epoch": 5.301342008946726, + "grad_norm": 8.47281613128715, + "learning_rate": 1.3403322858313974e-05, + "loss": 0.4212, + "step": 13036 + }, + { + "epoch": 5.301748678324522, + "grad_norm": 7.093423345429211, + "learning_rate": 1.3402370134677993e-05, + "loss": 0.172, + "step": 13037 + }, + { + "epoch": 5.302155347702318, + "grad_norm": 4.32769132629395, + "learning_rate": 1.3401417376114973e-05, + "loss": 0.1851, + "step": 13038 + }, + { + "epoch": 5.302562017080114, + "grad_norm": 2.852607087790468, + "learning_rate": 1.3400464582634694e-05, + "loss": 0.0359, + "step": 13039 + }, + { + "epoch": 5.30296868645791, + "grad_norm": 8.58099665437113, + "learning_rate": 1.3399511754246932e-05, + "loss": 0.4024, + "step": 13040 + }, + { + "epoch": 5.303375355835706, + "grad_norm": 3.651991876377091, + "learning_rate": 1.3398558890961471e-05, + "loss": 0.0718, + "step": 13041 + }, + { + "epoch": 5.3037820252135015, + "grad_norm": 0.07745321875248076, + "learning_rate": 1.3397605992788094e-05, + "loss": 0.001, + "step": 13042 + }, + { + "epoch": 5.304188694591297, + "grad_norm": 5.525267688346701, + "learning_rate": 1.339665305973658e-05, + "loss": 0.1535, + "step": 13043 + }, + { + "epoch": 5.304595363969093, + "grad_norm": 1.160648754365867, + "learning_rate": 1.3395700091816712e-05, + "loss": 0.016, + "step": 13044 + }, + { + "epoch": 5.305002033346889, + "grad_norm": 1.985863113138019, + "learning_rate": 1.3394747089038276e-05, + "loss": 0.04, + "step": 13045 + }, + { + "epoch": 5.305408702724685, + "grad_norm": 12.385900302579424, + "learning_rate": 1.3393794051411052e-05, + "loss": 0.294, + "step": 13046 + }, + { + "epoch": 5.30581537210248, + "grad_norm": 7.752306215775303, + "learning_rate": 1.3392840978944821e-05, + "loss": 0.4072, + "step": 13047 + }, + { + "epoch": 5.306222041480277, + "grad_norm": 5.4078781374523475, + "learning_rate": 1.3391887871649374e-05, + "loss": 0.0986, + "step": 13048 + }, + { + "epoch": 5.306628710858073, + "grad_norm": 3.4988873943985066, + "learning_rate": 1.3390934729534489e-05, + "loss": 0.0509, + "step": 13049 + }, + { + "epoch": 5.3070353802358685, + "grad_norm": 
3.411785917895592, + "learning_rate": 1.3389981552609952e-05, + "loss": 0.063, + "step": 13050 + }, + { + "epoch": 5.307442049613664, + "grad_norm": 1.2741618983039318, + "learning_rate": 1.3389028340885548e-05, + "loss": 0.0231, + "step": 13051 + }, + { + "epoch": 5.30784871899146, + "grad_norm": 9.804576474583213, + "learning_rate": 1.3388075094371064e-05, + "loss": 0.3213, + "step": 13052 + }, + { + "epoch": 5.308255388369256, + "grad_norm": 0.2882384600195626, + "learning_rate": 1.3387121813076282e-05, + "loss": 0.0034, + "step": 13053 + }, + { + "epoch": 5.308662057747052, + "grad_norm": 9.803634113170736, + "learning_rate": 1.3386168497010991e-05, + "loss": 0.2169, + "step": 13054 + }, + { + "epoch": 5.309068727124847, + "grad_norm": 6.2026070583042445, + "learning_rate": 1.3385215146184977e-05, + "loss": 0.1935, + "step": 13055 + }, + { + "epoch": 5.309475396502643, + "grad_norm": 0.11907463059479828, + "learning_rate": 1.3384261760608024e-05, + "loss": 0.0014, + "step": 13056 + }, + { + "epoch": 5.309882065880439, + "grad_norm": 1.2479391190022193, + "learning_rate": 1.3383308340289923e-05, + "loss": 0.0159, + "step": 13057 + }, + { + "epoch": 5.310288735258235, + "grad_norm": 0.07452966302003908, + "learning_rate": 1.3382354885240459e-05, + "loss": 0.0009, + "step": 13058 + }, + { + "epoch": 5.310695404636031, + "grad_norm": 3.616521156611488, + "learning_rate": 1.3381401395469415e-05, + "loss": 0.0859, + "step": 13059 + }, + { + "epoch": 5.311102074013827, + "grad_norm": 3.521954032951377, + "learning_rate": 1.3380447870986588e-05, + "loss": 0.1161, + "step": 13060 + }, + { + "epoch": 5.311508743391623, + "grad_norm": 0.2575763211895459, + "learning_rate": 1.3379494311801762e-05, + "loss": 0.0045, + "step": 13061 + }, + { + "epoch": 5.311915412769419, + "grad_norm": 5.559924695304741, + "learning_rate": 1.3378540717924725e-05, + "loss": 0.1403, + "step": 13062 + }, + { + "epoch": 5.312322082147214, + "grad_norm": 6.865459046411343, + "learning_rate": 1.3377587089365269e-05, + "loss": 0.3071, + "step": 13063 + }, + { + "epoch": 5.31272875152501, + "grad_norm": 13.562725828994546, + "learning_rate": 1.3376633426133179e-05, + "loss": 0.1492, + "step": 13064 + }, + { + "epoch": 5.313135420902806, + "grad_norm": 8.012519751513636, + "learning_rate": 1.337567972823825e-05, + "loss": 0.2322, + "step": 13065 + }, + { + "epoch": 5.313542090280602, + "grad_norm": 6.476473994694312, + "learning_rate": 1.3374725995690268e-05, + "loss": 0.2814, + "step": 13066 + }, + { + "epoch": 5.313948759658397, + "grad_norm": 8.60119149628637, + "learning_rate": 1.3373772228499027e-05, + "loss": 0.1836, + "step": 13067 + }, + { + "epoch": 5.314355429036193, + "grad_norm": 9.100999409320911, + "learning_rate": 1.3372818426674315e-05, + "loss": 0.147, + "step": 13068 + }, + { + "epoch": 5.31476209841399, + "grad_norm": 3.6352031701661214, + "learning_rate": 1.3371864590225922e-05, + "loss": 0.0623, + "step": 13069 + }, + { + "epoch": 5.315168767791786, + "grad_norm": 6.756491040492521, + "learning_rate": 1.3370910719163646e-05, + "loss": 0.1847, + "step": 13070 + }, + { + "epoch": 5.315575437169581, + "grad_norm": 5.66199497807707, + "learning_rate": 1.3369956813497272e-05, + "loss": 0.3616, + "step": 13071 + }, + { + "epoch": 5.315982106547377, + "grad_norm": 12.004403477281983, + "learning_rate": 1.3369002873236597e-05, + "loss": 0.1237, + "step": 13072 + }, + { + "epoch": 5.316388775925173, + "grad_norm": 5.918211827935633, + "learning_rate": 1.336804889839141e-05, + "loss": 0.1629, + "step": 13073 + }, 
+ { + "epoch": 5.316795445302969, + "grad_norm": 2.7066816439252808, + "learning_rate": 1.3367094888971508e-05, + "loss": 0.0595, + "step": 13074 + }, + { + "epoch": 5.317202114680764, + "grad_norm": 0.1670839443597461, + "learning_rate": 1.3366140844986678e-05, + "loss": 0.0037, + "step": 13075 + }, + { + "epoch": 5.31760878405856, + "grad_norm": 14.681992675672928, + "learning_rate": 1.3365186766446723e-05, + "loss": 0.2884, + "step": 13076 + }, + { + "epoch": 5.318015453436356, + "grad_norm": 8.580001574710371, + "learning_rate": 1.3364232653361433e-05, + "loss": 0.1981, + "step": 13077 + }, + { + "epoch": 5.318422122814152, + "grad_norm": 15.230352907542509, + "learning_rate": 1.33632785057406e-05, + "loss": 0.3993, + "step": 13078 + }, + { + "epoch": 5.3188287921919475, + "grad_norm": 11.042580554212554, + "learning_rate": 1.3362324323594016e-05, + "loss": 0.3409, + "step": 13079 + }, + { + "epoch": 5.319235461569744, + "grad_norm": 8.364251843877335, + "learning_rate": 1.3361370106931486e-05, + "loss": 0.2384, + "step": 13080 + }, + { + "epoch": 5.31964213094754, + "grad_norm": 0.2797547278284976, + "learning_rate": 1.3360415855762799e-05, + "loss": 0.0042, + "step": 13081 + }, + { + "epoch": 5.320048800325336, + "grad_norm": 8.068196946942157, + "learning_rate": 1.335946157009775e-05, + "loss": 0.1621, + "step": 13082 + }, + { + "epoch": 5.320455469703131, + "grad_norm": 0.12668189313273284, + "learning_rate": 1.335850724994614e-05, + "loss": 0.0014, + "step": 13083 + }, + { + "epoch": 5.320862139080927, + "grad_norm": 8.708632480782123, + "learning_rate": 1.3357552895317763e-05, + "loss": 0.1851, + "step": 13084 + }, + { + "epoch": 5.321268808458723, + "grad_norm": 2.13007058479947, + "learning_rate": 1.3356598506222412e-05, + "loss": 0.041, + "step": 13085 + }, + { + "epoch": 5.321675477836519, + "grad_norm": 8.139160220213517, + "learning_rate": 1.3355644082669889e-05, + "loss": 0.3455, + "step": 13086 + }, + { + "epoch": 5.3220821472143145, + "grad_norm": 5.985683377582242, + "learning_rate": 1.3354689624669993e-05, + "loss": 0.075, + "step": 13087 + }, + { + "epoch": 5.32248881659211, + "grad_norm": 2.68972319214091, + "learning_rate": 1.3353735132232517e-05, + "loss": 0.0874, + "step": 13088 + }, + { + "epoch": 5.322895485969907, + "grad_norm": 6.5658594682969165, + "learning_rate": 1.3352780605367263e-05, + "loss": 0.245, + "step": 13089 + }, + { + "epoch": 5.323302155347703, + "grad_norm": 5.601470466752454, + "learning_rate": 1.335182604408403e-05, + "loss": 0.2068, + "step": 13090 + }, + { + "epoch": 5.3237088247254984, + "grad_norm": 5.790388779221142, + "learning_rate": 1.3350871448392617e-05, + "loss": 0.118, + "step": 13091 + }, + { + "epoch": 5.324115494103294, + "grad_norm": 2.563854203109935, + "learning_rate": 1.3349916818302817e-05, + "loss": 0.0272, + "step": 13092 + }, + { + "epoch": 5.32452216348109, + "grad_norm": 7.330209109078526, + "learning_rate": 1.3348962153824439e-05, + "loss": 0.2907, + "step": 13093 + }, + { + "epoch": 5.324928832858886, + "grad_norm": 0.6193643662352097, + "learning_rate": 1.3348007454967277e-05, + "loss": 0.0099, + "step": 13094 + }, + { + "epoch": 5.3253355022366815, + "grad_norm": 0.18551991166652712, + "learning_rate": 1.3347052721741133e-05, + "loss": 0.0026, + "step": 13095 + }, + { + "epoch": 5.325742171614477, + "grad_norm": 9.548653822530301, + "learning_rate": 1.334609795415581e-05, + "loss": 0.4461, + "step": 13096 + }, + { + "epoch": 5.326148840992273, + "grad_norm": 34.16088467885132, + "learning_rate": 
1.3345143152221107e-05, + "loss": 1.4543, + "step": 13097 + }, + { + "epoch": 5.326555510370069, + "grad_norm": 4.405333776199246, + "learning_rate": 1.3344188315946827e-05, + "loss": 0.1147, + "step": 13098 + }, + { + "epoch": 5.326962179747865, + "grad_norm": 8.85702820458225, + "learning_rate": 1.3343233445342771e-05, + "loss": 0.3348, + "step": 13099 + }, + { + "epoch": 5.327368849125661, + "grad_norm": 6.930897581585539, + "learning_rate": 1.334227854041874e-05, + "loss": 0.2544, + "step": 13100 + }, + { + "epoch": 5.327775518503457, + "grad_norm": 7.042713497789301, + "learning_rate": 1.3341323601184536e-05, + "loss": 0.3022, + "step": 13101 + }, + { + "epoch": 5.328182187881253, + "grad_norm": 7.813446265417456, + "learning_rate": 1.3340368627649967e-05, + "loss": 0.2788, + "step": 13102 + }, + { + "epoch": 5.3285888572590485, + "grad_norm": 2.2360249666093215, + "learning_rate": 1.333941361982483e-05, + "loss": 0.1155, + "step": 13103 + }, + { + "epoch": 5.328995526636844, + "grad_norm": 1.7601052729409663, + "learning_rate": 1.3338458577718931e-05, + "loss": 0.0308, + "step": 13104 + }, + { + "epoch": 5.32940219601464, + "grad_norm": 0.08382863562813052, + "learning_rate": 1.3337503501342076e-05, + "loss": 0.0015, + "step": 13105 + }, + { + "epoch": 5.329808865392436, + "grad_norm": 2.249841371754292, + "learning_rate": 1.3336548390704069e-05, + "loss": 0.0294, + "step": 13106 + }, + { + "epoch": 5.330215534770232, + "grad_norm": 4.911148398692828, + "learning_rate": 1.333559324581471e-05, + "loss": 0.088, + "step": 13107 + }, + { + "epoch": 5.330622204148027, + "grad_norm": 8.077744681132565, + "learning_rate": 1.3334638066683812e-05, + "loss": 0.3236, + "step": 13108 + }, + { + "epoch": 5.331028873525823, + "grad_norm": 0.13725958331150895, + "learning_rate": 1.3333682853321174e-05, + "loss": 0.0024, + "step": 13109 + }, + { + "epoch": 5.33143554290362, + "grad_norm": 0.019387278020635584, + "learning_rate": 1.3332727605736604e-05, + "loss": 0.0004, + "step": 13110 + }, + { + "epoch": 5.3318422122814155, + "grad_norm": 8.315748454618337, + "learning_rate": 1.3331772323939907e-05, + "loss": 0.2672, + "step": 13111 + }, + { + "epoch": 5.332248881659211, + "grad_norm": 0.4320281532426509, + "learning_rate": 1.3330817007940891e-05, + "loss": 0.0074, + "step": 13112 + }, + { + "epoch": 5.332655551037007, + "grad_norm": 8.000139704500524, + "learning_rate": 1.3329861657749362e-05, + "loss": 0.1827, + "step": 13113 + }, + { + "epoch": 5.333062220414803, + "grad_norm": 7.494394173494909, + "learning_rate": 1.3328906273375124e-05, + "loss": 0.13, + "step": 13114 + }, + { + "epoch": 5.333468889792599, + "grad_norm": 7.603405004129516, + "learning_rate": 1.3327950854827992e-05, + "loss": 0.2434, + "step": 13115 + }, + { + "epoch": 5.333875559170394, + "grad_norm": 11.287189397587628, + "learning_rate": 1.3326995402117768e-05, + "loss": 0.4841, + "step": 13116 + }, + { + "epoch": 5.33428222854819, + "grad_norm": 0.1038365315227369, + "learning_rate": 1.332603991525426e-05, + "loss": 0.0024, + "step": 13117 + }, + { + "epoch": 5.334688897925986, + "grad_norm": 1.8535927763939222, + "learning_rate": 1.332508439424728e-05, + "loss": 0.0341, + "step": 13118 + }, + { + "epoch": 5.335095567303782, + "grad_norm": 6.322669478590794, + "learning_rate": 1.3324128839106637e-05, + "loss": 0.1777, + "step": 13119 + }, + { + "epoch": 5.335502236681577, + "grad_norm": 3.120138826336246, + "learning_rate": 1.3323173249842137e-05, + "loss": 0.0816, + "step": 13120 + }, + { + "epoch": 5.335908906059374, + 
"grad_norm": 7.837196718136573, + "learning_rate": 1.332221762646359e-05, + "loss": 0.4232, + "step": 13121 + }, + { + "epoch": 5.33631557543717, + "grad_norm": 0.03390841964251077, + "learning_rate": 1.3321261968980808e-05, + "loss": 0.0008, + "step": 13122 + }, + { + "epoch": 5.336722244814966, + "grad_norm": 7.233233890629221, + "learning_rate": 1.33203062774036e-05, + "loss": 0.2642, + "step": 13123 + }, + { + "epoch": 5.337128914192761, + "grad_norm": 0.975242138009538, + "learning_rate": 1.3319350551741776e-05, + "loss": 0.0223, + "step": 13124 + }, + { + "epoch": 5.337535583570557, + "grad_norm": 1.2213953336767358, + "learning_rate": 1.331839479200515e-05, + "loss": 0.0133, + "step": 13125 + }, + { + "epoch": 5.337942252948353, + "grad_norm": 0.4386190367139942, + "learning_rate": 1.3317438998203532e-05, + "loss": 0.0056, + "step": 13126 + }, + { + "epoch": 5.338348922326149, + "grad_norm": 8.208430920706874, + "learning_rate": 1.331648317034673e-05, + "loss": 0.3234, + "step": 13127 + }, + { + "epoch": 5.338755591703944, + "grad_norm": 8.654390429869434, + "learning_rate": 1.331552730844456e-05, + "loss": 0.5941, + "step": 13128 + }, + { + "epoch": 5.33916226108174, + "grad_norm": 12.785188067149251, + "learning_rate": 1.3314571412506836e-05, + "loss": 0.5768, + "step": 13129 + }, + { + "epoch": 5.339568930459537, + "grad_norm": 3.0078628660837077, + "learning_rate": 1.3313615482543366e-05, + "loss": 0.0571, + "step": 13130 + }, + { + "epoch": 5.339975599837333, + "grad_norm": 5.178657895815915, + "learning_rate": 1.3312659518563967e-05, + "loss": 0.0742, + "step": 13131 + }, + { + "epoch": 5.340382269215128, + "grad_norm": 5.504330781162086, + "learning_rate": 1.331170352057845e-05, + "loss": 0.2091, + "step": 13132 + }, + { + "epoch": 5.340788938592924, + "grad_norm": 0.05168101808860453, + "learning_rate": 1.3310747488596625e-05, + "loss": 0.001, + "step": 13133 + }, + { + "epoch": 5.34119560797072, + "grad_norm": 6.317010773916629, + "learning_rate": 1.3309791422628315e-05, + "loss": 0.135, + "step": 13134 + }, + { + "epoch": 5.341602277348516, + "grad_norm": 1.2227934718240565, + "learning_rate": 1.3308835322683333e-05, + "loss": 0.0222, + "step": 13135 + }, + { + "epoch": 5.342008946726311, + "grad_norm": 12.674245960468804, + "learning_rate": 1.3307879188771488e-05, + "loss": 0.6566, + "step": 13136 + }, + { + "epoch": 5.342415616104107, + "grad_norm": 6.321285515488566, + "learning_rate": 1.3306923020902597e-05, + "loss": 0.1188, + "step": 13137 + }, + { + "epoch": 5.342822285481903, + "grad_norm": 4.359226293488608, + "learning_rate": 1.330596681908648e-05, + "loss": 0.0927, + "step": 13138 + }, + { + "epoch": 5.343228954859699, + "grad_norm": 14.056288920035277, + "learning_rate": 1.3305010583332949e-05, + "loss": 0.1126, + "step": 13139 + }, + { + "epoch": 5.3436356242374945, + "grad_norm": 0.09692117069331929, + "learning_rate": 1.3304054313651817e-05, + "loss": 0.0018, + "step": 13140 + }, + { + "epoch": 5.344042293615291, + "grad_norm": 0.4969822365507718, + "learning_rate": 1.330309801005291e-05, + "loss": 0.0066, + "step": 13141 + }, + { + "epoch": 5.344448962993087, + "grad_norm": 3.3260739829228734, + "learning_rate": 1.3302141672546039e-05, + "loss": 0.1953, + "step": 13142 + }, + { + "epoch": 5.344855632370883, + "grad_norm": 0.15987219351272558, + "learning_rate": 1.3301185301141017e-05, + "loss": 0.0026, + "step": 13143 + }, + { + "epoch": 5.3452623017486784, + "grad_norm": 4.243658994254248, + "learning_rate": 1.3300228895847671e-05, + "loss": 0.1591, + 
"step": 13144 + }, + { + "epoch": 5.345668971126474, + "grad_norm": 12.706477455117152, + "learning_rate": 1.3299272456675813e-05, + "loss": 0.7209, + "step": 13145 + }, + { + "epoch": 5.34607564050427, + "grad_norm": 1.4475356503567451, + "learning_rate": 1.3298315983635262e-05, + "loss": 0.0174, + "step": 13146 + }, + { + "epoch": 5.346482309882066, + "grad_norm": 0.8213944023296159, + "learning_rate": 1.3297359476735839e-05, + "loss": 0.0129, + "step": 13147 + }, + { + "epoch": 5.3468889792598615, + "grad_norm": 6.037864544033442, + "learning_rate": 1.329640293598736e-05, + "loss": 0.1203, + "step": 13148 + }, + { + "epoch": 5.347295648637657, + "grad_norm": 6.488258702108625, + "learning_rate": 1.3295446361399647e-05, + "loss": 0.1767, + "step": 13149 + }, + { + "epoch": 5.347702318015453, + "grad_norm": 29.528511188802412, + "learning_rate": 1.3294489752982518e-05, + "loss": 0.3245, + "step": 13150 + }, + { + "epoch": 5.34810898739325, + "grad_norm": 1.3016522967119308, + "learning_rate": 1.3293533110745795e-05, + "loss": 0.0222, + "step": 13151 + }, + { + "epoch": 5.3485156567710455, + "grad_norm": 13.981744664046273, + "learning_rate": 1.3292576434699294e-05, + "loss": 0.2023, + "step": 13152 + }, + { + "epoch": 5.348922326148841, + "grad_norm": 2.781361667656714, + "learning_rate": 1.329161972485284e-05, + "loss": 0.0398, + "step": 13153 + }, + { + "epoch": 5.349328995526637, + "grad_norm": 0.2150055414968394, + "learning_rate": 1.3290662981216255e-05, + "loss": 0.003, + "step": 13154 + }, + { + "epoch": 5.349735664904433, + "grad_norm": 3.277710239819122, + "learning_rate": 1.3289706203799357e-05, + "loss": 0.0497, + "step": 13155 + }, + { + "epoch": 5.3501423342822285, + "grad_norm": 2.2083963659563888, + "learning_rate": 1.3288749392611968e-05, + "loss": 0.0442, + "step": 13156 + }, + { + "epoch": 5.350549003660024, + "grad_norm": 17.418353314976226, + "learning_rate": 1.328779254766391e-05, + "loss": 0.5136, + "step": 13157 + }, + { + "epoch": 5.35095567303782, + "grad_norm": 5.836472060280888, + "learning_rate": 1.3286835668965013e-05, + "loss": 0.259, + "step": 13158 + }, + { + "epoch": 5.351362342415616, + "grad_norm": 3.8978658116737686, + "learning_rate": 1.3285878756525086e-05, + "loss": 0.0655, + "step": 13159 + }, + { + "epoch": 5.351769011793412, + "grad_norm": 5.313278616180434, + "learning_rate": 1.3284921810353964e-05, + "loss": 0.0616, + "step": 13160 + }, + { + "epoch": 5.352175681171207, + "grad_norm": 9.430883747120372, + "learning_rate": 1.3283964830461466e-05, + "loss": 0.2116, + "step": 13161 + }, + { + "epoch": 5.352582350549004, + "grad_norm": 23.101983111857713, + "learning_rate": 1.3283007816857416e-05, + "loss": 0.8261, + "step": 13162 + }, + { + "epoch": 5.3529890199268, + "grad_norm": 2.769339390294576, + "learning_rate": 1.3282050769551638e-05, + "loss": 0.0295, + "step": 13163 + }, + { + "epoch": 5.3533956893045955, + "grad_norm": 3.2958899257073164, + "learning_rate": 1.3281093688553957e-05, + "loss": 0.0871, + "step": 13164 + }, + { + "epoch": 5.353802358682391, + "grad_norm": 3.1850369698461356, + "learning_rate": 1.3280136573874199e-05, + "loss": 0.0655, + "step": 13165 + }, + { + "epoch": 5.354209028060187, + "grad_norm": 3.157427185876307, + "learning_rate": 1.3279179425522186e-05, + "loss": 0.0703, + "step": 13166 + }, + { + "epoch": 5.354615697437983, + "grad_norm": 1.412460318494312, + "learning_rate": 1.3278222243507748e-05, + "loss": 0.0138, + "step": 13167 + }, + { + "epoch": 5.355022366815779, + "grad_norm": 4.386904493493655, + 
"learning_rate": 1.327726502784071e-05, + "loss": 0.2073, + "step": 13168 + }, + { + "epoch": 5.355429036193574, + "grad_norm": 1.860761192609514, + "learning_rate": 1.3276307778530892e-05, + "loss": 0.0319, + "step": 13169 + }, + { + "epoch": 5.35583570557137, + "grad_norm": 4.873108434718305, + "learning_rate": 1.327535049558813e-05, + "loss": 0.0931, + "step": 13170 + }, + { + "epoch": 5.356242374949167, + "grad_norm": 2.8202268609421286, + "learning_rate": 1.3274393179022247e-05, + "loss": 0.0453, + "step": 13171 + }, + { + "epoch": 5.3566490443269625, + "grad_norm": 1.947606407365337, + "learning_rate": 1.3273435828843067e-05, + "loss": 0.0366, + "step": 13172 + }, + { + "epoch": 5.357055713704758, + "grad_norm": 2.7912356915216106, + "learning_rate": 1.3272478445060423e-05, + "loss": 0.0399, + "step": 13173 + }, + { + "epoch": 5.357462383082554, + "grad_norm": 0.919387940600492, + "learning_rate": 1.327152102768414e-05, + "loss": 0.0175, + "step": 13174 + }, + { + "epoch": 5.35786905246035, + "grad_norm": 3.608731379952578, + "learning_rate": 1.3270563576724044e-05, + "loss": 0.4097, + "step": 13175 + }, + { + "epoch": 5.358275721838146, + "grad_norm": 0.590726826942444, + "learning_rate": 1.3269606092189972e-05, + "loss": 0.0081, + "step": 13176 + }, + { + "epoch": 5.358682391215941, + "grad_norm": 8.698870385632214, + "learning_rate": 1.3268648574091747e-05, + "loss": 0.2582, + "step": 13177 + }, + { + "epoch": 5.359089060593737, + "grad_norm": 1.610400389132941, + "learning_rate": 1.3267691022439197e-05, + "loss": 0.0078, + "step": 13178 + }, + { + "epoch": 5.359495729971533, + "grad_norm": 3.4182296798490555, + "learning_rate": 1.3266733437242156e-05, + "loss": 0.0481, + "step": 13179 + }, + { + "epoch": 5.359902399349329, + "grad_norm": 0.5607806317954593, + "learning_rate": 1.3265775818510452e-05, + "loss": 0.0092, + "step": 13180 + }, + { + "epoch": 5.360309068727124, + "grad_norm": 7.5686476762065995, + "learning_rate": 1.3264818166253917e-05, + "loss": 0.1878, + "step": 13181 + }, + { + "epoch": 5.360715738104921, + "grad_norm": 6.200185569775657, + "learning_rate": 1.3263860480482376e-05, + "loss": 0.1148, + "step": 13182 + }, + { + "epoch": 5.361122407482717, + "grad_norm": 14.625806189950262, + "learning_rate": 1.3262902761205667e-05, + "loss": 0.1883, + "step": 13183 + }, + { + "epoch": 5.361529076860513, + "grad_norm": 9.538606089628319, + "learning_rate": 1.3261945008433621e-05, + "loss": 0.275, + "step": 13184 + }, + { + "epoch": 5.361935746238308, + "grad_norm": 0.1477239945782097, + "learning_rate": 1.3260987222176063e-05, + "loss": 0.0018, + "step": 13185 + }, + { + "epoch": 5.362342415616104, + "grad_norm": 4.57641619043561, + "learning_rate": 1.3260029402442834e-05, + "loss": 0.1452, + "step": 13186 + }, + { + "epoch": 5.3627490849939, + "grad_norm": 5.677891998494635, + "learning_rate": 1.325907154924376e-05, + "loss": 0.2458, + "step": 13187 + }, + { + "epoch": 5.363155754371696, + "grad_norm": 0.7000914682737314, + "learning_rate": 1.3258113662588677e-05, + "loss": 0.0131, + "step": 13188 + }, + { + "epoch": 5.363562423749491, + "grad_norm": 3.84158047234843, + "learning_rate": 1.3257155742487417e-05, + "loss": 0.0593, + "step": 13189 + }, + { + "epoch": 5.363969093127287, + "grad_norm": 8.416675017872008, + "learning_rate": 1.3256197788949816e-05, + "loss": 0.1923, + "step": 13190 + }, + { + "epoch": 5.364375762505083, + "grad_norm": 2.9249016932368033, + "learning_rate": 1.3255239801985704e-05, + "loss": 0.0547, + "step": 13191 + }, + { + "epoch": 
5.36478243188288, + "grad_norm": 2.931444918906784, + "learning_rate": 1.3254281781604918e-05, + "loss": 0.1359, + "step": 13192 + }, + { + "epoch": 5.365189101260675, + "grad_norm": 7.546362192526891, + "learning_rate": 1.3253323727817294e-05, + "loss": 0.1924, + "step": 13193 + }, + { + "epoch": 5.365595770638471, + "grad_norm": 6.918018944131174, + "learning_rate": 1.3252365640632663e-05, + "loss": 0.2098, + "step": 13194 + }, + { + "epoch": 5.366002440016267, + "grad_norm": 8.052043794280529, + "learning_rate": 1.325140752006086e-05, + "loss": 0.2488, + "step": 13195 + }, + { + "epoch": 5.366409109394063, + "grad_norm": 0.01615785469543548, + "learning_rate": 1.3250449366111721e-05, + "loss": 0.0003, + "step": 13196 + }, + { + "epoch": 5.3668157787718584, + "grad_norm": 7.425399127598807, + "learning_rate": 1.3249491178795089e-05, + "loss": 0.2427, + "step": 13197 + }, + { + "epoch": 5.367222448149654, + "grad_norm": 0.31270420132258164, + "learning_rate": 1.324853295812079e-05, + "loss": 0.0027, + "step": 13198 + }, + { + "epoch": 5.36762911752745, + "grad_norm": 11.518733266017783, + "learning_rate": 1.3247574704098667e-05, + "loss": 0.3935, + "step": 13199 + }, + { + "epoch": 5.368035786905246, + "grad_norm": 8.557684812650777, + "learning_rate": 1.3246616416738557e-05, + "loss": 0.2268, + "step": 13200 + }, + { + "epoch": 5.3684424562830415, + "grad_norm": 12.444407325090236, + "learning_rate": 1.3245658096050291e-05, + "loss": 0.8854, + "step": 13201 + }, + { + "epoch": 5.368849125660837, + "grad_norm": 0.07105415742271007, + "learning_rate": 1.3244699742043714e-05, + "loss": 0.0014, + "step": 13202 + }, + { + "epoch": 5.369255795038634, + "grad_norm": 8.408299044096294, + "learning_rate": 1.324374135472866e-05, + "loss": 0.2727, + "step": 13203 + }, + { + "epoch": 5.36966246441643, + "grad_norm": 4.30918677575536, + "learning_rate": 1.3242782934114968e-05, + "loss": 0.1433, + "step": 13204 + }, + { + "epoch": 5.3700691337942255, + "grad_norm": 11.304863478234, + "learning_rate": 1.324182448021248e-05, + "loss": 0.311, + "step": 13205 + }, + { + "epoch": 5.370475803172021, + "grad_norm": 2.6160995024300755, + "learning_rate": 1.324086599303103e-05, + "loss": 0.0346, + "step": 13206 + }, + { + "epoch": 5.370882472549817, + "grad_norm": 2.07095221083517, + "learning_rate": 1.3239907472580458e-05, + "loss": 0.0715, + "step": 13207 + }, + { + "epoch": 5.371289141927613, + "grad_norm": 6.139680454600861, + "learning_rate": 1.3238948918870609e-05, + "loss": 0.3607, + "step": 13208 + }, + { + "epoch": 5.3716958113054085, + "grad_norm": 1.4650724141639673, + "learning_rate": 1.3237990331911318e-05, + "loss": 0.0295, + "step": 13209 + }, + { + "epoch": 5.372102480683204, + "grad_norm": 5.942797318109464, + "learning_rate": 1.3237031711712422e-05, + "loss": 0.2808, + "step": 13210 + }, + { + "epoch": 5.372509150061, + "grad_norm": 4.946166558739809, + "learning_rate": 1.3236073058283771e-05, + "loss": 0.496, + "step": 13211 + }, + { + "epoch": 5.372915819438797, + "grad_norm": 21.471870983991163, + "learning_rate": 1.32351143716352e-05, + "loss": 0.214, + "step": 13212 + }, + { + "epoch": 5.3733224888165925, + "grad_norm": 9.634810971691381, + "learning_rate": 1.3234155651776555e-05, + "loss": 0.5446, + "step": 13213 + }, + { + "epoch": 5.373729158194388, + "grad_norm": 5.463739626062729, + "learning_rate": 1.3233196898717671e-05, + "loss": 0.1218, + "step": 13214 + }, + { + "epoch": 5.374135827572184, + "grad_norm": 5.2329528934344784, + "learning_rate": 1.3232238112468392e-05, + "loss": 
0.0837, + "step": 13215 + }, + { + "epoch": 5.37454249694998, + "grad_norm": 7.173934617668481, + "learning_rate": 1.3231279293038565e-05, + "loss": 0.16, + "step": 13216 + }, + { + "epoch": 5.3749491663277755, + "grad_norm": 6.352825434399872, + "learning_rate": 1.323032044043803e-05, + "loss": 0.4954, + "step": 13217 + }, + { + "epoch": 5.375355835705571, + "grad_norm": 0.6246428562964904, + "learning_rate": 1.3229361554676628e-05, + "loss": 0.0097, + "step": 13218 + }, + { + "epoch": 5.375762505083367, + "grad_norm": 3.2713227849879436, + "learning_rate": 1.3228402635764206e-05, + "loss": 0.0452, + "step": 13219 + }, + { + "epoch": 5.376169174461163, + "grad_norm": 5.154430335638133, + "learning_rate": 1.3227443683710603e-05, + "loss": 0.2641, + "step": 13220 + }, + { + "epoch": 5.376575843838959, + "grad_norm": 7.823662276915095, + "learning_rate": 1.322648469852567e-05, + "loss": 0.3961, + "step": 13221 + }, + { + "epoch": 5.376982513216754, + "grad_norm": 8.275949760420438, + "learning_rate": 1.3225525680219247e-05, + "loss": 0.282, + "step": 13222 + }, + { + "epoch": 5.377389182594551, + "grad_norm": 0.6996019631053241, + "learning_rate": 1.3224566628801179e-05, + "loss": 0.0102, + "step": 13223 + }, + { + "epoch": 5.377795851972347, + "grad_norm": 8.921145248783288, + "learning_rate": 1.322360754428131e-05, + "loss": 0.3407, + "step": 13224 + }, + { + "epoch": 5.3782025213501425, + "grad_norm": 0.9401211708343579, + "learning_rate": 1.3222648426669492e-05, + "loss": 0.0153, + "step": 13225 + }, + { + "epoch": 5.378609190727938, + "grad_norm": 7.0983407723671315, + "learning_rate": 1.3221689275975563e-05, + "loss": 0.2713, + "step": 13226 + }, + { + "epoch": 5.379015860105734, + "grad_norm": 0.49968414698858776, + "learning_rate": 1.3220730092209368e-05, + "loss": 0.0056, + "step": 13227 + }, + { + "epoch": 5.37942252948353, + "grad_norm": 11.758075600730756, + "learning_rate": 1.3219770875380765e-05, + "loss": 0.444, + "step": 13228 + }, + { + "epoch": 5.379829198861326, + "grad_norm": 2.9176259616861104, + "learning_rate": 1.3218811625499587e-05, + "loss": 0.0364, + "step": 13229 + }, + { + "epoch": 5.380235868239121, + "grad_norm": 21.276582495929393, + "learning_rate": 1.3217852342575692e-05, + "loss": 0.3526, + "step": 13230 + }, + { + "epoch": 5.380642537616917, + "grad_norm": 0.28483573371636006, + "learning_rate": 1.3216893026618921e-05, + "loss": 0.0039, + "step": 13231 + }, + { + "epoch": 5.381049206994713, + "grad_norm": 7.983658641109485, + "learning_rate": 1.3215933677639126e-05, + "loss": 0.2136, + "step": 13232 + }, + { + "epoch": 5.3814558763725096, + "grad_norm": 11.860671264176263, + "learning_rate": 1.321497429564615e-05, + "loss": 0.4732, + "step": 13233 + }, + { + "epoch": 5.381862545750305, + "grad_norm": 12.647715123944563, + "learning_rate": 1.3214014880649848e-05, + "loss": 0.56, + "step": 13234 + }, + { + "epoch": 5.382269215128101, + "grad_norm": 0.21118160514214526, + "learning_rate": 1.3213055432660066e-05, + "loss": 0.0041, + "step": 13235 + }, + { + "epoch": 5.382675884505897, + "grad_norm": 10.211028204048423, + "learning_rate": 1.321209595168665e-05, + "loss": 0.3238, + "step": 13236 + }, + { + "epoch": 5.383082553883693, + "grad_norm": 7.146869382941554, + "learning_rate": 1.3211136437739454e-05, + "loss": 0.1415, + "step": 13237 + }, + { + "epoch": 5.383489223261488, + "grad_norm": 4.77848016324653, + "learning_rate": 1.3210176890828325e-05, + "loss": 0.1534, + "step": 13238 + }, + { + "epoch": 5.383895892639284, + "grad_norm": 6.203320147792176, + 
"learning_rate": 1.3209217310963119e-05, + "loss": 0.1704, + "step": 13239 + }, + { + "epoch": 5.38430256201708, + "grad_norm": 19.4295964216844, + "learning_rate": 1.3208257698153677e-05, + "loss": 0.2418, + "step": 13240 + }, + { + "epoch": 5.384709231394876, + "grad_norm": 5.991323600897362, + "learning_rate": 1.3207298052409858e-05, + "loss": 0.0561, + "step": 13241 + }, + { + "epoch": 5.385115900772671, + "grad_norm": 10.490382700759108, + "learning_rate": 1.3206338373741509e-05, + "loss": 0.4045, + "step": 13242 + }, + { + "epoch": 5.385522570150467, + "grad_norm": 3.716427445907565, + "learning_rate": 1.3205378662158483e-05, + "loss": 0.0713, + "step": 13243 + }, + { + "epoch": 5.385929239528264, + "grad_norm": 5.822456167694081, + "learning_rate": 1.3204418917670632e-05, + "loss": 0.1466, + "step": 13244 + }, + { + "epoch": 5.38633590890606, + "grad_norm": 7.134486208487375, + "learning_rate": 1.3203459140287809e-05, + "loss": 0.2161, + "step": 13245 + }, + { + "epoch": 5.386742578283855, + "grad_norm": 2.266201069812964, + "learning_rate": 1.3202499330019862e-05, + "loss": 0.1627, + "step": 13246 + }, + { + "epoch": 5.387149247661651, + "grad_norm": 7.947247536075684, + "learning_rate": 1.3201539486876651e-05, + "loss": 0.2524, + "step": 13247 + }, + { + "epoch": 5.387555917039447, + "grad_norm": 9.373596824601712, + "learning_rate": 1.3200579610868025e-05, + "loss": 0.3268, + "step": 13248 + }, + { + "epoch": 5.387962586417243, + "grad_norm": 6.695409365714717, + "learning_rate": 1.319961970200384e-05, + "loss": 0.1656, + "step": 13249 + }, + { + "epoch": 5.3883692557950384, + "grad_norm": 19.641223855095593, + "learning_rate": 1.3198659760293948e-05, + "loss": 0.5871, + "step": 13250 + }, + { + "epoch": 5.388775925172834, + "grad_norm": 4.02615576151274, + "learning_rate": 1.3197699785748204e-05, + "loss": 0.1177, + "step": 13251 + }, + { + "epoch": 5.38918259455063, + "grad_norm": 22.952970095242243, + "learning_rate": 1.319673977837646e-05, + "loss": 0.5106, + "step": 13252 + }, + { + "epoch": 5.389589263928427, + "grad_norm": 0.016356794312513287, + "learning_rate": 1.3195779738188574e-05, + "loss": 0.0003, + "step": 13253 + }, + { + "epoch": 5.389995933306222, + "grad_norm": 8.028057795054815, + "learning_rate": 1.3194819665194402e-05, + "loss": 0.1346, + "step": 13254 + }, + { + "epoch": 5.390402602684018, + "grad_norm": 0.6069659906711432, + "learning_rate": 1.3193859559403796e-05, + "loss": 0.0104, + "step": 13255 + }, + { + "epoch": 5.390809272061814, + "grad_norm": 9.613178742446213, + "learning_rate": 1.3192899420826616e-05, + "loss": 0.2644, + "step": 13256 + }, + { + "epoch": 5.39121594143961, + "grad_norm": 6.160786849832562, + "learning_rate": 1.3191939249472716e-05, + "loss": 0.0445, + "step": 13257 + }, + { + "epoch": 5.3916226108174055, + "grad_norm": 8.693195588704729, + "learning_rate": 1.3190979045351955e-05, + "loss": 0.2374, + "step": 13258 + }, + { + "epoch": 5.392029280195201, + "grad_norm": 5.096202455775282, + "learning_rate": 1.3190018808474183e-05, + "loss": 0.1265, + "step": 13259 + }, + { + "epoch": 5.392435949572997, + "grad_norm": 9.855499141659863, + "learning_rate": 1.3189058538849266e-05, + "loss": 0.7657, + "step": 13260 + }, + { + "epoch": 5.392842618950793, + "grad_norm": 19.51092259784599, + "learning_rate": 1.3188098236487058e-05, + "loss": 0.1521, + "step": 13261 + }, + { + "epoch": 5.3932492883285885, + "grad_norm": 0.5228211841536974, + "learning_rate": 1.3187137901397415e-05, + "loss": 0.0095, + "step": 13262 + }, + { + "epoch": 
5.393655957706384, + "grad_norm": 15.903090016961293, + "learning_rate": 1.31861775335902e-05, + "loss": 0.5383, + "step": 13263 + }, + { + "epoch": 5.394062627084181, + "grad_norm": 8.074836354841429, + "learning_rate": 1.318521713307527e-05, + "loss": 0.4179, + "step": 13264 + }, + { + "epoch": 5.394469296461977, + "grad_norm": 0.17605475238039234, + "learning_rate": 1.3184256699862478e-05, + "loss": 0.0032, + "step": 13265 + }, + { + "epoch": 5.3948759658397725, + "grad_norm": 2.3037952171534584, + "learning_rate": 1.3183296233961693e-05, + "loss": 0.0421, + "step": 13266 + }, + { + "epoch": 5.395282635217568, + "grad_norm": 3.9117722491014715, + "learning_rate": 1.318233573538277e-05, + "loss": 0.1093, + "step": 13267 + }, + { + "epoch": 5.395689304595364, + "grad_norm": 5.6990192397119905, + "learning_rate": 1.3181375204135564e-05, + "loss": 0.1427, + "step": 13268 + }, + { + "epoch": 5.39609597397316, + "grad_norm": 4.154776759951168, + "learning_rate": 1.3180414640229946e-05, + "loss": 0.0814, + "step": 13269 + }, + { + "epoch": 5.3965026433509555, + "grad_norm": 4.148141131976786, + "learning_rate": 1.317945404367577e-05, + "loss": 0.0777, + "step": 13270 + }, + { + "epoch": 5.396909312728751, + "grad_norm": 9.342511769008969, + "learning_rate": 1.3178493414482898e-05, + "loss": 0.2739, + "step": 13271 + }, + { + "epoch": 5.397315982106547, + "grad_norm": 12.839531922710075, + "learning_rate": 1.3177532752661188e-05, + "loss": 0.8234, + "step": 13272 + }, + { + "epoch": 5.397722651484343, + "grad_norm": 0.8740874215225063, + "learning_rate": 1.317657205822051e-05, + "loss": 0.0147, + "step": 13273 + }, + { + "epoch": 5.3981293208621395, + "grad_norm": 1.967496076412841, + "learning_rate": 1.3175611331170719e-05, + "loss": 0.0354, + "step": 13274 + }, + { + "epoch": 5.398535990239935, + "grad_norm": 0.21115567360011803, + "learning_rate": 1.3174650571521677e-05, + "loss": 0.0039, + "step": 13275 + }, + { + "epoch": 5.398942659617731, + "grad_norm": 22.989266504258772, + "learning_rate": 1.3173689779283251e-05, + "loss": 0.1514, + "step": 13276 + }, + { + "epoch": 5.399349328995527, + "grad_norm": 1.0192612268965018, + "learning_rate": 1.3172728954465304e-05, + "loss": 0.0314, + "step": 13277 + }, + { + "epoch": 5.3997559983733225, + "grad_norm": 12.770377476344985, + "learning_rate": 1.3171768097077695e-05, + "loss": 0.7547, + "step": 13278 + }, + { + "epoch": 5.400162667751118, + "grad_norm": 7.748013021651427, + "learning_rate": 1.3170807207130291e-05, + "loss": 0.2426, + "step": 13279 + }, + { + "epoch": 5.400569337128914, + "grad_norm": 8.841222106701029, + "learning_rate": 1.3169846284632955e-05, + "loss": 0.2321, + "step": 13280 + }, + { + "epoch": 5.40097600650671, + "grad_norm": 4.393202525064481, + "learning_rate": 1.316888532959555e-05, + "loss": 0.127, + "step": 13281 + }, + { + "epoch": 5.401382675884506, + "grad_norm": 8.086992253051168, + "learning_rate": 1.3167924342027947e-05, + "loss": 0.3316, + "step": 13282 + }, + { + "epoch": 5.401789345262301, + "grad_norm": 2.4014565061987434, + "learning_rate": 1.3166963321940003e-05, + "loss": 0.0527, + "step": 13283 + }, + { + "epoch": 5.402196014640097, + "grad_norm": 0.4606692956652088, + "learning_rate": 1.3166002269341588e-05, + "loss": 0.0091, + "step": 13284 + }, + { + "epoch": 5.402602684017894, + "grad_norm": 11.118421775317222, + "learning_rate": 1.3165041184242562e-05, + "loss": 0.707, + "step": 13285 + }, + { + "epoch": 5.4030093533956896, + "grad_norm": 8.360196290269196, + "learning_rate": 
1.3164080066652801e-05, + "loss": 0.2909, + "step": 13286 + }, + { + "epoch": 5.403416022773485, + "grad_norm": 18.14115183292792, + "learning_rate": 1.3163118916582162e-05, + "loss": 0.6388, + "step": 13287 + }, + { + "epoch": 5.403822692151281, + "grad_norm": 12.570482024246388, + "learning_rate": 1.3162157734040515e-05, + "loss": 0.7011, + "step": 13288 + }, + { + "epoch": 5.404229361529077, + "grad_norm": 1.425950770392387, + "learning_rate": 1.3161196519037729e-05, + "loss": 0.0222, + "step": 13289 + }, + { + "epoch": 5.404636030906873, + "grad_norm": 2.3329738363575956, + "learning_rate": 1.316023527158367e-05, + "loss": 0.0523, + "step": 13290 + }, + { + "epoch": 5.405042700284668, + "grad_norm": 8.123307126579878, + "learning_rate": 1.3159273991688205e-05, + "loss": 0.2061, + "step": 13291 + }, + { + "epoch": 5.405449369662464, + "grad_norm": 0.4899319905675238, + "learning_rate": 1.3158312679361202e-05, + "loss": 0.0091, + "step": 13292 + }, + { + "epoch": 5.40585603904026, + "grad_norm": 6.136805453012753, + "learning_rate": 1.3157351334612532e-05, + "loss": 0.1467, + "step": 13293 + }, + { + "epoch": 5.406262708418057, + "grad_norm": 8.238104257179103, + "learning_rate": 1.3156389957452059e-05, + "loss": 0.3752, + "step": 13294 + }, + { + "epoch": 5.406669377795852, + "grad_norm": 7.147096731413703, + "learning_rate": 1.3155428547889653e-05, + "loss": 0.2363, + "step": 13295 + }, + { + "epoch": 5.407076047173648, + "grad_norm": 0.4011026041784135, + "learning_rate": 1.315446710593519e-05, + "loss": 0.0079, + "step": 13296 + }, + { + "epoch": 5.407482716551444, + "grad_norm": 2.589908308695359, + "learning_rate": 1.315350563159853e-05, + "loss": 0.1313, + "step": 13297 + }, + { + "epoch": 5.40788938592924, + "grad_norm": 100.37419922154146, + "learning_rate": 1.315254412488955e-05, + "loss": 0.5025, + "step": 13298 + }, + { + "epoch": 5.408296055307035, + "grad_norm": 6.338832230982837, + "learning_rate": 1.3151582585818119e-05, + "loss": 0.209, + "step": 13299 + }, + { + "epoch": 5.408702724684831, + "grad_norm": 5.187796308421306, + "learning_rate": 1.3150621014394103e-05, + "loss": 0.2055, + "step": 13300 + }, + { + "epoch": 5.409109394062627, + "grad_norm": 11.307667166545665, + "learning_rate": 1.3149659410627378e-05, + "loss": 0.3713, + "step": 13301 + }, + { + "epoch": 5.409516063440423, + "grad_norm": 9.145592127002468, + "learning_rate": 1.3148697774527816e-05, + "loss": 0.1963, + "step": 13302 + }, + { + "epoch": 5.4099227328182184, + "grad_norm": 0.6396219294798343, + "learning_rate": 1.3147736106105284e-05, + "loss": 0.0089, + "step": 13303 + }, + { + "epoch": 5.410329402196014, + "grad_norm": 0.2786449004909928, + "learning_rate": 1.3146774405369658e-05, + "loss": 0.0057, + "step": 13304 + }, + { + "epoch": 5.410736071573811, + "grad_norm": 5.855628738848351, + "learning_rate": 1.314581267233081e-05, + "loss": 0.1426, + "step": 13305 + }, + { + "epoch": 5.411142740951607, + "grad_norm": 3.80189928401076, + "learning_rate": 1.314485090699861e-05, + "loss": 0.0906, + "step": 13306 + }, + { + "epoch": 5.411549410329402, + "grad_norm": 9.842776624015006, + "learning_rate": 1.314388910938293e-05, + "loss": 0.4124, + "step": 13307 + }, + { + "epoch": 5.411956079707198, + "grad_norm": 10.932975190635984, + "learning_rate": 1.3142927279493649e-05, + "loss": 0.4385, + "step": 13308 + }, + { + "epoch": 5.412362749084994, + "grad_norm": 7.940588517899948, + "learning_rate": 1.314196541734064e-05, + "loss": 0.2916, + "step": 13309 + }, + { + "epoch": 5.41276941846279, + 
"grad_norm": 11.137838382295394, + "learning_rate": 1.314100352293377e-05, + "loss": 0.2564, + "step": 13310 + }, + { + "epoch": 5.4131760878405855, + "grad_norm": 6.590796197541756, + "learning_rate": 1.3140041596282923e-05, + "loss": 0.1869, + "step": 13311 + }, + { + "epoch": 5.413582757218381, + "grad_norm": 8.423125386196162, + "learning_rate": 1.3139079637397965e-05, + "loss": 0.0288, + "step": 13312 + }, + { + "epoch": 5.413989426596177, + "grad_norm": 7.5070402000169745, + "learning_rate": 1.3138117646288773e-05, + "loss": 0.5725, + "step": 13313 + }, + { + "epoch": 5.414396095973973, + "grad_norm": 9.592979939989588, + "learning_rate": 1.3137155622965228e-05, + "loss": 0.3534, + "step": 13314 + }, + { + "epoch": 5.414802765351769, + "grad_norm": 2.5639861639170176, + "learning_rate": 1.3136193567437201e-05, + "loss": 0.0261, + "step": 13315 + }, + { + "epoch": 5.415209434729565, + "grad_norm": 0.3782108042354423, + "learning_rate": 1.3135231479714568e-05, + "loss": 0.0102, + "step": 13316 + }, + { + "epoch": 5.415616104107361, + "grad_norm": 9.333238552380946, + "learning_rate": 1.3134269359807203e-05, + "loss": 0.1956, + "step": 13317 + }, + { + "epoch": 5.416022773485157, + "grad_norm": 2.620853588419267, + "learning_rate": 1.313330720772499e-05, + "loss": 0.0478, + "step": 13318 + }, + { + "epoch": 5.4164294428629525, + "grad_norm": 10.026659855444993, + "learning_rate": 1.31323450234778e-05, + "loss": 0.5132, + "step": 13319 + }, + { + "epoch": 5.416836112240748, + "grad_norm": 4.185706367593504, + "learning_rate": 1.313138280707551e-05, + "loss": 0.258, + "step": 13320 + }, + { + "epoch": 5.417242781618544, + "grad_norm": 8.046266031601526, + "learning_rate": 1.3130420558528001e-05, + "loss": 0.0969, + "step": 13321 + }, + { + "epoch": 5.41764945099634, + "grad_norm": 0.2865984249198246, + "learning_rate": 1.312945827784515e-05, + "loss": 0.0047, + "step": 13322 + }, + { + "epoch": 5.4180561203741355, + "grad_norm": 4.087272831328965, + "learning_rate": 1.3128495965036833e-05, + "loss": 0.0906, + "step": 13323 + }, + { + "epoch": 5.418462789751931, + "grad_norm": 0.24961503138722232, + "learning_rate": 1.3127533620112934e-05, + "loss": 0.0031, + "step": 13324 + }, + { + "epoch": 5.418869459129727, + "grad_norm": 0.17869072445341444, + "learning_rate": 1.3126571243083326e-05, + "loss": 0.0034, + "step": 13325 + }, + { + "epoch": 5.419276128507524, + "grad_norm": 12.826748185870585, + "learning_rate": 1.312560883395789e-05, + "loss": 0.5295, + "step": 13326 + }, + { + "epoch": 5.4196827978853195, + "grad_norm": 0.6661717823552786, + "learning_rate": 1.3124646392746506e-05, + "loss": 0.0094, + "step": 13327 + }, + { + "epoch": 5.420089467263115, + "grad_norm": 9.241424748747827, + "learning_rate": 1.3123683919459058e-05, + "loss": 0.2473, + "step": 13328 + }, + { + "epoch": 5.420496136640911, + "grad_norm": 4.0677572402021696, + "learning_rate": 1.3122721414105419e-05, + "loss": 0.0575, + "step": 13329 + }, + { + "epoch": 5.420902806018707, + "grad_norm": 9.112178422843924, + "learning_rate": 1.3121758876695472e-05, + "loss": 0.4835, + "step": 13330 + }, + { + "epoch": 5.4213094753965025, + "grad_norm": 0.5836021488345723, + "learning_rate": 1.3120796307239103e-05, + "loss": 0.011, + "step": 13331 + }, + { + "epoch": 5.421716144774298, + "grad_norm": 6.280685302526331, + "learning_rate": 1.3119833705746186e-05, + "loss": 0.1361, + "step": 13332 + }, + { + "epoch": 5.422122814152094, + "grad_norm": 8.661956736656212, + "learning_rate": 1.3118871072226606e-05, + "loss": 0.3282, + 
"step": 13333 + }, + { + "epoch": 5.42252948352989, + "grad_norm": 6.082795948928181, + "learning_rate": 1.3117908406690246e-05, + "loss": 0.1727, + "step": 13334 + }, + { + "epoch": 5.4229361529076865, + "grad_norm": 0.8915687394687682, + "learning_rate": 1.3116945709146986e-05, + "loss": 0.0119, + "step": 13335 + }, + { + "epoch": 5.423342822285482, + "grad_norm": 8.551938357395537, + "learning_rate": 1.3115982979606709e-05, + "loss": 0.2593, + "step": 13336 + }, + { + "epoch": 5.423749491663278, + "grad_norm": 0.421719519331068, + "learning_rate": 1.31150202180793e-05, + "loss": 0.007, + "step": 13337 + }, + { + "epoch": 5.424156161041074, + "grad_norm": 9.523846507213126, + "learning_rate": 1.3114057424574642e-05, + "loss": 0.3364, + "step": 13338 + }, + { + "epoch": 5.4245628304188696, + "grad_norm": 8.719891924149385, + "learning_rate": 1.3113094599102614e-05, + "loss": 0.2041, + "step": 13339 + }, + { + "epoch": 5.424969499796665, + "grad_norm": 4.9383540156214165, + "learning_rate": 1.3112131741673105e-05, + "loss": 0.1532, + "step": 13340 + }, + { + "epoch": 5.425376169174461, + "grad_norm": 4.772182477405232, + "learning_rate": 1.3111168852295997e-05, + "loss": 0.0823, + "step": 13341 + }, + { + "epoch": 5.425782838552257, + "grad_norm": 0.5023347610061308, + "learning_rate": 1.3110205930981175e-05, + "loss": 0.0075, + "step": 13342 + }, + { + "epoch": 5.426189507930053, + "grad_norm": 6.758235730164817, + "learning_rate": 1.3109242977738523e-05, + "loss": 0.2593, + "step": 13343 + }, + { + "epoch": 5.426596177307848, + "grad_norm": 0.6867912250502276, + "learning_rate": 1.3108279992577928e-05, + "loss": 0.0097, + "step": 13344 + }, + { + "epoch": 5.427002846685644, + "grad_norm": 3.015641200365751, + "learning_rate": 1.3107316975509274e-05, + "loss": 0.095, + "step": 13345 + }, + { + "epoch": 5.427409516063441, + "grad_norm": 8.798843875324746, + "learning_rate": 1.3106353926542448e-05, + "loss": 0.2964, + "step": 13346 + }, + { + "epoch": 5.427816185441237, + "grad_norm": 2.236030746328963, + "learning_rate": 1.3105390845687333e-05, + "loss": 0.0985, + "step": 13347 + }, + { + "epoch": 5.428222854819032, + "grad_norm": 9.803722617937234, + "learning_rate": 1.3104427732953821e-05, + "loss": 0.181, + "step": 13348 + }, + { + "epoch": 5.428629524196828, + "grad_norm": 3.1898687273519584, + "learning_rate": 1.3103464588351798e-05, + "loss": 0.0416, + "step": 13349 + }, + { + "epoch": 5.429036193574624, + "grad_norm": 2.5270359186571714, + "learning_rate": 1.3102501411891146e-05, + "loss": 0.0382, + "step": 13350 + }, + { + "epoch": 5.42944286295242, + "grad_norm": 7.472654242479152, + "learning_rate": 1.3101538203581755e-05, + "loss": 0.2972, + "step": 13351 + }, + { + "epoch": 5.429849532330215, + "grad_norm": 7.614257477362424, + "learning_rate": 1.3100574963433514e-05, + "loss": 0.1423, + "step": 13352 + }, + { + "epoch": 5.430256201708011, + "grad_norm": 2.3125327686294983, + "learning_rate": 1.3099611691456311e-05, + "loss": 0.0249, + "step": 13353 + }, + { + "epoch": 5.430662871085807, + "grad_norm": 0.13292456667644117, + "learning_rate": 1.3098648387660034e-05, + "loss": 0.0024, + "step": 13354 + }, + { + "epoch": 5.431069540463603, + "grad_norm": 3.825609124511128, + "learning_rate": 1.309768505205457e-05, + "loss": 0.0769, + "step": 13355 + }, + { + "epoch": 5.431476209841399, + "grad_norm": 10.52984024182033, + "learning_rate": 1.3096721684649811e-05, + "loss": 0.2019, + "step": 13356 + }, + { + "epoch": 5.431882879219195, + "grad_norm": 5.810976817789636, + 
"learning_rate": 1.3095758285455647e-05, + "loss": 0.2537, + "step": 13357 + }, + { + "epoch": 5.432289548596991, + "grad_norm": 3.2042682372402393, + "learning_rate": 1.3094794854481962e-05, + "loss": 0.0565, + "step": 13358 + }, + { + "epoch": 5.432696217974787, + "grad_norm": 11.519888177375964, + "learning_rate": 1.3093831391738653e-05, + "loss": 0.5823, + "step": 13359 + }, + { + "epoch": 5.433102887352582, + "grad_norm": 4.782142071165482, + "learning_rate": 1.309286789723561e-05, + "loss": 0.2302, + "step": 13360 + }, + { + "epoch": 5.433509556730378, + "grad_norm": 0.9939393259409011, + "learning_rate": 1.309190437098272e-05, + "loss": 0.0172, + "step": 13361 + }, + { + "epoch": 5.433916226108174, + "grad_norm": 16.73500459574792, + "learning_rate": 1.3090940812989871e-05, + "loss": 0.3924, + "step": 13362 + }, + { + "epoch": 5.43432289548597, + "grad_norm": 3.4087808440617864, + "learning_rate": 1.3089977223266965e-05, + "loss": 0.0779, + "step": 13363 + }, + { + "epoch": 5.4347295648637655, + "grad_norm": 1.860373853064332, + "learning_rate": 1.3089013601823885e-05, + "loss": 0.0319, + "step": 13364 + }, + { + "epoch": 5.435136234241561, + "grad_norm": 8.696254080092137, + "learning_rate": 1.3088049948670524e-05, + "loss": 0.165, + "step": 13365 + }, + { + "epoch": 5.435542903619357, + "grad_norm": 5.94515637419597, + "learning_rate": 1.3087086263816779e-05, + "loss": 0.0931, + "step": 13366 + }, + { + "epoch": 5.435949572997154, + "grad_norm": 7.613473298942168, + "learning_rate": 1.308612254727254e-05, + "loss": 0.2815, + "step": 13367 + }, + { + "epoch": 5.436356242374949, + "grad_norm": 6.386138964359928, + "learning_rate": 1.3085158799047697e-05, + "loss": 0.0779, + "step": 13368 + }, + { + "epoch": 5.436762911752745, + "grad_norm": 3.8732446825254017, + "learning_rate": 1.3084195019152149e-05, + "loss": 0.0796, + "step": 13369 + }, + { + "epoch": 5.437169581130541, + "grad_norm": 8.301992897911614, + "learning_rate": 1.3083231207595785e-05, + "loss": 0.4696, + "step": 13370 + }, + { + "epoch": 5.437576250508337, + "grad_norm": 5.974411160528429, + "learning_rate": 1.3082267364388498e-05, + "loss": 0.1157, + "step": 13371 + }, + { + "epoch": 5.4379829198861325, + "grad_norm": 1.8809251939639877, + "learning_rate": 1.3081303489540191e-05, + "loss": 0.0281, + "step": 13372 + }, + { + "epoch": 5.438389589263928, + "grad_norm": 1.277334280835187, + "learning_rate": 1.3080339583060751e-05, + "loss": 0.0261, + "step": 13373 + }, + { + "epoch": 5.438796258641724, + "grad_norm": 3.1870580516139797, + "learning_rate": 1.3079375644960077e-05, + "loss": 0.073, + "step": 13374 + }, + { + "epoch": 5.43920292801952, + "grad_norm": 0.13879190448059067, + "learning_rate": 1.3078411675248057e-05, + "loss": 0.0023, + "step": 13375 + }, + { + "epoch": 5.439609597397316, + "grad_norm": 2.495422666574623, + "learning_rate": 1.3077447673934595e-05, + "loss": 0.0378, + "step": 13376 + }, + { + "epoch": 5.440016266775112, + "grad_norm": 0.16352794694660835, + "learning_rate": 1.3076483641029586e-05, + "loss": 0.0024, + "step": 13377 + }, + { + "epoch": 5.440422936152908, + "grad_norm": 14.673697027040703, + "learning_rate": 1.3075519576542919e-05, + "loss": 0.148, + "step": 13378 + }, + { + "epoch": 5.440829605530704, + "grad_norm": 23.697455123487025, + "learning_rate": 1.3074555480484498e-05, + "loss": 0.4507, + "step": 13379 + }, + { + "epoch": 5.4412362749084995, + "grad_norm": 7.515929917917983, + "learning_rate": 1.3073591352864219e-05, + "loss": 0.1829, + "step": 13380 + }, + { + "epoch": 
5.441642944286295, + "grad_norm": 3.598404162781084, + "learning_rate": 1.3072627193691974e-05, + "loss": 0.0935, + "step": 13381 + }, + { + "epoch": 5.442049613664091, + "grad_norm": 5.426351970975582, + "learning_rate": 1.3071663002977667e-05, + "loss": 0.0513, + "step": 13382 + }, + { + "epoch": 5.442456283041887, + "grad_norm": 0.714093808383077, + "learning_rate": 1.3070698780731194e-05, + "loss": 0.0068, + "step": 13383 + }, + { + "epoch": 5.4428629524196825, + "grad_norm": 1.9738151886683066, + "learning_rate": 1.3069734526962452e-05, + "loss": 0.0498, + "step": 13384 + }, + { + "epoch": 5.443269621797478, + "grad_norm": 3.8935165872233384, + "learning_rate": 1.3068770241681338e-05, + "loss": 0.0493, + "step": 13385 + }, + { + "epoch": 5.443676291175274, + "grad_norm": 0.061831821276080334, + "learning_rate": 1.3067805924897758e-05, + "loss": 0.0011, + "step": 13386 + }, + { + "epoch": 5.444082960553071, + "grad_norm": 2.1643770512903604, + "learning_rate": 1.3066841576621603e-05, + "loss": 0.0272, + "step": 13387 + }, + { + "epoch": 5.4444896299308665, + "grad_norm": 0.37141717418876186, + "learning_rate": 1.3065877196862775e-05, + "loss": 0.0049, + "step": 13388 + }, + { + "epoch": 5.444896299308662, + "grad_norm": 8.938434340275055, + "learning_rate": 1.3064912785631177e-05, + "loss": 0.2525, + "step": 13389 + }, + { + "epoch": 5.445302968686458, + "grad_norm": 25.590592579591654, + "learning_rate": 1.3063948342936703e-05, + "loss": 1.0585, + "step": 13390 + }, + { + "epoch": 5.445709638064254, + "grad_norm": 5.4266523315687545, + "learning_rate": 1.3062983868789258e-05, + "loss": 0.093, + "step": 13391 + }, + { + "epoch": 5.4461163074420496, + "grad_norm": 25.524469627094835, + "learning_rate": 1.3062019363198746e-05, + "loss": 0.652, + "step": 13392 + }, + { + "epoch": 5.446522976819845, + "grad_norm": 1.3741253222735714, + "learning_rate": 1.3061054826175061e-05, + "loss": 0.0186, + "step": 13393 + }, + { + "epoch": 5.446929646197641, + "grad_norm": 2.5900554912274574, + "learning_rate": 1.3060090257728106e-05, + "loss": 0.0491, + "step": 13394 + }, + { + "epoch": 5.447336315575437, + "grad_norm": 16.1420899785262, + "learning_rate": 1.3059125657867788e-05, + "loss": 0.8241, + "step": 13395 + }, + { + "epoch": 5.447742984953233, + "grad_norm": 13.688410458701446, + "learning_rate": 1.3058161026604004e-05, + "loss": 0.4814, + "step": 13396 + }, + { + "epoch": 5.448149654331029, + "grad_norm": 4.528570558559385, + "learning_rate": 1.3057196363946657e-05, + "loss": 0.1153, + "step": 13397 + }, + { + "epoch": 5.448556323708825, + "grad_norm": 1.1906947980691975, + "learning_rate": 1.3056231669905654e-05, + "loss": 0.0168, + "step": 13398 + }, + { + "epoch": 5.448962993086621, + "grad_norm": 1.0414114789015902, + "learning_rate": 1.3055266944490893e-05, + "loss": 0.0249, + "step": 13399 + }, + { + "epoch": 5.449369662464417, + "grad_norm": 0.6271432055134172, + "learning_rate": 1.3054302187712278e-05, + "loss": 0.0082, + "step": 13400 + }, + { + "epoch": 5.449776331842212, + "grad_norm": 6.7852894388917395, + "learning_rate": 1.3053337399579716e-05, + "loss": 0.1929, + "step": 13401 + }, + { + "epoch": 5.450183001220008, + "grad_norm": 6.676151476583309, + "learning_rate": 1.3052372580103105e-05, + "loss": 0.1073, + "step": 13402 + }, + { + "epoch": 5.450589670597804, + "grad_norm": 7.314109295427627, + "learning_rate": 1.3051407729292356e-05, + "loss": 0.3131, + "step": 13403 + }, + { + "epoch": 5.4509963399756, + "grad_norm": 1.0755331115825435, + "learning_rate": 
1.3050442847157369e-05, + "loss": 0.0129, + "step": 13404 + }, + { + "epoch": 5.451403009353395, + "grad_norm": 7.328488316634265, + "learning_rate": 1.3049477933708055e-05, + "loss": 0.318, + "step": 13405 + }, + { + "epoch": 5.451809678731191, + "grad_norm": 5.251818221126663, + "learning_rate": 1.3048512988954309e-05, + "loss": 0.2657, + "step": 13406 + }, + { + "epoch": 5.452216348108988, + "grad_norm": 35.49318044575245, + "learning_rate": 1.3047548012906048e-05, + "loss": 1.0789, + "step": 13407 + }, + { + "epoch": 5.452623017486784, + "grad_norm": 0.7076275412402839, + "learning_rate": 1.3046583005573172e-05, + "loss": 0.0118, + "step": 13408 + }, + { + "epoch": 5.453029686864579, + "grad_norm": 1.0137177005850924, + "learning_rate": 1.3045617966965586e-05, + "loss": 0.0162, + "step": 13409 + }, + { + "epoch": 5.453436356242375, + "grad_norm": 12.916661401973185, + "learning_rate": 1.3044652897093202e-05, + "loss": 0.5962, + "step": 13410 + }, + { + "epoch": 5.453843025620171, + "grad_norm": 1.5664550651136446, + "learning_rate": 1.304368779596592e-05, + "loss": 0.0323, + "step": 13411 + }, + { + "epoch": 5.454249694997967, + "grad_norm": 1.8300211193701434, + "learning_rate": 1.3042722663593654e-05, + "loss": 0.0378, + "step": 13412 + }, + { + "epoch": 5.454656364375762, + "grad_norm": 4.897250171570759, + "learning_rate": 1.3041757499986307e-05, + "loss": 0.2175, + "step": 13413 + }, + { + "epoch": 5.455063033753558, + "grad_norm": 1.6892283801698393, + "learning_rate": 1.3040792305153789e-05, + "loss": 0.0333, + "step": 13414 + }, + { + "epoch": 5.455469703131354, + "grad_norm": 7.9411027418933795, + "learning_rate": 1.3039827079106007e-05, + "loss": 0.508, + "step": 13415 + }, + { + "epoch": 5.45587637250915, + "grad_norm": 5.7612877728423255, + "learning_rate": 1.3038861821852868e-05, + "loss": 0.1362, + "step": 13416 + }, + { + "epoch": 5.456283041886946, + "grad_norm": 5.14002631538055, + "learning_rate": 1.3037896533404286e-05, + "loss": 0.0688, + "step": 13417 + }, + { + "epoch": 5.456689711264742, + "grad_norm": 5.65975020409113, + "learning_rate": 1.3036931213770167e-05, + "loss": 0.1055, + "step": 13418 + }, + { + "epoch": 5.457096380642538, + "grad_norm": 2.2646383679938404, + "learning_rate": 1.3035965862960422e-05, + "loss": 0.0407, + "step": 13419 + }, + { + "epoch": 5.457503050020334, + "grad_norm": 6.601370397853813, + "learning_rate": 1.3035000480984957e-05, + "loss": 0.3437, + "step": 13420 + }, + { + "epoch": 5.457909719398129, + "grad_norm": 6.947810977146708, + "learning_rate": 1.3034035067853683e-05, + "loss": 0.1465, + "step": 13421 + }, + { + "epoch": 5.458316388775925, + "grad_norm": 2.1892847803360507, + "learning_rate": 1.3033069623576515e-05, + "loss": 0.0431, + "step": 13422 + }, + { + "epoch": 5.458723058153721, + "grad_norm": 10.123744234569934, + "learning_rate": 1.303210414816336e-05, + "loss": 0.9715, + "step": 13423 + }, + { + "epoch": 5.459129727531517, + "grad_norm": 4.924917586362144, + "learning_rate": 1.303113864162413e-05, + "loss": 0.1297, + "step": 13424 + }, + { + "epoch": 5.4595363969093125, + "grad_norm": 6.132474248094052, + "learning_rate": 1.3030173103968738e-05, + "loss": 0.1914, + "step": 13425 + }, + { + "epoch": 5.459943066287108, + "grad_norm": 15.42500400009907, + "learning_rate": 1.302920753520709e-05, + "loss": 0.9454, + "step": 13426 + }, + { + "epoch": 5.460349735664904, + "grad_norm": 1.8552774061498294, + "learning_rate": 1.3028241935349104e-05, + "loss": 0.0432, + "step": 13427 + }, + { + "epoch": 5.460756405042701, + 
"grad_norm": 6.868785952513772, + "learning_rate": 1.3027276304404692e-05, + "loss": 0.1472, + "step": 13428 + }, + { + "epoch": 5.461163074420496, + "grad_norm": 4.677733373243667, + "learning_rate": 1.3026310642383761e-05, + "loss": 0.0705, + "step": 13429 + }, + { + "epoch": 5.461569743798292, + "grad_norm": 24.567991882545872, + "learning_rate": 1.3025344949296232e-05, + "loss": 1.2514, + "step": 13430 + }, + { + "epoch": 5.461976413176088, + "grad_norm": 0.7320314576365992, + "learning_rate": 1.3024379225152016e-05, + "loss": 0.0132, + "step": 13431 + }, + { + "epoch": 5.462383082553884, + "grad_norm": 11.189008544193124, + "learning_rate": 1.3023413469961022e-05, + "loss": 0.5889, + "step": 13432 + }, + { + "epoch": 5.4627897519316795, + "grad_norm": 5.031000945013458, + "learning_rate": 1.3022447683733167e-05, + "loss": 0.2495, + "step": 13433 + }, + { + "epoch": 5.463196421309475, + "grad_norm": 2.774510507516891, + "learning_rate": 1.3021481866478364e-05, + "loss": 0.0682, + "step": 13434 + }, + { + "epoch": 5.463603090687271, + "grad_norm": 0.220087843969907, + "learning_rate": 1.3020516018206533e-05, + "loss": 0.0037, + "step": 13435 + }, + { + "epoch": 5.464009760065067, + "grad_norm": 6.411000351652352, + "learning_rate": 1.301955013892758e-05, + "loss": 0.3792, + "step": 13436 + }, + { + "epoch": 5.4644164294428625, + "grad_norm": 9.995889980954939, + "learning_rate": 1.301858422865143e-05, + "loss": 0.5007, + "step": 13437 + }, + { + "epoch": 5.464823098820659, + "grad_norm": 0.8727256122608192, + "learning_rate": 1.3017618287387992e-05, + "loss": 0.0208, + "step": 13438 + }, + { + "epoch": 5.465229768198455, + "grad_norm": 4.086223808857772, + "learning_rate": 1.301665231514718e-05, + "loss": 0.0985, + "step": 13439 + }, + { + "epoch": 5.465636437576251, + "grad_norm": 3.449173680865074, + "learning_rate": 1.301568631193892e-05, + "loss": 0.0543, + "step": 13440 + }, + { + "epoch": 5.4660431069540465, + "grad_norm": 7.24058827599575, + "learning_rate": 1.3014720277773119e-05, + "loss": 0.1623, + "step": 13441 + }, + { + "epoch": 5.466449776331842, + "grad_norm": 12.431415238176466, + "learning_rate": 1.3013754212659695e-05, + "loss": 0.3124, + "step": 13442 + }, + { + "epoch": 5.466856445709638, + "grad_norm": 1.0425682151617643, + "learning_rate": 1.301278811660857e-05, + "loss": 0.0175, + "step": 13443 + }, + { + "epoch": 5.467263115087434, + "grad_norm": 7.309365770769184, + "learning_rate": 1.3011821989629659e-05, + "loss": 0.1774, + "step": 13444 + }, + { + "epoch": 5.4676697844652296, + "grad_norm": 6.948056046277232, + "learning_rate": 1.301085583173288e-05, + "loss": 0.1905, + "step": 13445 + }, + { + "epoch": 5.468076453843025, + "grad_norm": 14.393292044271089, + "learning_rate": 1.3009889642928147e-05, + "loss": 0.3899, + "step": 13446 + }, + { + "epoch": 5.468483123220821, + "grad_norm": 5.558754644566827, + "learning_rate": 1.3008923423225385e-05, + "loss": 0.0667, + "step": 13447 + }, + { + "epoch": 5.468889792598618, + "grad_norm": 4.347222376905742, + "learning_rate": 1.3007957172634509e-05, + "loss": 0.1528, + "step": 13448 + }, + { + "epoch": 5.4692964619764135, + "grad_norm": 0.762603077149142, + "learning_rate": 1.300699089116544e-05, + "loss": 0.0179, + "step": 13449 + }, + { + "epoch": 5.469703131354209, + "grad_norm": 6.394841952981122, + "learning_rate": 1.3006024578828094e-05, + "loss": 0.2313, + "step": 13450 + }, + { + "epoch": 5.470109800732005, + "grad_norm": 10.190120682923144, + "learning_rate": 1.3005058235632393e-05, + "loss": 0.6383, + 
"step": 13451 + }, + { + "epoch": 5.470516470109801, + "grad_norm": 6.635729632355221, + "learning_rate": 1.3004091861588256e-05, + "loss": 0.1904, + "step": 13452 + }, + { + "epoch": 5.470923139487597, + "grad_norm": 8.38587912985447, + "learning_rate": 1.3003125456705607e-05, + "loss": 0.1404, + "step": 13453 + }, + { + "epoch": 5.471329808865392, + "grad_norm": 10.515607235675278, + "learning_rate": 1.3002159020994362e-05, + "loss": 0.3887, + "step": 13454 + }, + { + "epoch": 5.471736478243188, + "grad_norm": 4.720075946816048, + "learning_rate": 1.3001192554464443e-05, + "loss": 0.0915, + "step": 13455 + }, + { + "epoch": 5.472143147620984, + "grad_norm": 5.324814847390708, + "learning_rate": 1.3000226057125773e-05, + "loss": 0.0957, + "step": 13456 + }, + { + "epoch": 5.47254981699878, + "grad_norm": 13.862905250538807, + "learning_rate": 1.2999259528988273e-05, + "loss": 0.13, + "step": 13457 + }, + { + "epoch": 5.472956486376576, + "grad_norm": 3.1060933558406263, + "learning_rate": 1.2998292970061862e-05, + "loss": 0.0468, + "step": 13458 + }, + { + "epoch": 5.473363155754372, + "grad_norm": 4.2415491467358315, + "learning_rate": 1.2997326380356468e-05, + "loss": 0.0798, + "step": 13459 + }, + { + "epoch": 5.473769825132168, + "grad_norm": 10.070592244526514, + "learning_rate": 1.299635975988201e-05, + "loss": 0.5426, + "step": 13460 + }, + { + "epoch": 5.474176494509964, + "grad_norm": 12.788916960431669, + "learning_rate": 1.2995393108648406e-05, + "loss": 0.4007, + "step": 13461 + }, + { + "epoch": 5.474583163887759, + "grad_norm": 1.7656308319158471, + "learning_rate": 1.299442642666559e-05, + "loss": 0.0231, + "step": 13462 + }, + { + "epoch": 5.474989833265555, + "grad_norm": 1.0230984020780998, + "learning_rate": 1.2993459713943478e-05, + "loss": 0.0164, + "step": 13463 + }, + { + "epoch": 5.475396502643351, + "grad_norm": 3.5784835275316715, + "learning_rate": 1.2992492970491995e-05, + "loss": 0.045, + "step": 13464 + }, + { + "epoch": 5.475803172021147, + "grad_norm": 5.2388461308084056, + "learning_rate": 1.2991526196321065e-05, + "loss": 0.1863, + "step": 13465 + }, + { + "epoch": 5.476209841398942, + "grad_norm": 6.3694112191481995, + "learning_rate": 1.2990559391440613e-05, + "loss": 0.1361, + "step": 13466 + }, + { + "epoch": 5.476616510776738, + "grad_norm": 6.391899864573731, + "learning_rate": 1.2989592555860564e-05, + "loss": 0.2508, + "step": 13467 + }, + { + "epoch": 5.477023180154534, + "grad_norm": 1.5267306739780988, + "learning_rate": 1.2988625689590842e-05, + "loss": 0.0133, + "step": 13468 + }, + { + "epoch": 5.477429849532331, + "grad_norm": 8.131200298402812, + "learning_rate": 1.2987658792641375e-05, + "loss": 0.1898, + "step": 13469 + }, + { + "epoch": 5.477836518910126, + "grad_norm": 0.14514226663808713, + "learning_rate": 1.2986691865022088e-05, + "loss": 0.0022, + "step": 13470 + }, + { + "epoch": 5.478243188287922, + "grad_norm": 14.337427358102401, + "learning_rate": 1.29857249067429e-05, + "loss": 0.5647, + "step": 13471 + }, + { + "epoch": 5.478649857665718, + "grad_norm": 7.985929705255045, + "learning_rate": 1.2984757917813748e-05, + "loss": 0.5598, + "step": 13472 + }, + { + "epoch": 5.479056527043514, + "grad_norm": 0.07684755061099867, + "learning_rate": 1.2983790898244553e-05, + "loss": 0.0013, + "step": 13473 + }, + { + "epoch": 5.479463196421309, + "grad_norm": 6.257286150173162, + "learning_rate": 1.2982823848045243e-05, + "loss": 0.1269, + "step": 13474 + }, + { + "epoch": 5.479869865799105, + "grad_norm": 4.51469930170028, + 
"learning_rate": 1.2981856767225745e-05, + "loss": 0.0984, + "step": 13475 + }, + { + "epoch": 5.480276535176901, + "grad_norm": 10.663141373791474, + "learning_rate": 1.2980889655795986e-05, + "loss": 0.6574, + "step": 13476 + }, + { + "epoch": 5.480683204554697, + "grad_norm": 8.972287658515876, + "learning_rate": 1.2979922513765894e-05, + "loss": 0.8618, + "step": 13477 + }, + { + "epoch": 5.4810898739324925, + "grad_norm": 1.679056171284813, + "learning_rate": 1.2978955341145397e-05, + "loss": 0.0235, + "step": 13478 + }, + { + "epoch": 5.481496543310289, + "grad_norm": 0.1437265756993651, + "learning_rate": 1.2977988137944425e-05, + "loss": 0.0029, + "step": 13479 + }, + { + "epoch": 5.481903212688085, + "grad_norm": 4.050279917589606, + "learning_rate": 1.2977020904172908e-05, + "loss": 0.0643, + "step": 13480 + }, + { + "epoch": 5.482309882065881, + "grad_norm": 5.015319593956263, + "learning_rate": 1.297605363984077e-05, + "loss": 0.12, + "step": 13481 + }, + { + "epoch": 5.482716551443676, + "grad_norm": 3.958415478687418, + "learning_rate": 1.2975086344957943e-05, + "loss": 0.0683, + "step": 13482 + }, + { + "epoch": 5.483123220821472, + "grad_norm": 4.814394260586103, + "learning_rate": 1.297411901953436e-05, + "loss": 0.0915, + "step": 13483 + }, + { + "epoch": 5.483529890199268, + "grad_norm": 1.56491885005329, + "learning_rate": 1.2973151663579948e-05, + "loss": 0.0131, + "step": 13484 + }, + { + "epoch": 5.483936559577064, + "grad_norm": 5.627357162705599, + "learning_rate": 1.2972184277104639e-05, + "loss": 0.1046, + "step": 13485 + }, + { + "epoch": 5.4843432289548595, + "grad_norm": 8.63345619494572, + "learning_rate": 1.297121686011836e-05, + "loss": 0.2723, + "step": 13486 + }, + { + "epoch": 5.484749898332655, + "grad_norm": 11.605433633963692, + "learning_rate": 1.2970249412631044e-05, + "loss": 0.5677, + "step": 13487 + }, + { + "epoch": 5.485156567710451, + "grad_norm": 5.892831833297422, + "learning_rate": 1.2969281934652625e-05, + "loss": 0.1232, + "step": 13488 + }, + { + "epoch": 5.485563237088248, + "grad_norm": 2.356162729029738, + "learning_rate": 1.296831442619303e-05, + "loss": 0.0456, + "step": 13489 + }, + { + "epoch": 5.485969906466043, + "grad_norm": 0.9004685097262551, + "learning_rate": 1.29673468872622e-05, + "loss": 0.021, + "step": 13490 + }, + { + "epoch": 5.486376575843839, + "grad_norm": 3.134171738028808, + "learning_rate": 1.2966379317870055e-05, + "loss": 0.0882, + "step": 13491 + }, + { + "epoch": 5.486783245221635, + "grad_norm": 0.02797080389123014, + "learning_rate": 1.2965411718026534e-05, + "loss": 0.0006, + "step": 13492 + }, + { + "epoch": 5.487189914599431, + "grad_norm": 3.708396976846564, + "learning_rate": 1.2964444087741571e-05, + "loss": 0.0758, + "step": 13493 + }, + { + "epoch": 5.4875965839772265, + "grad_norm": 0.06881951809486962, + "learning_rate": 1.2963476427025094e-05, + "loss": 0.0012, + "step": 13494 + }, + { + "epoch": 5.488003253355022, + "grad_norm": 2.538740528283616, + "learning_rate": 1.2962508735887043e-05, + "loss": 0.0414, + "step": 13495 + }, + { + "epoch": 5.488409922732818, + "grad_norm": 2.8437067419173045, + "learning_rate": 1.2961541014337349e-05, + "loss": 0.1309, + "step": 13496 + }, + { + "epoch": 5.488816592110614, + "grad_norm": 15.90645570565783, + "learning_rate": 1.2960573262385942e-05, + "loss": 0.1544, + "step": 13497 + }, + { + "epoch": 5.4892232614884096, + "grad_norm": 1.824608325367016, + "learning_rate": 1.2959605480042767e-05, + "loss": 0.0465, + "step": 13498 + }, + { + "epoch": 
5.489629930866206, + "grad_norm": 2.9579864039650725, + "learning_rate": 1.2958637667317748e-05, + "loss": 0.064, + "step": 13499 + }, + { + "epoch": 5.490036600244002, + "grad_norm": 6.091467985617289, + "learning_rate": 1.2957669824220825e-05, + "loss": 0.1258, + "step": 13500 + }, + { + "epoch": 5.490443269621798, + "grad_norm": 0.08088941027125364, + "learning_rate": 1.2956701950761934e-05, + "loss": 0.0018, + "step": 13501 + }, + { + "epoch": 5.4908499389995935, + "grad_norm": 3.079871588037802, + "learning_rate": 1.2955734046951007e-05, + "loss": 0.0562, + "step": 13502 + }, + { + "epoch": 5.491256608377389, + "grad_norm": 1.5746595047092178, + "learning_rate": 1.2954766112797984e-05, + "loss": 0.0216, + "step": 13503 + }, + { + "epoch": 5.491663277755185, + "grad_norm": 7.204390013763167, + "learning_rate": 1.29537981483128e-05, + "loss": 0.2318, + "step": 13504 + }, + { + "epoch": 5.492069947132981, + "grad_norm": 14.632465078508591, + "learning_rate": 1.2952830153505388e-05, + "loss": 0.4305, + "step": 13505 + }, + { + "epoch": 5.492476616510777, + "grad_norm": 3.003654405206801, + "learning_rate": 1.2951862128385693e-05, + "loss": 0.0977, + "step": 13506 + }, + { + "epoch": 5.492883285888572, + "grad_norm": 0.15562939180487223, + "learning_rate": 1.2950894072963645e-05, + "loss": 0.0022, + "step": 13507 + }, + { + "epoch": 5.493289955266368, + "grad_norm": 5.400533180010857, + "learning_rate": 1.2949925987249185e-05, + "loss": 0.099, + "step": 13508 + }, + { + "epoch": 5.493696624644164, + "grad_norm": 3.8201844274083965, + "learning_rate": 1.2948957871252248e-05, + "loss": 0.0607, + "step": 13509 + }, + { + "epoch": 5.4941032940219605, + "grad_norm": 0.20864076465626344, + "learning_rate": 1.2947989724982778e-05, + "loss": 0.0033, + "step": 13510 + }, + { + "epoch": 5.494509963399756, + "grad_norm": 7.393593156210894, + "learning_rate": 1.2947021548450707e-05, + "loss": 0.1437, + "step": 13511 + }, + { + "epoch": 5.494916632777552, + "grad_norm": 0.48513179456789735, + "learning_rate": 1.2946053341665978e-05, + "loss": 0.0062, + "step": 13512 + }, + { + "epoch": 5.495323302155348, + "grad_norm": 1.8997967858592064, + "learning_rate": 1.2945085104638527e-05, + "loss": 0.0259, + "step": 13513 + }, + { + "epoch": 5.495729971533144, + "grad_norm": 4.193883019219511, + "learning_rate": 1.2944116837378297e-05, + "loss": 0.0509, + "step": 13514 + }, + { + "epoch": 5.496136640910939, + "grad_norm": 0.4097970032887189, + "learning_rate": 1.2943148539895227e-05, + "loss": 0.0068, + "step": 13515 + }, + { + "epoch": 5.496543310288735, + "grad_norm": 7.415824612525296, + "learning_rate": 1.2942180212199253e-05, + "loss": 0.2694, + "step": 13516 + }, + { + "epoch": 5.496949979666531, + "grad_norm": 12.580941263185167, + "learning_rate": 1.294121185430032e-05, + "loss": 0.5165, + "step": 13517 + }, + { + "epoch": 5.497356649044327, + "grad_norm": 10.01187708398683, + "learning_rate": 1.2940243466208367e-05, + "loss": 0.4393, + "step": 13518 + }, + { + "epoch": 5.497763318422122, + "grad_norm": 3.049879616431635, + "learning_rate": 1.2939275047933333e-05, + "loss": 0.0528, + "step": 13519 + }, + { + "epoch": 5.498169987799919, + "grad_norm": 12.222066676695565, + "learning_rate": 1.2938306599485164e-05, + "loss": 0.178, + "step": 13520 + }, + { + "epoch": 5.498576657177715, + "grad_norm": 7.39918713645205, + "learning_rate": 1.2937338120873797e-05, + "loss": 0.3308, + "step": 13521 + }, + { + "epoch": 5.498983326555511, + "grad_norm": 2.402304827124761, + "learning_rate": 
1.2936369612109178e-05, + "loss": 0.043, + "step": 13522 + }, + { + "epoch": 5.499389995933306, + "grad_norm": 13.514656354117774, + "learning_rate": 1.2935401073201246e-05, + "loss": 0.4565, + "step": 13523 + }, + { + "epoch": 5.499796665311102, + "grad_norm": 3.305424686509878, + "learning_rate": 1.2934432504159943e-05, + "loss": 0.0481, + "step": 13524 + }, + { + "epoch": 5.500203334688898, + "grad_norm": 0.2099902048754249, + "learning_rate": 1.2933463904995218e-05, + "loss": 0.0019, + "step": 13525 + }, + { + "epoch": 5.500610004066694, + "grad_norm": 0.9562422956863583, + "learning_rate": 1.2932495275717005e-05, + "loss": 0.0158, + "step": 13526 + }, + { + "epoch": 5.501016673444489, + "grad_norm": 4.5146281259757695, + "learning_rate": 1.2931526616335253e-05, + "loss": 0.1666, + "step": 13527 + }, + { + "epoch": 5.501423342822285, + "grad_norm": 6.4352745853817, + "learning_rate": 1.2930557926859906e-05, + "loss": 0.228, + "step": 13528 + }, + { + "epoch": 5.501830012200081, + "grad_norm": 8.365139482531314, + "learning_rate": 1.2929589207300906e-05, + "loss": 0.2977, + "step": 13529 + }, + { + "epoch": 5.502236681577877, + "grad_norm": 10.027269329505213, + "learning_rate": 1.29286204576682e-05, + "loss": 0.6759, + "step": 13530 + }, + { + "epoch": 5.502643350955673, + "grad_norm": 8.657870587245469, + "learning_rate": 1.2927651677971728e-05, + "loss": 0.41, + "step": 13531 + }, + { + "epoch": 5.503050020333469, + "grad_norm": 0.14280176202684775, + "learning_rate": 1.2926682868221441e-05, + "loss": 0.0024, + "step": 13532 + }, + { + "epoch": 5.503456689711265, + "grad_norm": 36.21345994181989, + "learning_rate": 1.292571402842728e-05, + "loss": 0.2047, + "step": 13533 + }, + { + "epoch": 5.503863359089061, + "grad_norm": 0.10758437331268067, + "learning_rate": 1.2924745158599194e-05, + "loss": 0.0016, + "step": 13534 + }, + { + "epoch": 5.504270028466856, + "grad_norm": 1.8693062444194692, + "learning_rate": 1.2923776258747128e-05, + "loss": 0.0375, + "step": 13535 + }, + { + "epoch": 5.504676697844652, + "grad_norm": 1.4979350898171344, + "learning_rate": 1.2922807328881021e-05, + "loss": 0.0216, + "step": 13536 + }, + { + "epoch": 5.505083367222448, + "grad_norm": 1.2689876745951643, + "learning_rate": 1.2921838369010831e-05, + "loss": 0.0227, + "step": 13537 + }, + { + "epoch": 5.505490036600244, + "grad_norm": 6.096765121466743, + "learning_rate": 1.29208693791465e-05, + "loss": 0.1765, + "step": 13538 + }, + { + "epoch": 5.5058967059780395, + "grad_norm": 3.2228077464907607, + "learning_rate": 1.2919900359297967e-05, + "loss": 0.1244, + "step": 13539 + }, + { + "epoch": 5.506303375355836, + "grad_norm": 4.2893936445974665, + "learning_rate": 1.2918931309475196e-05, + "loss": 0.1583, + "step": 13540 + }, + { + "epoch": 5.506710044733632, + "grad_norm": 1.5953806066606628, + "learning_rate": 1.2917962229688124e-05, + "loss": 0.0198, + "step": 13541 + }, + { + "epoch": 5.507116714111428, + "grad_norm": 0.03809156889650037, + "learning_rate": 1.2916993119946697e-05, + "loss": 0.0006, + "step": 13542 + }, + { + "epoch": 5.507523383489223, + "grad_norm": 7.354214095161491, + "learning_rate": 1.2916023980260872e-05, + "loss": 0.4158, + "step": 13543 + }, + { + "epoch": 5.507930052867019, + "grad_norm": 12.90976760383149, + "learning_rate": 1.2915054810640592e-05, + "loss": 0.2329, + "step": 13544 + }, + { + "epoch": 5.508336722244815, + "grad_norm": 0.2047880247881776, + "learning_rate": 1.2914085611095807e-05, + "loss": 0.0024, + "step": 13545 + }, + { + "epoch": 5.508743391622611, 
+ "grad_norm": 1.9000289561569377, + "learning_rate": 1.2913116381636468e-05, + "loss": 0.0321, + "step": 13546 + }, + { + "epoch": 5.5091500610004065, + "grad_norm": 0.10037137603888094, + "learning_rate": 1.2912147122272523e-05, + "loss": 0.0018, + "step": 13547 + }, + { + "epoch": 5.509556730378202, + "grad_norm": 6.710561207201357, + "learning_rate": 1.2911177833013922e-05, + "loss": 0.2296, + "step": 13548 + }, + { + "epoch": 5.509963399755998, + "grad_norm": 4.630118692633585, + "learning_rate": 1.2910208513870614e-05, + "loss": 0.1067, + "step": 13549 + }, + { + "epoch": 5.510370069133794, + "grad_norm": 6.833983116880238, + "learning_rate": 1.2909239164852552e-05, + "loss": 0.0412, + "step": 13550 + }, + { + "epoch": 5.5107767385115904, + "grad_norm": 15.277896478271908, + "learning_rate": 1.2908269785969685e-05, + "loss": 0.4524, + "step": 13551 + }, + { + "epoch": 5.511183407889386, + "grad_norm": 5.127769246857872, + "learning_rate": 1.2907300377231963e-05, + "loss": 0.0787, + "step": 13552 + }, + { + "epoch": 5.511590077267182, + "grad_norm": 0.02223858611169927, + "learning_rate": 1.2906330938649343e-05, + "loss": 0.0004, + "step": 13553 + }, + { + "epoch": 5.511996746644978, + "grad_norm": 2.5509317205226085, + "learning_rate": 1.2905361470231772e-05, + "loss": 0.0304, + "step": 13554 + }, + { + "epoch": 5.5124034160227735, + "grad_norm": 4.610653906937848, + "learning_rate": 1.2904391971989203e-05, + "loss": 0.1139, + "step": 13555 + }, + { + "epoch": 5.512810085400569, + "grad_norm": 10.802445950620921, + "learning_rate": 1.2903422443931588e-05, + "loss": 0.3539, + "step": 13556 + }, + { + "epoch": 5.513216754778365, + "grad_norm": 7.460723338623493, + "learning_rate": 1.2902452886068883e-05, + "loss": 0.3584, + "step": 13557 + }, + { + "epoch": 5.513623424156161, + "grad_norm": 13.145020589604917, + "learning_rate": 1.2901483298411036e-05, + "loss": 0.7192, + "step": 13558 + }, + { + "epoch": 5.514030093533957, + "grad_norm": 0.4898564215988015, + "learning_rate": 1.2900513680968003e-05, + "loss": 0.0071, + "step": 13559 + }, + { + "epoch": 5.514436762911753, + "grad_norm": 14.100608502820462, + "learning_rate": 1.2899544033749738e-05, + "loss": 0.4864, + "step": 13560 + }, + { + "epoch": 5.514843432289549, + "grad_norm": 3.230210812314097, + "learning_rate": 1.289857435676619e-05, + "loss": 0.0969, + "step": 13561 + }, + { + "epoch": 5.515250101667345, + "grad_norm": 6.34611036088838, + "learning_rate": 1.2897604650027322e-05, + "loss": 0.3634, + "step": 13562 + }, + { + "epoch": 5.5156567710451405, + "grad_norm": 1.4420049340915129, + "learning_rate": 1.2896634913543085e-05, + "loss": 0.0203, + "step": 13563 + }, + { + "epoch": 5.516063440422936, + "grad_norm": 8.544338951609086, + "learning_rate": 1.2895665147323428e-05, + "loss": 0.2571, + "step": 13564 + }, + { + "epoch": 5.516470109800732, + "grad_norm": 1.8013088615004789, + "learning_rate": 1.2894695351378315e-05, + "loss": 0.0655, + "step": 13565 + }, + { + "epoch": 5.516876779178528, + "grad_norm": 0.8648131989485223, + "learning_rate": 1.2893725525717695e-05, + "loss": 0.0129, + "step": 13566 + }, + { + "epoch": 5.517283448556324, + "grad_norm": 7.30236021377641, + "learning_rate": 1.2892755670351527e-05, + "loss": 0.1615, + "step": 13567 + }, + { + "epoch": 5.517690117934119, + "grad_norm": 2.7875220932828024, + "learning_rate": 1.2891785785289766e-05, + "loss": 0.0266, + "step": 13568 + }, + { + "epoch": 5.518096787311915, + "grad_norm": 8.259174631837755, + "learning_rate": 1.2890815870542368e-05, + "loss": 
0.3225, + "step": 13569 + }, + { + "epoch": 5.518503456689711, + "grad_norm": 0.806134287293601, + "learning_rate": 1.2889845926119293e-05, + "loss": 0.0143, + "step": 13570 + }, + { + "epoch": 5.518910126067507, + "grad_norm": 0.38135828991547316, + "learning_rate": 1.288887595203049e-05, + "loss": 0.0076, + "step": 13571 + }, + { + "epoch": 5.519316795445303, + "grad_norm": 28.883267400227435, + "learning_rate": 1.2887905948285925e-05, + "loss": 1.1095, + "step": 13572 + }, + { + "epoch": 5.519723464823099, + "grad_norm": 1.0745006750561745, + "learning_rate": 1.2886935914895553e-05, + "loss": 0.0154, + "step": 13573 + }, + { + "epoch": 5.520130134200895, + "grad_norm": 5.157854940152574, + "learning_rate": 1.2885965851869328e-05, + "loss": 0.2585, + "step": 13574 + }, + { + "epoch": 5.520536803578691, + "grad_norm": 0.7597644368900335, + "learning_rate": 1.2884995759217214e-05, + "loss": 0.0175, + "step": 13575 + }, + { + "epoch": 5.520943472956486, + "grad_norm": 7.124798612208495, + "learning_rate": 1.2884025636949165e-05, + "loss": 0.4363, + "step": 13576 + }, + { + "epoch": 5.521350142334282, + "grad_norm": 12.961815060625666, + "learning_rate": 1.288305548507514e-05, + "loss": 0.6262, + "step": 13577 + }, + { + "epoch": 5.521756811712078, + "grad_norm": 9.03790445676234, + "learning_rate": 1.2882085303605101e-05, + "loss": 0.0601, + "step": 13578 + }, + { + "epoch": 5.522163481089874, + "grad_norm": 0.2942114406778332, + "learning_rate": 1.2881115092549006e-05, + "loss": 0.0035, + "step": 13579 + }, + { + "epoch": 5.522570150467669, + "grad_norm": 2.443308204424559, + "learning_rate": 1.2880144851916813e-05, + "loss": 0.0276, + "step": 13580 + }, + { + "epoch": 5.522976819845466, + "grad_norm": 3.4449415366717635, + "learning_rate": 1.2879174581718483e-05, + "loss": 0.0599, + "step": 13581 + }, + { + "epoch": 5.523383489223262, + "grad_norm": 0.3129832076569817, + "learning_rate": 1.2878204281963979e-05, + "loss": 0.0075, + "step": 13582 + }, + { + "epoch": 5.523790158601058, + "grad_norm": 1.3387876445646623, + "learning_rate": 1.287723395266326e-05, + "loss": 0.0172, + "step": 13583 + }, + { + "epoch": 5.524196827978853, + "grad_norm": 9.076799562058724, + "learning_rate": 1.2876263593826284e-05, + "loss": 0.1085, + "step": 13584 + }, + { + "epoch": 5.524603497356649, + "grad_norm": 7.876594810674021, + "learning_rate": 1.2875293205463018e-05, + "loss": 0.2686, + "step": 13585 + }, + { + "epoch": 5.525010166734445, + "grad_norm": 0.07305699931219947, + "learning_rate": 1.2874322787583418e-05, + "loss": 0.0016, + "step": 13586 + }, + { + "epoch": 5.525416836112241, + "grad_norm": 4.804248413393725, + "learning_rate": 1.2873352340197444e-05, + "loss": 0.0836, + "step": 13587 + }, + { + "epoch": 5.525823505490036, + "grad_norm": 9.60508994679602, + "learning_rate": 1.2872381863315069e-05, + "loss": 0.5449, + "step": 13588 + }, + { + "epoch": 5.526230174867832, + "grad_norm": 4.63052253315597, + "learning_rate": 1.2871411356946247e-05, + "loss": 0.1509, + "step": 13589 + }, + { + "epoch": 5.526636844245628, + "grad_norm": 1.9993212691883662, + "learning_rate": 1.2870440821100938e-05, + "loss": 0.0256, + "step": 13590 + }, + { + "epoch": 5.527043513623424, + "grad_norm": 5.791755654556313, + "learning_rate": 1.2869470255789112e-05, + "loss": 0.2226, + "step": 13591 + }, + { + "epoch": 5.52745018300122, + "grad_norm": 8.244481182271695, + "learning_rate": 1.286849966102073e-05, + "loss": 0.2157, + "step": 13592 + }, + { + "epoch": 5.527856852379016, + "grad_norm": 14.868653623792527, + 
"learning_rate": 1.2867529036805758e-05, + "loss": 0.3422, + "step": 13593 + }, + { + "epoch": 5.528263521756812, + "grad_norm": 2.26988015461243, + "learning_rate": 1.286655838315415e-05, + "loss": 0.0314, + "step": 13594 + }, + { + "epoch": 5.528670191134608, + "grad_norm": 13.53006267555536, + "learning_rate": 1.2865587700075884e-05, + "loss": 0.8979, + "step": 13595 + }, + { + "epoch": 5.529076860512403, + "grad_norm": 5.310207924842247, + "learning_rate": 1.2864616987580915e-05, + "loss": 0.274, + "step": 13596 + }, + { + "epoch": 5.529483529890199, + "grad_norm": 1.7101206436150675, + "learning_rate": 1.286364624567921e-05, + "loss": 0.0398, + "step": 13597 + }, + { + "epoch": 5.529890199267995, + "grad_norm": 8.447646723347818, + "learning_rate": 1.286267547438074e-05, + "loss": 0.155, + "step": 13598 + }, + { + "epoch": 5.530296868645791, + "grad_norm": 1.8570299129221488, + "learning_rate": 1.2861704673695461e-05, + "loss": 0.0941, + "step": 13599 + }, + { + "epoch": 5.5307035380235865, + "grad_norm": 14.398638117940301, + "learning_rate": 1.2860733843633346e-05, + "loss": 0.9623, + "step": 13600 + }, + { + "epoch": 5.531110207401383, + "grad_norm": 11.189848164960901, + "learning_rate": 1.2859762984204357e-05, + "loss": 0.5817, + "step": 13601 + }, + { + "epoch": 5.531516876779179, + "grad_norm": 5.932806953330759, + "learning_rate": 1.2858792095418462e-05, + "loss": 0.0969, + "step": 13602 + }, + { + "epoch": 5.531923546156975, + "grad_norm": 3.7645199197758297, + "learning_rate": 1.2857821177285627e-05, + "loss": 0.0809, + "step": 13603 + }, + { + "epoch": 5.5323302155347704, + "grad_norm": 0.657617429492689, + "learning_rate": 1.2856850229815821e-05, + "loss": 0.0092, + "step": 13604 + }, + { + "epoch": 5.532736884912566, + "grad_norm": 9.466853057750885, + "learning_rate": 1.2855879253019005e-05, + "loss": 0.2891, + "step": 13605 + }, + { + "epoch": 5.533143554290362, + "grad_norm": 3.752750962524637, + "learning_rate": 1.2854908246905154e-05, + "loss": 0.0767, + "step": 13606 + }, + { + "epoch": 5.533550223668158, + "grad_norm": 9.952762848175047, + "learning_rate": 1.2853937211484234e-05, + "loss": 0.6832, + "step": 13607 + }, + { + "epoch": 5.5339568930459535, + "grad_norm": 1.224389139930418, + "learning_rate": 1.2852966146766211e-05, + "loss": 0.0225, + "step": 13608 + }, + { + "epoch": 5.534363562423749, + "grad_norm": 7.381337170225335, + "learning_rate": 1.2851995052761055e-05, + "loss": 0.2659, + "step": 13609 + }, + { + "epoch": 5.534770231801545, + "grad_norm": 0.018193800405841388, + "learning_rate": 1.2851023929478734e-05, + "loss": 0.0002, + "step": 13610 + }, + { + "epoch": 5.535176901179341, + "grad_norm": 5.519633322242378, + "learning_rate": 1.2850052776929219e-05, + "loss": 0.1603, + "step": 13611 + }, + { + "epoch": 5.535583570557137, + "grad_norm": 8.884081616385467, + "learning_rate": 1.2849081595122472e-05, + "loss": 0.2295, + "step": 13612 + }, + { + "epoch": 5.535990239934933, + "grad_norm": 14.046748837550478, + "learning_rate": 1.2848110384068474e-05, + "loss": 0.7302, + "step": 13613 + }, + { + "epoch": 5.536396909312729, + "grad_norm": 0.5316871650222115, + "learning_rate": 1.284713914377719e-05, + "loss": 0.0093, + "step": 13614 + }, + { + "epoch": 5.536803578690525, + "grad_norm": 4.952011986301211, + "learning_rate": 1.2846167874258589e-05, + "loss": 0.0955, + "step": 13615 + }, + { + "epoch": 5.5372102480683205, + "grad_norm": 3.8911527760191458, + "learning_rate": 1.284519657552264e-05, + "loss": 0.0853, + "step": 13616 + }, + { + "epoch": 
5.537616917446116, + "grad_norm": 7.5950767624705495, + "learning_rate": 1.2844225247579318e-05, + "loss": 0.2505, + "step": 13617 + }, + { + "epoch": 5.538023586823912, + "grad_norm": 5.924105434012417, + "learning_rate": 1.2843253890438594e-05, + "loss": 0.1296, + "step": 13618 + }, + { + "epoch": 5.538430256201708, + "grad_norm": 0.1664268633813245, + "learning_rate": 1.2842282504110432e-05, + "loss": 0.0025, + "step": 13619 + }, + { + "epoch": 5.538836925579504, + "grad_norm": 2.5597924368652394, + "learning_rate": 1.2841311088604815e-05, + "loss": 0.1117, + "step": 13620 + }, + { + "epoch": 5.539243594957299, + "grad_norm": 0.5893795813293594, + "learning_rate": 1.284033964393171e-05, + "loss": 0.0115, + "step": 13621 + }, + { + "epoch": 5.539650264335096, + "grad_norm": 3.6451161361560573, + "learning_rate": 1.2839368170101086e-05, + "loss": 0.0727, + "step": 13622 + }, + { + "epoch": 5.540056933712892, + "grad_norm": 3.78644612699433, + "learning_rate": 1.283839666712292e-05, + "loss": 0.064, + "step": 13623 + }, + { + "epoch": 5.5404636030906875, + "grad_norm": 7.716530786829998, + "learning_rate": 1.2837425135007184e-05, + "loss": 0.164, + "step": 13624 + }, + { + "epoch": 5.540870272468483, + "grad_norm": 18.890435220224187, + "learning_rate": 1.283645357376385e-05, + "loss": 0.2628, + "step": 13625 + }, + { + "epoch": 5.541276941846279, + "grad_norm": 0.8058162160813371, + "learning_rate": 1.2835481983402893e-05, + "loss": 0.014, + "step": 13626 + }, + { + "epoch": 5.541683611224075, + "grad_norm": 6.75558279000114, + "learning_rate": 1.2834510363934287e-05, + "loss": 0.1631, + "step": 13627 + }, + { + "epoch": 5.542090280601871, + "grad_norm": 0.48533650704168296, + "learning_rate": 1.2833538715368005e-05, + "loss": 0.0071, + "step": 13628 + }, + { + "epoch": 5.542496949979666, + "grad_norm": 10.0382881324178, + "learning_rate": 1.2832567037714021e-05, + "loss": 0.1936, + "step": 13629 + }, + { + "epoch": 5.542903619357462, + "grad_norm": 1.8511192539057795, + "learning_rate": 1.283159533098231e-05, + "loss": 0.0108, + "step": 13630 + }, + { + "epoch": 5.543310288735258, + "grad_norm": 9.59296811333558, + "learning_rate": 1.2830623595182852e-05, + "loss": 0.2986, + "step": 13631 + }, + { + "epoch": 5.543716958113054, + "grad_norm": 0.14057933840065842, + "learning_rate": 1.2829651830325615e-05, + "loss": 0.0032, + "step": 13632 + }, + { + "epoch": 5.54412362749085, + "grad_norm": 0.714134194036529, + "learning_rate": 1.282868003642058e-05, + "loss": 0.0125, + "step": 13633 + }, + { + "epoch": 5.544530296868646, + "grad_norm": 0.29382163062899763, + "learning_rate": 1.2827708213477718e-05, + "loss": 0.0042, + "step": 13634 + }, + { + "epoch": 5.544936966246442, + "grad_norm": 4.836917134743602, + "learning_rate": 1.2826736361507008e-05, + "loss": 0.2692, + "step": 13635 + }, + { + "epoch": 5.545343635624238, + "grad_norm": 7.459718345194977, + "learning_rate": 1.282576448051843e-05, + "loss": 0.4107, + "step": 13636 + }, + { + "epoch": 5.545750305002033, + "grad_norm": 0.015926458636769603, + "learning_rate": 1.2824792570521955e-05, + "loss": 0.0003, + "step": 13637 + }, + { + "epoch": 5.546156974379829, + "grad_norm": 7.962746245773771, + "learning_rate": 1.2823820631527564e-05, + "loss": 0.2206, + "step": 13638 + }, + { + "epoch": 5.546563643757625, + "grad_norm": 7.0125402058366335, + "learning_rate": 1.2822848663545231e-05, + "loss": 0.2605, + "step": 13639 + }, + { + "epoch": 5.546970313135421, + "grad_norm": 1.7527297183749635, + "learning_rate": 1.2821876666584935e-05, + 
"loss": 0.031, + "step": 13640 + }, + { + "epoch": 5.547376982513216, + "grad_norm": 4.941780867012435, + "learning_rate": 1.282090464065666e-05, + "loss": 0.1714, + "step": 13641 + }, + { + "epoch": 5.547783651891013, + "grad_norm": 8.718789052277613, + "learning_rate": 1.2819932585770371e-05, + "loss": 0.4601, + "step": 13642 + }, + { + "epoch": 5.548190321268809, + "grad_norm": 9.00060117058162, + "learning_rate": 1.2818960501936062e-05, + "loss": 0.2745, + "step": 13643 + }, + { + "epoch": 5.548596990646605, + "grad_norm": 3.0845291165892124, + "learning_rate": 1.2817988389163703e-05, + "loss": 0.1054, + "step": 13644 + }, + { + "epoch": 5.5490036600244, + "grad_norm": 1.5601120454588902, + "learning_rate": 1.2817016247463272e-05, + "loss": 0.0242, + "step": 13645 + }, + { + "epoch": 5.549410329402196, + "grad_norm": 3.9965669153669845, + "learning_rate": 1.2816044076844754e-05, + "loss": 0.1153, + "step": 13646 + }, + { + "epoch": 5.549816998779992, + "grad_norm": 5.877793752401173, + "learning_rate": 1.2815071877318128e-05, + "loss": 0.2953, + "step": 13647 + }, + { + "epoch": 5.550223668157788, + "grad_norm": 5.714485702951296, + "learning_rate": 1.281409964889337e-05, + "loss": 0.0905, + "step": 13648 + }, + { + "epoch": 5.550630337535583, + "grad_norm": 13.850969724099281, + "learning_rate": 1.2813127391580462e-05, + "loss": 0.5716, + "step": 13649 + }, + { + "epoch": 5.551037006913379, + "grad_norm": 0.10086869303202942, + "learning_rate": 1.2812155105389387e-05, + "loss": 0.0018, + "step": 13650 + }, + { + "epoch": 5.551443676291175, + "grad_norm": 3.0528684287183236, + "learning_rate": 1.2811182790330123e-05, + "loss": 0.0436, + "step": 13651 + }, + { + "epoch": 5.551850345668971, + "grad_norm": 4.088450665028134, + "learning_rate": 1.2810210446412654e-05, + "loss": 0.1905, + "step": 13652 + }, + { + "epoch": 5.5522570150467665, + "grad_norm": 2.3945719116906172, + "learning_rate": 1.2809238073646962e-05, + "loss": 0.0441, + "step": 13653 + }, + { + "epoch": 5.552663684424563, + "grad_norm": 0.16723787039622964, + "learning_rate": 1.2808265672043023e-05, + "loss": 0.0028, + "step": 13654 + }, + { + "epoch": 5.553070353802359, + "grad_norm": 0.2726715831435269, + "learning_rate": 1.2807293241610828e-05, + "loss": 0.0034, + "step": 13655 + }, + { + "epoch": 5.553477023180155, + "grad_norm": 4.291863906304089, + "learning_rate": 1.2806320782360353e-05, + "loss": 0.136, + "step": 13656 + }, + { + "epoch": 5.5538836925579504, + "grad_norm": 9.077303350028584, + "learning_rate": 1.2805348294301581e-05, + "loss": 0.398, + "step": 13657 + }, + { + "epoch": 5.554290361935746, + "grad_norm": 5.5035496242372455, + "learning_rate": 1.28043757774445e-05, + "loss": 0.1712, + "step": 13658 + }, + { + "epoch": 5.554697031313542, + "grad_norm": 0.8633875168744078, + "learning_rate": 1.280340323179909e-05, + "loss": 0.012, + "step": 13659 + }, + { + "epoch": 5.555103700691338, + "grad_norm": 13.638176310982775, + "learning_rate": 1.2802430657375333e-05, + "loss": 0.4228, + "step": 13660 + }, + { + "epoch": 5.5555103700691335, + "grad_norm": 2.7282548290316084, + "learning_rate": 1.2801458054183216e-05, + "loss": 0.0735, + "step": 13661 + }, + { + "epoch": 5.555917039446929, + "grad_norm": 2.410622538279278, + "learning_rate": 1.280048542223272e-05, + "loss": 0.0386, + "step": 13662 + }, + { + "epoch": 5.556323708824726, + "grad_norm": 5.098576908950383, + "learning_rate": 1.2799512761533834e-05, + "loss": 0.0561, + "step": 13663 + }, + { + "epoch": 5.556730378202522, + "grad_norm": 
5.309850759538643, + "learning_rate": 1.279854007209654e-05, + "loss": 0.1115, + "step": 13664 + }, + { + "epoch": 5.5571370475803175, + "grad_norm": 0.3173687957499172, + "learning_rate": 1.2797567353930824e-05, + "loss": 0.0067, + "step": 13665 + }, + { + "epoch": 5.557543716958113, + "grad_norm": 4.450961682665836, + "learning_rate": 1.279659460704667e-05, + "loss": 0.1559, + "step": 13666 + }, + { + "epoch": 5.557950386335909, + "grad_norm": 8.126202210790227, + "learning_rate": 1.2795621831454066e-05, + "loss": 0.4169, + "step": 13667 + }, + { + "epoch": 5.558357055713705, + "grad_norm": 7.341649660611907, + "learning_rate": 1.2794649027162995e-05, + "loss": 0.1711, + "step": 13668 + }, + { + "epoch": 5.5587637250915005, + "grad_norm": 11.622232805896225, + "learning_rate": 1.2793676194183449e-05, + "loss": 0.8618, + "step": 13669 + }, + { + "epoch": 5.559170394469296, + "grad_norm": 2.808830269509346, + "learning_rate": 1.2792703332525406e-05, + "loss": 0.0805, + "step": 13670 + }, + { + "epoch": 5.559577063847092, + "grad_norm": 5.486368057632565, + "learning_rate": 1.2791730442198861e-05, + "loss": 0.0743, + "step": 13671 + }, + { + "epoch": 5.559983733224888, + "grad_norm": 1.8638866433582268, + "learning_rate": 1.2790757523213798e-05, + "loss": 0.0336, + "step": 13672 + }, + { + "epoch": 5.560390402602684, + "grad_norm": 0.6458093302548067, + "learning_rate": 1.2789784575580204e-05, + "loss": 0.012, + "step": 13673 + }, + { + "epoch": 5.56079707198048, + "grad_norm": 9.037386104887624, + "learning_rate": 1.2788811599308066e-05, + "loss": 0.3398, + "step": 13674 + }, + { + "epoch": 5.561203741358276, + "grad_norm": 7.333274270727394, + "learning_rate": 1.2787838594407375e-05, + "loss": 0.1683, + "step": 13675 + }, + { + "epoch": 5.561610410736072, + "grad_norm": 7.476089397734168, + "learning_rate": 1.278686556088812e-05, + "loss": 0.2021, + "step": 13676 + }, + { + "epoch": 5.5620170801138675, + "grad_norm": 1.4255380498602408, + "learning_rate": 1.2785892498760281e-05, + "loss": 0.0224, + "step": 13677 + }, + { + "epoch": 5.562423749491663, + "grad_norm": 5.521217266189332, + "learning_rate": 1.2784919408033859e-05, + "loss": 0.1122, + "step": 13678 + }, + { + "epoch": 5.562830418869459, + "grad_norm": 5.879522083588401, + "learning_rate": 1.2783946288718836e-05, + "loss": 0.2922, + "step": 13679 + }, + { + "epoch": 5.563237088247255, + "grad_norm": 0.8677325902056996, + "learning_rate": 1.2782973140825201e-05, + "loss": 0.0288, + "step": 13680 + }, + { + "epoch": 5.563643757625051, + "grad_norm": 1.4364201756096378, + "learning_rate": 1.2781999964362948e-05, + "loss": 0.0241, + "step": 13681 + }, + { + "epoch": 5.564050427002846, + "grad_norm": 5.103887074517488, + "learning_rate": 1.2781026759342065e-05, + "loss": 0.0935, + "step": 13682 + }, + { + "epoch": 5.564457096380643, + "grad_norm": 10.743187341109163, + "learning_rate": 1.2780053525772542e-05, + "loss": 0.2626, + "step": 13683 + }, + { + "epoch": 5.564863765758439, + "grad_norm": 2.9954173403241415, + "learning_rate": 1.2779080263664371e-05, + "loss": 0.0417, + "step": 13684 + }, + { + "epoch": 5.5652704351362345, + "grad_norm": 0.1860770334257151, + "learning_rate": 1.2778106973027545e-05, + "loss": 0.0027, + "step": 13685 + }, + { + "epoch": 5.56567710451403, + "grad_norm": 6.0735573880807205, + "learning_rate": 1.277713365387205e-05, + "loss": 0.1257, + "step": 13686 + }, + { + "epoch": 5.566083773891826, + "grad_norm": 0.6877551308338776, + "learning_rate": 1.277616030620788e-05, + "loss": 0.0079, + "step": 13687 
+ }, + { + "epoch": 5.566490443269622, + "grad_norm": 13.868452718935439, + "learning_rate": 1.2775186930045026e-05, + "loss": 0.6631, + "step": 13688 + }, + { + "epoch": 5.566897112647418, + "grad_norm": 2.4392349610517106, + "learning_rate": 1.2774213525393486e-05, + "loss": 0.037, + "step": 13689 + }, + { + "epoch": 5.567303782025213, + "grad_norm": 5.556904415676431, + "learning_rate": 1.2773240092263244e-05, + "loss": 0.1982, + "step": 13690 + }, + { + "epoch": 5.567710451403009, + "grad_norm": 0.10725119940386804, + "learning_rate": 1.2772266630664298e-05, + "loss": 0.0018, + "step": 13691 + }, + { + "epoch": 5.568117120780805, + "grad_norm": 6.617058025340976, + "learning_rate": 1.277129314060664e-05, + "loss": 0.088, + "step": 13692 + }, + { + "epoch": 5.568523790158601, + "grad_norm": 6.639496370826649, + "learning_rate": 1.2770319622100262e-05, + "loss": 0.1998, + "step": 13693 + }, + { + "epoch": 5.568930459536396, + "grad_norm": 5.383822854605633, + "learning_rate": 1.276934607515516e-05, + "loss": 0.1176, + "step": 13694 + }, + { + "epoch": 5.569337128914193, + "grad_norm": 1.2398660524999952, + "learning_rate": 1.2768372499781328e-05, + "loss": 0.0217, + "step": 13695 + }, + { + "epoch": 5.569743798291989, + "grad_norm": 5.332208140464262, + "learning_rate": 1.2767398895988757e-05, + "loss": 0.1931, + "step": 13696 + }, + { + "epoch": 5.570150467669785, + "grad_norm": 1.3296381660407022, + "learning_rate": 1.2766425263787446e-05, + "loss": 0.0225, + "step": 13697 + }, + { + "epoch": 5.57055713704758, + "grad_norm": 6.940148537959465, + "learning_rate": 1.2765451603187384e-05, + "loss": 0.1562, + "step": 13698 + }, + { + "epoch": 5.570963806425376, + "grad_norm": 15.539303909582246, + "learning_rate": 1.276447791419857e-05, + "loss": 0.4064, + "step": 13699 + }, + { + "epoch": 5.571370475803172, + "grad_norm": 9.570117174996879, + "learning_rate": 1.2763504196831e-05, + "loss": 0.3983, + "step": 13700 + }, + { + "epoch": 5.571777145180968, + "grad_norm": 7.992178590994239, + "learning_rate": 1.2762530451094671e-05, + "loss": 0.2878, + "step": 13701 + }, + { + "epoch": 5.572183814558763, + "grad_norm": 6.341567609987995, + "learning_rate": 1.2761556676999572e-05, + "loss": 0.2526, + "step": 13702 + }, + { + "epoch": 5.572590483936559, + "grad_norm": 0.12422655464493407, + "learning_rate": 1.2760582874555704e-05, + "loss": 0.0024, + "step": 13703 + }, + { + "epoch": 5.572997153314356, + "grad_norm": 4.39728034842289, + "learning_rate": 1.2759609043773067e-05, + "loss": 0.1425, + "step": 13704 + }, + { + "epoch": 5.573403822692152, + "grad_norm": 0.2058815084469825, + "learning_rate": 1.2758635184661654e-05, + "loss": 0.0038, + "step": 13705 + }, + { + "epoch": 5.573810492069947, + "grad_norm": 4.636502268189052, + "learning_rate": 1.2757661297231459e-05, + "loss": 0.0838, + "step": 13706 + }, + { + "epoch": 5.574217161447743, + "grad_norm": 7.58599126120408, + "learning_rate": 1.2756687381492484e-05, + "loss": 0.3428, + "step": 13707 + }, + { + "epoch": 5.574623830825539, + "grad_norm": 8.793065025273007, + "learning_rate": 1.2755713437454727e-05, + "loss": 0.2797, + "step": 13708 + }, + { + "epoch": 5.575030500203335, + "grad_norm": 13.310049814863476, + "learning_rate": 1.2754739465128184e-05, + "loss": 0.606, + "step": 13709 + }, + { + "epoch": 5.5754371695811304, + "grad_norm": 6.4624712019338775, + "learning_rate": 1.2753765464522854e-05, + "loss": 0.4221, + "step": 13710 + }, + { + "epoch": 5.575843838958926, + "grad_norm": 11.506818649944627, + "learning_rate": 
1.2752791435648735e-05, + "loss": 0.6024, + "step": 13711 + }, + { + "epoch": 5.576250508336722, + "grad_norm": 0.4777562741860915, + "learning_rate": 1.2751817378515824e-05, + "loss": 0.0078, + "step": 13712 + }, + { + "epoch": 5.576657177714518, + "grad_norm": 2.083013109119046, + "learning_rate": 1.2750843293134128e-05, + "loss": 0.0264, + "step": 13713 + }, + { + "epoch": 5.5770638470923135, + "grad_norm": 0.05919624612082046, + "learning_rate": 1.2749869179513639e-05, + "loss": 0.0007, + "step": 13714 + }, + { + "epoch": 5.57747051647011, + "grad_norm": 0.9105697285552453, + "learning_rate": 1.2748895037664357e-05, + "loss": 0.0119, + "step": 13715 + }, + { + "epoch": 5.577877185847906, + "grad_norm": 11.586052692095343, + "learning_rate": 1.2747920867596287e-05, + "loss": 0.4102, + "step": 13716 + }, + { + "epoch": 5.578283855225702, + "grad_norm": 8.298713424277803, + "learning_rate": 1.2746946669319425e-05, + "loss": 0.3047, + "step": 13717 + }, + { + "epoch": 5.5786905246034975, + "grad_norm": 4.5772692286674275, + "learning_rate": 1.2745972442843773e-05, + "loss": 0.1993, + "step": 13718 + }, + { + "epoch": 5.579097193981293, + "grad_norm": 0.19647906455320777, + "learning_rate": 1.2744998188179332e-05, + "loss": 0.0029, + "step": 13719 + }, + { + "epoch": 5.579503863359089, + "grad_norm": 0.10444170406231598, + "learning_rate": 1.2744023905336102e-05, + "loss": 0.0016, + "step": 13720 + }, + { + "epoch": 5.579910532736885, + "grad_norm": 8.040895442143265, + "learning_rate": 1.2743049594324087e-05, + "loss": 0.2199, + "step": 13721 + }, + { + "epoch": 5.5803172021146805, + "grad_norm": 1.874507783923276, + "learning_rate": 1.2742075255153286e-05, + "loss": 0.0244, + "step": 13722 + }, + { + "epoch": 5.580723871492476, + "grad_norm": 4.224484904669378, + "learning_rate": 1.2741100887833703e-05, + "loss": 0.1467, + "step": 13723 + }, + { + "epoch": 5.581130540870273, + "grad_norm": 2.8009360434168253, + "learning_rate": 1.2740126492375339e-05, + "loss": 0.0499, + "step": 13724 + }, + { + "epoch": 5.581537210248069, + "grad_norm": 8.30287581933289, + "learning_rate": 1.2739152068788198e-05, + "loss": 0.1734, + "step": 13725 + }, + { + "epoch": 5.5819438796258645, + "grad_norm": 1.6968482596342747, + "learning_rate": 1.2738177617082284e-05, + "loss": 0.0383, + "step": 13726 + }, + { + "epoch": 5.58235054900366, + "grad_norm": 4.137157620484192, + "learning_rate": 1.2737203137267597e-05, + "loss": 0.0671, + "step": 13727 + }, + { + "epoch": 5.582757218381456, + "grad_norm": 7.5656051822199535, + "learning_rate": 1.2736228629354142e-05, + "loss": 0.3473, + "step": 13728 + }, + { + "epoch": 5.583163887759252, + "grad_norm": 4.456608938188857, + "learning_rate": 1.2735254093351924e-05, + "loss": 0.1035, + "step": 13729 + }, + { + "epoch": 5.5835705571370475, + "grad_norm": 4.018787006769579, + "learning_rate": 1.2734279529270945e-05, + "loss": 0.0355, + "step": 13730 + }, + { + "epoch": 5.583977226514843, + "grad_norm": 1.4279687249807886, + "learning_rate": 1.2733304937121213e-05, + "loss": 0.0323, + "step": 13731 + }, + { + "epoch": 5.584383895892639, + "grad_norm": 3.871545197449017, + "learning_rate": 1.2732330316912726e-05, + "loss": 0.2451, + "step": 13732 + }, + { + "epoch": 5.584790565270435, + "grad_norm": 13.4609162825286, + "learning_rate": 1.2731355668655496e-05, + "loss": 0.323, + "step": 13733 + }, + { + "epoch": 5.585197234648231, + "grad_norm": 4.206646417015794, + "learning_rate": 1.2730380992359528e-05, + "loss": 0.1254, + "step": 13734 + }, + { + "epoch": 
5.585603904026026, + "grad_norm": 0.4322069135692536, + "learning_rate": 1.272940628803482e-05, + "loss": 0.0041, + "step": 13735 + }, + { + "epoch": 5.586010573403823, + "grad_norm": 8.358039757743388, + "learning_rate": 1.2728431555691386e-05, + "loss": 0.3072, + "step": 13736 + }, + { + "epoch": 5.586417242781619, + "grad_norm": 6.232987815411985, + "learning_rate": 1.2727456795339225e-05, + "loss": 0.1019, + "step": 13737 + }, + { + "epoch": 5.5868239121594145, + "grad_norm": 3.3409663991229714, + "learning_rate": 1.272648200698835e-05, + "loss": 0.0666, + "step": 13738 + }, + { + "epoch": 5.58723058153721, + "grad_norm": 4.498846454256768, + "learning_rate": 1.2725507190648764e-05, + "loss": 0.0618, + "step": 13739 + }, + { + "epoch": 5.587637250915006, + "grad_norm": 3.7302719401638624, + "learning_rate": 1.2724532346330478e-05, + "loss": 0.0996, + "step": 13740 + }, + { + "epoch": 5.588043920292802, + "grad_norm": 0.670190316399799, + "learning_rate": 1.2723557474043495e-05, + "loss": 0.014, + "step": 13741 + }, + { + "epoch": 5.588450589670598, + "grad_norm": 0.9336412830265495, + "learning_rate": 1.2722582573797821e-05, + "loss": 0.0165, + "step": 13742 + }, + { + "epoch": 5.588857259048393, + "grad_norm": 6.525129707921775, + "learning_rate": 1.272160764560347e-05, + "loss": 0.1958, + "step": 13743 + }, + { + "epoch": 5.589263928426189, + "grad_norm": 4.525001745582196, + "learning_rate": 1.2720632689470444e-05, + "loss": 0.1048, + "step": 13744 + }, + { + "epoch": 5.589670597803986, + "grad_norm": 0.39588981895402126, + "learning_rate": 1.2719657705408754e-05, + "loss": 0.0052, + "step": 13745 + }, + { + "epoch": 5.5900772671817816, + "grad_norm": 7.052355264414161, + "learning_rate": 1.2718682693428413e-05, + "loss": 0.4324, + "step": 13746 + }, + { + "epoch": 5.590483936559577, + "grad_norm": 1.1581403799634935, + "learning_rate": 1.2717707653539424e-05, + "loss": 0.0215, + "step": 13747 + }, + { + "epoch": 5.590890605937373, + "grad_norm": 2.967039676886622, + "learning_rate": 1.2716732585751798e-05, + "loss": 0.0453, + "step": 13748 + }, + { + "epoch": 5.591297275315169, + "grad_norm": 3.4201500871539157, + "learning_rate": 1.2715757490075544e-05, + "loss": 0.0617, + "step": 13749 + }, + { + "epoch": 5.591703944692965, + "grad_norm": 1.4219813522827862, + "learning_rate": 1.2714782366520673e-05, + "loss": 0.0142, + "step": 13750 + }, + { + "epoch": 5.59211061407076, + "grad_norm": 1.4635655576974067, + "learning_rate": 1.2713807215097194e-05, + "loss": 0.027, + "step": 13751 + }, + { + "epoch": 5.592517283448556, + "grad_norm": 8.953546890251467, + "learning_rate": 1.2712832035815117e-05, + "loss": 0.2025, + "step": 13752 + }, + { + "epoch": 5.592923952826352, + "grad_norm": 0.10496577410451717, + "learning_rate": 1.2711856828684458e-05, + "loss": 0.0016, + "step": 13753 + }, + { + "epoch": 5.593330622204148, + "grad_norm": 0.07045881173765914, + "learning_rate": 1.2710881593715219e-05, + "loss": 0.0011, + "step": 13754 + }, + { + "epoch": 5.593737291581943, + "grad_norm": 6.2965860832988545, + "learning_rate": 1.2709906330917419e-05, + "loss": 0.1007, + "step": 13755 + }, + { + "epoch": 5.59414396095974, + "grad_norm": 3.781200085367891, + "learning_rate": 1.2708931040301065e-05, + "loss": 0.1404, + "step": 13756 + }, + { + "epoch": 5.594550630337536, + "grad_norm": 3.7618234768121184, + "learning_rate": 1.270795572187617e-05, + "loss": 0.0617, + "step": 13757 + }, + { + "epoch": 5.594957299715332, + "grad_norm": 3.0934768555652608, + "learning_rate": 
1.2706980375652748e-05, + "loss": 0.0442, + "step": 13758 + }, + { + "epoch": 5.595363969093127, + "grad_norm": 2.0221328795081015, + "learning_rate": 1.2706005001640812e-05, + "loss": 0.0246, + "step": 13759 + }, + { + "epoch": 5.595770638470923, + "grad_norm": 0.7733129571116419, + "learning_rate": 1.2705029599850368e-05, + "loss": 0.0106, + "step": 13760 + }, + { + "epoch": 5.596177307848719, + "grad_norm": 6.002064478904459, + "learning_rate": 1.2704054170291437e-05, + "loss": 0.1095, + "step": 13761 + }, + { + "epoch": 5.596583977226515, + "grad_norm": 3.679029719913606, + "learning_rate": 1.270307871297403e-05, + "loss": 0.0974, + "step": 13762 + }, + { + "epoch": 5.5969906466043104, + "grad_norm": 2.66345931336565, + "learning_rate": 1.2702103227908158e-05, + "loss": 0.0544, + "step": 13763 + }, + { + "epoch": 5.597397315982106, + "grad_norm": 5.735943833331792, + "learning_rate": 1.2701127715103835e-05, + "loss": 0.2995, + "step": 13764 + }, + { + "epoch": 5.597803985359903, + "grad_norm": 0.927226718679574, + "learning_rate": 1.270015217457108e-05, + "loss": 0.0163, + "step": 13765 + }, + { + "epoch": 5.598210654737699, + "grad_norm": 1.406332070303551, + "learning_rate": 1.26991766063199e-05, + "loss": 0.0338, + "step": 13766 + }, + { + "epoch": 5.598617324115494, + "grad_norm": 1.2860441812767356, + "learning_rate": 1.2698201010360317e-05, + "loss": 0.05, + "step": 13767 + }, + { + "epoch": 5.59902399349329, + "grad_norm": 3.6303314755489975, + "learning_rate": 1.269722538670234e-05, + "loss": 0.0768, + "step": 13768 + }, + { + "epoch": 5.599430662871086, + "grad_norm": 5.8867192141805464, + "learning_rate": 1.269624973535599e-05, + "loss": 0.1914, + "step": 13769 + }, + { + "epoch": 5.599837332248882, + "grad_norm": 8.994117130617484, + "learning_rate": 1.2695274056331275e-05, + "loss": 0.3229, + "step": 13770 + }, + { + "epoch": 5.6002440016266775, + "grad_norm": 5.7715013660303445, + "learning_rate": 1.2694298349638219e-05, + "loss": 0.1434, + "step": 13771 + }, + { + "epoch": 5.600650671004473, + "grad_norm": 10.878667734294867, + "learning_rate": 1.2693322615286835e-05, + "loss": 0.3354, + "step": 13772 + }, + { + "epoch": 5.601057340382269, + "grad_norm": 3.9353674169710726, + "learning_rate": 1.2692346853287135e-05, + "loss": 0.1435, + "step": 13773 + }, + { + "epoch": 5.601464009760065, + "grad_norm": 2.7010366332868725, + "learning_rate": 1.2691371063649143e-05, + "loss": 0.0651, + "step": 13774 + }, + { + "epoch": 5.6018706791378605, + "grad_norm": 9.653238741115683, + "learning_rate": 1.2690395246382871e-05, + "loss": 0.0883, + "step": 13775 + }, + { + "epoch": 5.602277348515656, + "grad_norm": 5.667263153659175, + "learning_rate": 1.268941940149834e-05, + "loss": 0.1723, + "step": 13776 + }, + { + "epoch": 5.602684017893453, + "grad_norm": 3.0531942887683603, + "learning_rate": 1.268844352900556e-05, + "loss": 0.0482, + "step": 13777 + }, + { + "epoch": 5.603090687271249, + "grad_norm": 0.6008903347756712, + "learning_rate": 1.2687467628914558e-05, + "loss": 0.0097, + "step": 13778 + }, + { + "epoch": 5.6034973566490445, + "grad_norm": 0.12512942844945502, + "learning_rate": 1.2686491701235348e-05, + "loss": 0.0018, + "step": 13779 + }, + { + "epoch": 5.60390402602684, + "grad_norm": 12.006420112474077, + "learning_rate": 1.2685515745977948e-05, + "loss": 0.5006, + "step": 13780 + }, + { + "epoch": 5.604310695404636, + "grad_norm": 4.371918680340598, + "learning_rate": 1.2684539763152378e-05, + "loss": 0.1712, + "step": 13781 + }, + { + "epoch": 5.604717364782432, + 
"grad_norm": 10.797708555428425, + "learning_rate": 1.2683563752768655e-05, + "loss": 0.3826, + "step": 13782 + }, + { + "epoch": 5.6051240341602275, + "grad_norm": 0.14119401779720045, + "learning_rate": 1.26825877148368e-05, + "loss": 0.0016, + "step": 13783 + }, + { + "epoch": 5.605530703538023, + "grad_norm": 11.594847421876478, + "learning_rate": 1.2681611649366832e-05, + "loss": 0.0606, + "step": 13784 + }, + { + "epoch": 5.605937372915819, + "grad_norm": 5.111715976179633, + "learning_rate": 1.2680635556368773e-05, + "loss": 0.148, + "step": 13785 + }, + { + "epoch": 5.606344042293616, + "grad_norm": 8.850206599124052, + "learning_rate": 1.2679659435852635e-05, + "loss": 0.3044, + "step": 13786 + }, + { + "epoch": 5.6067507116714115, + "grad_norm": 1.3851506158905909, + "learning_rate": 1.2678683287828451e-05, + "loss": 0.0109, + "step": 13787 + }, + { + "epoch": 5.607157381049207, + "grad_norm": 3.739985959234371, + "learning_rate": 1.2677707112306233e-05, + "loss": 0.0752, + "step": 13788 + }, + { + "epoch": 5.607564050427003, + "grad_norm": 7.6583018312365825, + "learning_rate": 1.2676730909296002e-05, + "loss": 0.2562, + "step": 13789 + }, + { + "epoch": 5.607970719804799, + "grad_norm": 0.09530022136515542, + "learning_rate": 1.2675754678807781e-05, + "loss": 0.0019, + "step": 13790 + }, + { + "epoch": 5.6083773891825945, + "grad_norm": 0.03401868294888287, + "learning_rate": 1.2674778420851592e-05, + "loss": 0.0007, + "step": 13791 + }, + { + "epoch": 5.60878405856039, + "grad_norm": 4.541998819335282, + "learning_rate": 1.2673802135437459e-05, + "loss": 0.0725, + "step": 13792 + }, + { + "epoch": 5.609190727938186, + "grad_norm": 3.1880199392385427, + "learning_rate": 1.2672825822575396e-05, + "loss": 0.0625, + "step": 13793 + }, + { + "epoch": 5.609597397315982, + "grad_norm": 9.10201708022712, + "learning_rate": 1.2671849482275436e-05, + "loss": 0.2219, + "step": 13794 + }, + { + "epoch": 5.610004066693778, + "grad_norm": 10.975503877077932, + "learning_rate": 1.2670873114547595e-05, + "loss": 0.5122, + "step": 13795 + }, + { + "epoch": 5.610410736071573, + "grad_norm": 17.093791736712227, + "learning_rate": 1.2669896719401894e-05, + "loss": 0.4725, + "step": 13796 + }, + { + "epoch": 5.61081740544937, + "grad_norm": 3.17177868941977, + "learning_rate": 1.2668920296848364e-05, + "loss": 0.0382, + "step": 13797 + }, + { + "epoch": 5.611224074827166, + "grad_norm": 0.22665625243965692, + "learning_rate": 1.2667943846897024e-05, + "loss": 0.0053, + "step": 13798 + }, + { + "epoch": 5.6116307442049616, + "grad_norm": 16.15721520486607, + "learning_rate": 1.2666967369557892e-05, + "loss": 0.1609, + "step": 13799 + }, + { + "epoch": 5.612037413582757, + "grad_norm": 0.021508384156209693, + "learning_rate": 1.2665990864841003e-05, + "loss": 0.0004, + "step": 13800 + }, + { + "epoch": 5.612444082960553, + "grad_norm": 4.596618924955986, + "learning_rate": 1.2665014332756376e-05, + "loss": 0.1115, + "step": 13801 + }, + { + "epoch": 5.612850752338349, + "grad_norm": 2.665389993421583, + "learning_rate": 1.266403777331403e-05, + "loss": 0.013, + "step": 13802 + }, + { + "epoch": 5.613257421716145, + "grad_norm": 0.34591324164784015, + "learning_rate": 1.2663061186524001e-05, + "loss": 0.0052, + "step": 13803 + }, + { + "epoch": 5.61366409109394, + "grad_norm": 0.0478229998248779, + "learning_rate": 1.2662084572396307e-05, + "loss": 0.0008, + "step": 13804 + }, + { + "epoch": 5.614070760471736, + "grad_norm": 9.302954351406052, + "learning_rate": 1.2661107930940975e-05, + "loss": 
0.0862, + "step": 13805 + }, + { + "epoch": 5.614477429849533, + "grad_norm": 2.391183431467633, + "learning_rate": 1.266013126216803e-05, + "loss": 0.0201, + "step": 13806 + }, + { + "epoch": 5.614884099227329, + "grad_norm": 7.173406984937978, + "learning_rate": 1.2659154566087499e-05, + "loss": 0.2289, + "step": 13807 + }, + { + "epoch": 5.615290768605124, + "grad_norm": 8.546017355751363, + "learning_rate": 1.2658177842709409e-05, + "loss": 0.1886, + "step": 13808 + }, + { + "epoch": 5.61569743798292, + "grad_norm": 5.388369722137445, + "learning_rate": 1.2657201092043785e-05, + "loss": 0.1216, + "step": 13809 + }, + { + "epoch": 5.616104107360716, + "grad_norm": 9.73982264977799, + "learning_rate": 1.2656224314100655e-05, + "loss": 0.3086, + "step": 13810 + }, + { + "epoch": 5.616510776738512, + "grad_norm": 1.4820316504570668, + "learning_rate": 1.2655247508890045e-05, + "loss": 0.0239, + "step": 13811 + }, + { + "epoch": 5.616917446116307, + "grad_norm": 12.74495189437677, + "learning_rate": 1.2654270676421982e-05, + "loss": 0.4355, + "step": 13812 + }, + { + "epoch": 5.617324115494103, + "grad_norm": 8.145894425877044, + "learning_rate": 1.2653293816706498e-05, + "loss": 0.1301, + "step": 13813 + }, + { + "epoch": 5.617730784871899, + "grad_norm": 9.412164908900529, + "learning_rate": 1.2652316929753617e-05, + "loss": 0.3323, + "step": 13814 + }, + { + "epoch": 5.618137454249695, + "grad_norm": 16.193514190879743, + "learning_rate": 1.2651340015573363e-05, + "loss": 0.8977, + "step": 13815 + }, + { + "epoch": 5.6185441236274904, + "grad_norm": 0.51954630814742, + "learning_rate": 1.2650363074175773e-05, + "loss": 0.0075, + "step": 13816 + }, + { + "epoch": 5.618950793005286, + "grad_norm": 4.059512650640294, + "learning_rate": 1.264938610557087e-05, + "loss": 0.1227, + "step": 13817 + }, + { + "epoch": 5.619357462383083, + "grad_norm": 0.039933854918123984, + "learning_rate": 1.2648409109768687e-05, + "loss": 0.0007, + "step": 13818 + }, + { + "epoch": 5.619764131760879, + "grad_norm": 3.307635569683271, + "learning_rate": 1.2647432086779252e-05, + "loss": 0.0825, + "step": 13819 + }, + { + "epoch": 5.620170801138674, + "grad_norm": 10.194467748160188, + "learning_rate": 1.2646455036612594e-05, + "loss": 0.3291, + "step": 13820 + }, + { + "epoch": 5.62057747051647, + "grad_norm": 0.2890392299940073, + "learning_rate": 1.2645477959278745e-05, + "loss": 0.004, + "step": 13821 + }, + { + "epoch": 5.620984139894266, + "grad_norm": 3.639676399759413, + "learning_rate": 1.264450085478773e-05, + "loss": 0.0854, + "step": 13822 + }, + { + "epoch": 5.621390809272062, + "grad_norm": 6.482872772712375, + "learning_rate": 1.2643523723149584e-05, + "loss": 0.1395, + "step": 13823 + }, + { + "epoch": 5.6217974786498575, + "grad_norm": 0.39717001136089686, + "learning_rate": 1.2642546564374336e-05, + "loss": 0.0085, + "step": 13824 + }, + { + "epoch": 5.622204148027653, + "grad_norm": 5.164383182148671, + "learning_rate": 1.2641569378472017e-05, + "loss": 0.0909, + "step": 13825 + }, + { + "epoch": 5.622610817405449, + "grad_norm": 4.98592596976233, + "learning_rate": 1.264059216545266e-05, + "loss": 0.1192, + "step": 13826 + }, + { + "epoch": 5.623017486783246, + "grad_norm": 9.413673946008775, + "learning_rate": 1.2639614925326296e-05, + "loss": 0.2522, + "step": 13827 + }, + { + "epoch": 5.623424156161041, + "grad_norm": 3.3563282402765826, + "learning_rate": 1.2638637658102954e-05, + "loss": 0.0695, + "step": 13828 + }, + { + "epoch": 5.623830825538837, + "grad_norm": 9.947203954493876, + 
"learning_rate": 1.2637660363792672e-05, + "loss": 0.3698, + "step": 13829 + }, + { + "epoch": 5.624237494916633, + "grad_norm": 9.573989238384938, + "learning_rate": 1.2636683042405477e-05, + "loss": 0.2827, + "step": 13830 + }, + { + "epoch": 5.624644164294429, + "grad_norm": 0.34442588718595185, + "learning_rate": 1.26357056939514e-05, + "loss": 0.0051, + "step": 13831 + }, + { + "epoch": 5.6250508336722245, + "grad_norm": 5.22496792320565, + "learning_rate": 1.2634728318440484e-05, + "loss": 0.0679, + "step": 13832 + }, + { + "epoch": 5.62545750305002, + "grad_norm": 0.05760779325632161, + "learning_rate": 1.2633750915882752e-05, + "loss": 0.001, + "step": 13833 + }, + { + "epoch": 5.625864172427816, + "grad_norm": 4.161932618105942, + "learning_rate": 1.2632773486288244e-05, + "loss": 0.0942, + "step": 13834 + }, + { + "epoch": 5.626270841805612, + "grad_norm": 3.7771058730085265, + "learning_rate": 1.2631796029666987e-05, + "loss": 0.0681, + "step": 13835 + }, + { + "epoch": 5.6266775111834075, + "grad_norm": 21.487441842792137, + "learning_rate": 1.2630818546029022e-05, + "loss": 0.3702, + "step": 13836 + }, + { + "epoch": 5.627084180561203, + "grad_norm": 8.627830852812478, + "learning_rate": 1.262984103538438e-05, + "loss": 0.1076, + "step": 13837 + }, + { + "epoch": 5.627490849939, + "grad_norm": 8.652921402465747, + "learning_rate": 1.2628863497743096e-05, + "loss": 0.2382, + "step": 13838 + }, + { + "epoch": 5.627897519316796, + "grad_norm": 15.975610830601363, + "learning_rate": 1.2627885933115205e-05, + "loss": 0.3464, + "step": 13839 + }, + { + "epoch": 5.6283041886945915, + "grad_norm": 3.9401161290226505, + "learning_rate": 1.2626908341510745e-05, + "loss": 0.0701, + "step": 13840 + }, + { + "epoch": 5.628710858072387, + "grad_norm": 3.270173637443681, + "learning_rate": 1.2625930722939743e-05, + "loss": 0.0638, + "step": 13841 + }, + { + "epoch": 5.629117527450183, + "grad_norm": 0.11204103607366793, + "learning_rate": 1.2624953077412245e-05, + "loss": 0.0017, + "step": 13842 + }, + { + "epoch": 5.629524196827979, + "grad_norm": 8.088521873691342, + "learning_rate": 1.2623975404938283e-05, + "loss": 0.1404, + "step": 13843 + }, + { + "epoch": 5.6299308662057745, + "grad_norm": 0.6743087109054817, + "learning_rate": 1.2622997705527888e-05, + "loss": 0.0102, + "step": 13844 + }, + { + "epoch": 5.63033753558357, + "grad_norm": 4.321091429291293, + "learning_rate": 1.2622019979191106e-05, + "loss": 0.0519, + "step": 13845 + }, + { + "epoch": 5.630744204961366, + "grad_norm": 7.076059396242298, + "learning_rate": 1.2621042225937968e-05, + "loss": 0.2983, + "step": 13846 + }, + { + "epoch": 5.631150874339163, + "grad_norm": 0.2119140197342639, + "learning_rate": 1.2620064445778513e-05, + "loss": 0.0042, + "step": 13847 + }, + { + "epoch": 5.6315575437169585, + "grad_norm": 1.0088343959130293, + "learning_rate": 1.2619086638722777e-05, + "loss": 0.0189, + "step": 13848 + }, + { + "epoch": 5.631964213094754, + "grad_norm": 8.45596641770239, + "learning_rate": 1.26181088047808e-05, + "loss": 0.2053, + "step": 13849 + }, + { + "epoch": 5.63237088247255, + "grad_norm": 1.2388121881760168, + "learning_rate": 1.2617130943962617e-05, + "loss": 0.0115, + "step": 13850 + }, + { + "epoch": 5.632777551850346, + "grad_norm": 7.495683950560397, + "learning_rate": 1.2616153056278266e-05, + "loss": 0.2511, + "step": 13851 + }, + { + "epoch": 5.6331842212281416, + "grad_norm": 6.766679474862011, + "learning_rate": 1.2615175141737791e-05, + "loss": 0.5882, + "step": 13852 + }, + { + "epoch": 
5.633590890605937, + "grad_norm": 9.028034824625434, + "learning_rate": 1.2614197200351226e-05, + "loss": 0.3151, + "step": 13853 + }, + { + "epoch": 5.633997559983733, + "grad_norm": 3.2487552888219224, + "learning_rate": 1.2613219232128608e-05, + "loss": 0.1071, + "step": 13854 + }, + { + "epoch": 5.634404229361529, + "grad_norm": 1.902126070746049, + "learning_rate": 1.2612241237079983e-05, + "loss": 0.0276, + "step": 13855 + }, + { + "epoch": 5.634810898739325, + "grad_norm": 1.063782715989855, + "learning_rate": 1.2611263215215386e-05, + "loss": 0.0195, + "step": 13856 + }, + { + "epoch": 5.63521756811712, + "grad_norm": 8.08817376171106, + "learning_rate": 1.2610285166544856e-05, + "loss": 0.1658, + "step": 13857 + }, + { + "epoch": 5.635624237494916, + "grad_norm": 2.389310992340646, + "learning_rate": 1.2609307091078436e-05, + "loss": 0.0499, + "step": 13858 + }, + { + "epoch": 5.636030906872713, + "grad_norm": 1.894142244160897, + "learning_rate": 1.2608328988826167e-05, + "loss": 0.0318, + "step": 13859 + }, + { + "epoch": 5.636437576250509, + "grad_norm": 4.465641529898688, + "learning_rate": 1.2607350859798085e-05, + "loss": 0.0679, + "step": 13860 + }, + { + "epoch": 5.636844245628304, + "grad_norm": 8.787075713348184, + "learning_rate": 1.2606372704004238e-05, + "loss": 0.1053, + "step": 13861 + }, + { + "epoch": 5.6372509150061, + "grad_norm": 14.044158078473195, + "learning_rate": 1.2605394521454661e-05, + "loss": 0.395, + "step": 13862 + }, + { + "epoch": 5.637657584383896, + "grad_norm": 1.1917861455920762, + "learning_rate": 1.2604416312159398e-05, + "loss": 0.0231, + "step": 13863 + }, + { + "epoch": 5.638064253761692, + "grad_norm": 0.7044970960712276, + "learning_rate": 1.2603438076128491e-05, + "loss": 0.0085, + "step": 13864 + }, + { + "epoch": 5.638470923139487, + "grad_norm": 7.969247096231953, + "learning_rate": 1.2602459813371983e-05, + "loss": 0.3639, + "step": 13865 + }, + { + "epoch": 5.638877592517283, + "grad_norm": 4.750544902151582, + "learning_rate": 1.2601481523899914e-05, + "loss": 0.097, + "step": 13866 + }, + { + "epoch": 5.639284261895079, + "grad_norm": 0.4909296938972388, + "learning_rate": 1.2600503207722327e-05, + "loss": 0.0063, + "step": 13867 + }, + { + "epoch": 5.639690931272876, + "grad_norm": 1.8284143049316175, + "learning_rate": 1.2599524864849268e-05, + "loss": 0.0288, + "step": 13868 + }, + { + "epoch": 5.640097600650671, + "grad_norm": 13.098541683635682, + "learning_rate": 1.2598546495290778e-05, + "loss": 0.4422, + "step": 13869 + }, + { + "epoch": 5.640504270028467, + "grad_norm": 9.158364029331365, + "learning_rate": 1.2597568099056898e-05, + "loss": 0.1706, + "step": 13870 + }, + { + "epoch": 5.640910939406263, + "grad_norm": 0.009537953158076214, + "learning_rate": 1.2596589676157676e-05, + "loss": 0.0001, + "step": 13871 + }, + { + "epoch": 5.641317608784059, + "grad_norm": 9.11886748145652, + "learning_rate": 1.2595611226603155e-05, + "loss": 0.4624, + "step": 13872 + }, + { + "epoch": 5.641724278161854, + "grad_norm": 9.753951464279151, + "learning_rate": 1.2594632750403375e-05, + "loss": 0.5038, + "step": 13873 + }, + { + "epoch": 5.64213094753965, + "grad_norm": 4.170632520938246, + "learning_rate": 1.2593654247568386e-05, + "loss": 0.2849, + "step": 13874 + }, + { + "epoch": 5.642537616917446, + "grad_norm": 1.4575475432469305, + "learning_rate": 1.2592675718108231e-05, + "loss": 0.026, + "step": 13875 + }, + { + "epoch": 5.642944286295242, + "grad_norm": 3.8189629595688803, + "learning_rate": 1.2591697162032954e-05, + 
"loss": 0.0763, + "step": 13876 + }, + { + "epoch": 5.6433509556730375, + "grad_norm": 6.152426826733582, + "learning_rate": 1.2590718579352603e-05, + "loss": 0.2773, + "step": 13877 + }, + { + "epoch": 5.643757625050833, + "grad_norm": 1.2959873828362718, + "learning_rate": 1.2589739970077221e-05, + "loss": 0.0262, + "step": 13878 + }, + { + "epoch": 5.64416429442863, + "grad_norm": 4.164946668154611, + "learning_rate": 1.2588761334216855e-05, + "loss": 0.1504, + "step": 13879 + }, + { + "epoch": 5.644570963806426, + "grad_norm": 2.2967101054339034, + "learning_rate": 1.258778267178155e-05, + "loss": 0.039, + "step": 13880 + }, + { + "epoch": 5.644977633184221, + "grad_norm": 8.680337009991526, + "learning_rate": 1.2586803982781355e-05, + "loss": 0.269, + "step": 13881 + }, + { + "epoch": 5.645384302562017, + "grad_norm": 0.034819591652590624, + "learning_rate": 1.2585825267226315e-05, + "loss": 0.0005, + "step": 13882 + }, + { + "epoch": 5.645790971939813, + "grad_norm": 0.6573535551304824, + "learning_rate": 1.2584846525126476e-05, + "loss": 0.013, + "step": 13883 + }, + { + "epoch": 5.646197641317609, + "grad_norm": 0.023804364384896158, + "learning_rate": 1.2583867756491886e-05, + "loss": 0.0004, + "step": 13884 + }, + { + "epoch": 5.6466043106954045, + "grad_norm": 7.917916395130027, + "learning_rate": 1.2582888961332596e-05, + "loss": 0.4115, + "step": 13885 + }, + { + "epoch": 5.6470109800732, + "grad_norm": 2.1150264567700123, + "learning_rate": 1.2581910139658646e-05, + "loss": 0.0266, + "step": 13886 + }, + { + "epoch": 5.647417649450996, + "grad_norm": 0.9549149481638969, + "learning_rate": 1.2580931291480093e-05, + "loss": 0.0171, + "step": 13887 + }, + { + "epoch": 5.647824318828793, + "grad_norm": 3.7046186402469834, + "learning_rate": 1.257995241680698e-05, + "loss": 0.1345, + "step": 13888 + }, + { + "epoch": 5.648230988206588, + "grad_norm": 2.148353678181667, + "learning_rate": 1.2578973515649357e-05, + "loss": 0.0278, + "step": 13889 + }, + { + "epoch": 5.648637657584384, + "grad_norm": 11.867839199527015, + "learning_rate": 1.2577994588017273e-05, + "loss": 0.6636, + "step": 13890 + }, + { + "epoch": 5.64904432696218, + "grad_norm": 0.5286509954120057, + "learning_rate": 1.257701563392078e-05, + "loss": 0.0107, + "step": 13891 + }, + { + "epoch": 5.649450996339976, + "grad_norm": 4.245363222063965, + "learning_rate": 1.257603665336992e-05, + "loss": 0.0905, + "step": 13892 + }, + { + "epoch": 5.6498576657177715, + "grad_norm": 0.499560091097631, + "learning_rate": 1.2575057646374749e-05, + "loss": 0.0088, + "step": 13893 + }, + { + "epoch": 5.650264335095567, + "grad_norm": 7.1137213322074055, + "learning_rate": 1.2574078612945315e-05, + "loss": 0.1733, + "step": 13894 + }, + { + "epoch": 5.650671004473363, + "grad_norm": 5.125225702191817, + "learning_rate": 1.257309955309167e-05, + "loss": 0.2025, + "step": 13895 + }, + { + "epoch": 5.651077673851159, + "grad_norm": 5.149097922205062, + "learning_rate": 1.2572120466823861e-05, + "loss": 0.1039, + "step": 13896 + }, + { + "epoch": 5.6514843432289545, + "grad_norm": 4.03605446322948, + "learning_rate": 1.2571141354151944e-05, + "loss": 0.1363, + "step": 13897 + }, + { + "epoch": 5.65189101260675, + "grad_norm": 1.2344790664547463, + "learning_rate": 1.2570162215085966e-05, + "loss": 0.023, + "step": 13898 + }, + { + "epoch": 5.652297681984546, + "grad_norm": 6.628439983380538, + "learning_rate": 1.2569183049635975e-05, + "loss": 0.1657, + "step": 13899 + }, + { + "epoch": 5.652704351362343, + "grad_norm": 
6.247689880416767, + "learning_rate": 1.2568203857812032e-05, + "loss": 0.2091, + "step": 13900 + }, + { + "epoch": 5.6531110207401385, + "grad_norm": 9.308316412935136, + "learning_rate": 1.2567224639624182e-05, + "loss": 0.3261, + "step": 13901 + }, + { + "epoch": 5.653517690117934, + "grad_norm": 1.6085061031006904, + "learning_rate": 1.2566245395082477e-05, + "loss": 0.0342, + "step": 13902 + }, + { + "epoch": 5.65392435949573, + "grad_norm": 7.583715911143186, + "learning_rate": 1.2565266124196973e-05, + "loss": 0.2232, + "step": 13903 + }, + { + "epoch": 5.654331028873526, + "grad_norm": 4.311235637390805, + "learning_rate": 1.2564286826977724e-05, + "loss": 0.1103, + "step": 13904 + }, + { + "epoch": 5.6547376982513216, + "grad_norm": 0.11361781320382533, + "learning_rate": 1.2563307503434776e-05, + "loss": 0.0021, + "step": 13905 + }, + { + "epoch": 5.655144367629117, + "grad_norm": 2.1968181006014174, + "learning_rate": 1.256232815357819e-05, + "loss": 0.0823, + "step": 13906 + }, + { + "epoch": 5.655551037006913, + "grad_norm": 11.655996580347724, + "learning_rate": 1.2561348777418012e-05, + "loss": 0.6503, + "step": 13907 + }, + { + "epoch": 5.655957706384709, + "grad_norm": 5.891819394164373, + "learning_rate": 1.25603693749643e-05, + "loss": 0.1999, + "step": 13908 + }, + { + "epoch": 5.6563643757625055, + "grad_norm": 1.324032636758705, + "learning_rate": 1.2559389946227113e-05, + "loss": 0.0357, + "step": 13909 + }, + { + "epoch": 5.656771045140301, + "grad_norm": 4.191698088622565, + "learning_rate": 1.2558410491216496e-05, + "loss": 0.1365, + "step": 13910 + }, + { + "epoch": 5.657177714518097, + "grad_norm": 8.785498717991334, + "learning_rate": 1.2557431009942508e-05, + "loss": 0.1118, + "step": 13911 + }, + { + "epoch": 5.657584383895893, + "grad_norm": 9.25841113400414, + "learning_rate": 1.25564515024152e-05, + "loss": 0.501, + "step": 13912 + }, + { + "epoch": 5.657991053273689, + "grad_norm": 2.5872092053105624, + "learning_rate": 1.2555471968644636e-05, + "loss": 0.0513, + "step": 13913 + }, + { + "epoch": 5.658397722651484, + "grad_norm": 6.259609802347918, + "learning_rate": 1.2554492408640865e-05, + "loss": 0.1635, + "step": 13914 + }, + { + "epoch": 5.65880439202928, + "grad_norm": 3.5773802862943507, + "learning_rate": 1.2553512822413945e-05, + "loss": 0.1842, + "step": 13915 + }, + { + "epoch": 5.659211061407076, + "grad_norm": 7.471540747664343, + "learning_rate": 1.2552533209973929e-05, + "loss": 0.091, + "step": 13916 + }, + { + "epoch": 5.659617730784872, + "grad_norm": 0.06560315944399975, + "learning_rate": 1.2551553571330875e-05, + "loss": 0.0006, + "step": 13917 + }, + { + "epoch": 5.660024400162667, + "grad_norm": 10.626425300569794, + "learning_rate": 1.2550573906494839e-05, + "loss": 0.089, + "step": 13918 + }, + { + "epoch": 5.660431069540463, + "grad_norm": 1.4389479534298935, + "learning_rate": 1.2549594215475877e-05, + "loss": 0.0189, + "step": 13919 + }, + { + "epoch": 5.66083773891826, + "grad_norm": 21.588652630893904, + "learning_rate": 1.254861449828405e-05, + "loss": 0.3946, + "step": 13920 + }, + { + "epoch": 5.661244408296056, + "grad_norm": 7.28017345522276, + "learning_rate": 1.254763475492941e-05, + "loss": 0.2103, + "step": 13921 + }, + { + "epoch": 5.661651077673851, + "grad_norm": 0.7821296329797961, + "learning_rate": 1.2546654985422019e-05, + "loss": 0.0109, + "step": 13922 + }, + { + "epoch": 5.662057747051647, + "grad_norm": 0.6580706945204213, + "learning_rate": 1.2545675189771934e-05, + "loss": 0.0095, + "step": 13923 + }, + 
{ + "epoch": 5.662464416429443, + "grad_norm": 1.6405103045204905, + "learning_rate": 1.254469536798921e-05, + "loss": 0.0215, + "step": 13924 + }, + { + "epoch": 5.662871085807239, + "grad_norm": 4.790787563672182, + "learning_rate": 1.2543715520083906e-05, + "loss": 0.1664, + "step": 13925 + }, + { + "epoch": 5.663277755185034, + "grad_norm": 1.755461694751617, + "learning_rate": 1.2542735646066085e-05, + "loss": 0.0249, + "step": 13926 + }, + { + "epoch": 5.66368442456283, + "grad_norm": 0.11494074038145019, + "learning_rate": 1.25417557459458e-05, + "loss": 0.0017, + "step": 13927 + }, + { + "epoch": 5.664091093940626, + "grad_norm": 4.3368171707055065, + "learning_rate": 1.2540775819733113e-05, + "loss": 0.2235, + "step": 13928 + }, + { + "epoch": 5.664497763318423, + "grad_norm": 5.307264148337743, + "learning_rate": 1.2539795867438086e-05, + "loss": 0.0737, + "step": 13929 + }, + { + "epoch": 5.664904432696218, + "grad_norm": 0.1439829170654508, + "learning_rate": 1.2538815889070774e-05, + "loss": 0.0029, + "step": 13930 + }, + { + "epoch": 5.665311102074014, + "grad_norm": 3.388223491982984, + "learning_rate": 1.253783588464124e-05, + "loss": 0.03, + "step": 13931 + }, + { + "epoch": 5.66571777145181, + "grad_norm": 5.986063647223015, + "learning_rate": 1.2536855854159544e-05, + "loss": 0.0945, + "step": 13932 + }, + { + "epoch": 5.666124440829606, + "grad_norm": 0.15392754627461736, + "learning_rate": 1.2535875797635746e-05, + "loss": 0.0021, + "step": 13933 + }, + { + "epoch": 5.666531110207401, + "grad_norm": 9.971918998882401, + "learning_rate": 1.2534895715079904e-05, + "loss": 0.403, + "step": 13934 + }, + { + "epoch": 5.666937779585197, + "grad_norm": 0.3148431176738588, + "learning_rate": 1.2533915606502085e-05, + "loss": 0.005, + "step": 13935 + }, + { + "epoch": 5.667344448962993, + "grad_norm": 0.20533804121584565, + "learning_rate": 1.2532935471912347e-05, + "loss": 0.0025, + "step": 13936 + }, + { + "epoch": 5.667751118340789, + "grad_norm": 0.7972271195393515, + "learning_rate": 1.253195531132075e-05, + "loss": 0.0089, + "step": 13937 + }, + { + "epoch": 5.6681577877185845, + "grad_norm": 9.2271947797879, + "learning_rate": 1.2530975124737357e-05, + "loss": 0.257, + "step": 13938 + }, + { + "epoch": 5.66856445709638, + "grad_norm": 12.898696314562004, + "learning_rate": 1.2529994912172233e-05, + "loss": 0.2287, + "step": 13939 + }, + { + "epoch": 5.668971126474176, + "grad_norm": 3.166496147434491, + "learning_rate": 1.2529014673635436e-05, + "loss": 0.0476, + "step": 13940 + }, + { + "epoch": 5.669377795851973, + "grad_norm": 6.676622258810979, + "learning_rate": 1.252803440913703e-05, + "loss": 0.2111, + "step": 13941 + }, + { + "epoch": 5.669784465229768, + "grad_norm": 1.4954151154400832, + "learning_rate": 1.2527054118687078e-05, + "loss": 0.0187, + "step": 13942 + }, + { + "epoch": 5.670191134607564, + "grad_norm": 2.734418132007538, + "learning_rate": 1.2526073802295647e-05, + "loss": 0.0418, + "step": 13943 + }, + { + "epoch": 5.67059780398536, + "grad_norm": 2.808095668323908, + "learning_rate": 1.2525093459972793e-05, + "loss": 0.0544, + "step": 13944 + }, + { + "epoch": 5.671004473363156, + "grad_norm": 0.03835886451988166, + "learning_rate": 1.2524113091728584e-05, + "loss": 0.0006, + "step": 13945 + }, + { + "epoch": 5.6714111427409515, + "grad_norm": 14.340033471482077, + "learning_rate": 1.2523132697573085e-05, + "loss": 1.2196, + "step": 13946 + }, + { + "epoch": 5.671817812118747, + "grad_norm": 12.38193419753186, + "learning_rate": 
1.2522152277516361e-05, + "loss": 0.3694, + "step": 13947 + }, + { + "epoch": 5.672224481496543, + "grad_norm": 5.2486434079926605, + "learning_rate": 1.2521171831568476e-05, + "loss": 0.1285, + "step": 13948 + }, + { + "epoch": 5.672631150874339, + "grad_norm": 33.399780226815295, + "learning_rate": 1.2520191359739488e-05, + "loss": 0.3422, + "step": 13949 + }, + { + "epoch": 5.673037820252135, + "grad_norm": 3.7947202359953307, + "learning_rate": 1.251921086203947e-05, + "loss": 0.062, + "step": 13950 + }, + { + "epoch": 5.673444489629931, + "grad_norm": 4.291357194029903, + "learning_rate": 1.2518230338478486e-05, + "loss": 0.0783, + "step": 13951 + }, + { + "epoch": 5.673851159007727, + "grad_norm": 1.2811442942412428, + "learning_rate": 1.2517249789066597e-05, + "loss": 0.0177, + "step": 13952 + }, + { + "epoch": 5.674257828385523, + "grad_norm": 6.67725520912678, + "learning_rate": 1.2516269213813872e-05, + "loss": 0.2657, + "step": 13953 + }, + { + "epoch": 5.6746644977633185, + "grad_norm": 4.871385617410229, + "learning_rate": 1.2515288612730379e-05, + "loss": 0.1812, + "step": 13954 + }, + { + "epoch": 5.675071167141114, + "grad_norm": 2.0124657268528865, + "learning_rate": 1.2514307985826181e-05, + "loss": 0.0304, + "step": 13955 + }, + { + "epoch": 5.67547783651891, + "grad_norm": 4.3951699589059645, + "learning_rate": 1.251332733311135e-05, + "loss": 0.1196, + "step": 13956 + }, + { + "epoch": 5.675884505896706, + "grad_norm": 7.2889910271003515, + "learning_rate": 1.2512346654595943e-05, + "loss": 0.1781, + "step": 13957 + }, + { + "epoch": 5.6762911752745016, + "grad_norm": 3.89980923088476, + "learning_rate": 1.2511365950290037e-05, + "loss": 0.0959, + "step": 13958 + }, + { + "epoch": 5.676697844652297, + "grad_norm": 1.8496353750304786, + "learning_rate": 1.2510385220203696e-05, + "loss": 0.0353, + "step": 13959 + }, + { + "epoch": 5.677104514030093, + "grad_norm": 11.418355741249387, + "learning_rate": 1.2509404464346984e-05, + "loss": 0.5446, + "step": 13960 + }, + { + "epoch": 5.67751118340789, + "grad_norm": 8.167790475094302, + "learning_rate": 1.2508423682729975e-05, + "loss": 0.1797, + "step": 13961 + }, + { + "epoch": 5.6779178527856855, + "grad_norm": 1.110844951712214, + "learning_rate": 1.2507442875362735e-05, + "loss": 0.0186, + "step": 13962 + }, + { + "epoch": 5.678324522163481, + "grad_norm": 7.239805417555227, + "learning_rate": 1.2506462042255327e-05, + "loss": 0.1375, + "step": 13963 + }, + { + "epoch": 5.678731191541277, + "grad_norm": 12.557050121238369, + "learning_rate": 1.250548118341783e-05, + "loss": 0.4968, + "step": 13964 + }, + { + "epoch": 5.679137860919073, + "grad_norm": 3.033233864951483, + "learning_rate": 1.2504500298860305e-05, + "loss": 0.1452, + "step": 13965 + }, + { + "epoch": 5.679544530296869, + "grad_norm": 1.2441037665838632, + "learning_rate": 1.2503519388592824e-05, + "loss": 0.0203, + "step": 13966 + }, + { + "epoch": 5.679951199674664, + "grad_norm": 0.2487368617387128, + "learning_rate": 1.2502538452625457e-05, + "loss": 0.0025, + "step": 13967 + }, + { + "epoch": 5.68035786905246, + "grad_norm": 10.987713235981898, + "learning_rate": 1.2501557490968275e-05, + "loss": 0.1855, + "step": 13968 + }, + { + "epoch": 5.680764538430256, + "grad_norm": 4.2997300819268265, + "learning_rate": 1.2500576503631345e-05, + "loss": 0.1396, + "step": 13969 + }, + { + "epoch": 5.6811712078080525, + "grad_norm": 10.656637638071242, + "learning_rate": 1.2499595490624739e-05, + "loss": 0.4764, + "step": 13970 + }, + { + "epoch": 
5.681577877185848, + "grad_norm": 7.486212855789464, + "learning_rate": 1.2498614451958526e-05, + "loss": 0.1749, + "step": 13971 + }, + { + "epoch": 5.681984546563644, + "grad_norm": 6.206527483771278, + "learning_rate": 1.249763338764278e-05, + "loss": 0.2361, + "step": 13972 + }, + { + "epoch": 5.68239121594144, + "grad_norm": 10.79226197186942, + "learning_rate": 1.2496652297687568e-05, + "loss": 0.4135, + "step": 13973 + }, + { + "epoch": 5.682797885319236, + "grad_norm": 0.8848300312967675, + "learning_rate": 1.2495671182102965e-05, + "loss": 0.0159, + "step": 13974 + }, + { + "epoch": 5.683204554697031, + "grad_norm": 5.763136610211131, + "learning_rate": 1.2494690040899042e-05, + "loss": 0.0765, + "step": 13975 + }, + { + "epoch": 5.683611224074827, + "grad_norm": 3.716010974423784, + "learning_rate": 1.2493708874085867e-05, + "loss": 0.075, + "step": 13976 + }, + { + "epoch": 5.684017893452623, + "grad_norm": 7.309017487683809, + "learning_rate": 1.249272768167352e-05, + "loss": 0.3896, + "step": 13977 + }, + { + "epoch": 5.684424562830419, + "grad_norm": 0.8447544857120498, + "learning_rate": 1.2491746463672065e-05, + "loss": 0.0138, + "step": 13978 + }, + { + "epoch": 5.684831232208214, + "grad_norm": 10.130025036492928, + "learning_rate": 1.2490765220091579e-05, + "loss": 0.3147, + "step": 13979 + }, + { + "epoch": 5.68523790158601, + "grad_norm": 5.167369823321975, + "learning_rate": 1.2489783950942135e-05, + "loss": 0.097, + "step": 13980 + }, + { + "epoch": 5.685644570963806, + "grad_norm": 2.0787660807516724, + "learning_rate": 1.2488802656233807e-05, + "loss": 0.0401, + "step": 13981 + }, + { + "epoch": 5.686051240341603, + "grad_norm": 6.736435877912742, + "learning_rate": 1.2487821335976667e-05, + "loss": 0.3318, + "step": 13982 + }, + { + "epoch": 5.686457909719398, + "grad_norm": 0.05865355999296871, + "learning_rate": 1.2486839990180787e-05, + "loss": 0.0013, + "step": 13983 + }, + { + "epoch": 5.686864579097194, + "grad_norm": 9.795995976572156, + "learning_rate": 1.2485858618856246e-05, + "loss": 0.2548, + "step": 13984 + }, + { + "epoch": 5.68727124847499, + "grad_norm": 1.1216567859577602, + "learning_rate": 1.2484877222013112e-05, + "loss": 0.0194, + "step": 13985 + }, + { + "epoch": 5.687677917852786, + "grad_norm": 1.27594754358895, + "learning_rate": 1.2483895799661463e-05, + "loss": 0.0212, + "step": 13986 + }, + { + "epoch": 5.688084587230581, + "grad_norm": 9.955053760247278, + "learning_rate": 1.2482914351811374e-05, + "loss": 0.3408, + "step": 13987 + }, + { + "epoch": 5.688491256608377, + "grad_norm": 9.162984488473581, + "learning_rate": 1.248193287847292e-05, + "loss": 0.2162, + "step": 13988 + }, + { + "epoch": 5.688897925986173, + "grad_norm": 0.7970146043381398, + "learning_rate": 1.2480951379656175e-05, + "loss": 0.0196, + "step": 13989 + }, + { + "epoch": 5.689304595363969, + "grad_norm": 14.60216471939013, + "learning_rate": 1.2479969855371217e-05, + "loss": 0.6628, + "step": 13990 + }, + { + "epoch": 5.689711264741765, + "grad_norm": 0.7622398764244898, + "learning_rate": 1.2478988305628121e-05, + "loss": 0.0128, + "step": 13991 + }, + { + "epoch": 5.690117934119561, + "grad_norm": 8.462006324181448, + "learning_rate": 1.2478006730436957e-05, + "loss": 0.1949, + "step": 13992 + }, + { + "epoch": 5.690524603497357, + "grad_norm": 4.4588452424822025, + "learning_rate": 1.2477025129807814e-05, + "loss": 0.0514, + "step": 13993 + }, + { + "epoch": 5.690931272875153, + "grad_norm": 11.91841847233269, + "learning_rate": 1.2476043503750756e-05, + 
"loss": 0.2535, + "step": 13994 + }, + { + "epoch": 5.691337942252948, + "grad_norm": 10.492938897001153, + "learning_rate": 1.247506185227587e-05, + "loss": 0.2156, + "step": 13995 + }, + { + "epoch": 5.691744611630744, + "grad_norm": 4.816452969083181, + "learning_rate": 1.2474080175393227e-05, + "loss": 0.1354, + "step": 13996 + }, + { + "epoch": 5.69215128100854, + "grad_norm": 3.6444496761698444, + "learning_rate": 1.2473098473112903e-05, + "loss": 0.0766, + "step": 13997 + }, + { + "epoch": 5.692557950386336, + "grad_norm": 1.0071567061777473, + "learning_rate": 1.247211674544498e-05, + "loss": 0.0113, + "step": 13998 + }, + { + "epoch": 5.6929646197641315, + "grad_norm": 11.317876043194776, + "learning_rate": 1.2471134992399533e-05, + "loss": 0.5747, + "step": 13999 + }, + { + "epoch": 5.693371289141927, + "grad_norm": 1.8014586616426993, + "learning_rate": 1.2470153213986644e-05, + "loss": 0.0179, + "step": 14000 + }, + { + "epoch": 5.693777958519723, + "grad_norm": 4.851043915197987, + "learning_rate": 1.2469171410216388e-05, + "loss": 0.0998, + "step": 14001 + }, + { + "epoch": 5.69418462789752, + "grad_norm": 8.33018265621701, + "learning_rate": 1.2468189581098844e-05, + "loss": 0.2108, + "step": 14002 + }, + { + "epoch": 5.694591297275315, + "grad_norm": 5.734175342321171, + "learning_rate": 1.2467207726644094e-05, + "loss": 0.1074, + "step": 14003 + }, + { + "epoch": 5.694997966653111, + "grad_norm": 5.849163602758421, + "learning_rate": 1.2466225846862214e-05, + "loss": 0.152, + "step": 14004 + }, + { + "epoch": 5.695404636030907, + "grad_norm": 0.23142414243606957, + "learning_rate": 1.2465243941763282e-05, + "loss": 0.0031, + "step": 14005 + }, + { + "epoch": 5.695811305408703, + "grad_norm": 4.222909893063352, + "learning_rate": 1.2464262011357385e-05, + "loss": 0.1006, + "step": 14006 + }, + { + "epoch": 5.6962179747864985, + "grad_norm": 1.1770964857262365, + "learning_rate": 1.2463280055654595e-05, + "loss": 0.0186, + "step": 14007 + }, + { + "epoch": 5.696624644164294, + "grad_norm": 5.282452702508137, + "learning_rate": 1.2462298074664994e-05, + "loss": 0.2503, + "step": 14008 + }, + { + "epoch": 5.69703131354209, + "grad_norm": 5.512849426818364, + "learning_rate": 1.2461316068398665e-05, + "loss": 0.1669, + "step": 14009 + }, + { + "epoch": 5.697437982919886, + "grad_norm": 2.384546516168337, + "learning_rate": 1.246033403686569e-05, + "loss": 0.0314, + "step": 14010 + }, + { + "epoch": 5.697844652297682, + "grad_norm": 10.048462740795696, + "learning_rate": 1.2459351980076147e-05, + "loss": 0.2144, + "step": 14011 + }, + { + "epoch": 5.698251321675478, + "grad_norm": 8.488467467076212, + "learning_rate": 1.2458369898040117e-05, + "loss": 0.4044, + "step": 14012 + }, + { + "epoch": 5.698657991053274, + "grad_norm": 7.652521264264934, + "learning_rate": 1.2457387790767686e-05, + "loss": 0.1742, + "step": 14013 + }, + { + "epoch": 5.69906466043107, + "grad_norm": 10.650450542441607, + "learning_rate": 1.2456405658268928e-05, + "loss": 0.4168, + "step": 14014 + }, + { + "epoch": 5.6994713298088655, + "grad_norm": 6.287634320354751, + "learning_rate": 1.2455423500553931e-05, + "loss": 0.1495, + "step": 14015 + }, + { + "epoch": 5.699877999186661, + "grad_norm": 14.899437041676546, + "learning_rate": 1.2454441317632777e-05, + "loss": 0.518, + "step": 14016 + }, + { + "epoch": 5.700284668564457, + "grad_norm": 11.20605645140258, + "learning_rate": 1.245345910951555e-05, + "loss": 0.2429, + "step": 14017 + }, + { + "epoch": 5.700691337942253, + "grad_norm": 
0.2656367033648135, + "learning_rate": 1.2452476876212325e-05, + "loss": 0.005, + "step": 14018 + }, + { + "epoch": 5.701098007320049, + "grad_norm": 0.032477359136120515, + "learning_rate": 1.2451494617733193e-05, + "loss": 0.0004, + "step": 14019 + }, + { + "epoch": 5.701504676697844, + "grad_norm": 1.9250786289089725, + "learning_rate": 1.2450512334088236e-05, + "loss": 0.0137, + "step": 14020 + }, + { + "epoch": 5.70191134607564, + "grad_norm": 0.34385928809478306, + "learning_rate": 1.2449530025287535e-05, + "loss": 0.0062, + "step": 14021 + }, + { + "epoch": 5.702318015453436, + "grad_norm": 0.060619335315664304, + "learning_rate": 1.2448547691341174e-05, + "loss": 0.0013, + "step": 14022 + }, + { + "epoch": 5.7027246848312325, + "grad_norm": 2.5946127605822467, + "learning_rate": 1.2447565332259243e-05, + "loss": 0.0483, + "step": 14023 + }, + { + "epoch": 5.703131354209028, + "grad_norm": 2.6737867037811176, + "learning_rate": 1.2446582948051819e-05, + "loss": 0.0389, + "step": 14024 + }, + { + "epoch": 5.703538023586824, + "grad_norm": 11.389429777006125, + "learning_rate": 1.244560053872899e-05, + "loss": 0.2742, + "step": 14025 + }, + { + "epoch": 5.70394469296462, + "grad_norm": 2.960758232424521, + "learning_rate": 1.2444618104300842e-05, + "loss": 0.0332, + "step": 14026 + }, + { + "epoch": 5.704351362342416, + "grad_norm": 13.969675273461108, + "learning_rate": 1.244363564477746e-05, + "loss": 0.6103, + "step": 14027 + }, + { + "epoch": 5.704758031720211, + "grad_norm": 13.545519347926644, + "learning_rate": 1.2442653160168924e-05, + "loss": 0.6431, + "step": 14028 + }, + { + "epoch": 5.705164701098007, + "grad_norm": 0.39070369731801324, + "learning_rate": 1.2441670650485326e-05, + "loss": 0.0043, + "step": 14029 + }, + { + "epoch": 5.705571370475803, + "grad_norm": 13.997461986045357, + "learning_rate": 1.2440688115736752e-05, + "loss": 0.7923, + "step": 14030 + }, + { + "epoch": 5.705978039853599, + "grad_norm": 3.7990078893359707, + "learning_rate": 1.243970555593328e-05, + "loss": 0.0524, + "step": 14031 + }, + { + "epoch": 5.706384709231395, + "grad_norm": 4.735826684324069, + "learning_rate": 1.2438722971085009e-05, + "loss": 0.1867, + "step": 14032 + }, + { + "epoch": 5.706791378609191, + "grad_norm": 0.15697980601577013, + "learning_rate": 1.2437740361202017e-05, + "loss": 0.0018, + "step": 14033 + }, + { + "epoch": 5.707198047986987, + "grad_norm": 3.747357142467032, + "learning_rate": 1.2436757726294392e-05, + "loss": 0.2214, + "step": 14034 + }, + { + "epoch": 5.707604717364783, + "grad_norm": 6.567487416728355, + "learning_rate": 1.2435775066372226e-05, + "loss": 0.188, + "step": 14035 + }, + { + "epoch": 5.708011386742578, + "grad_norm": 3.982888593120786, + "learning_rate": 1.2434792381445602e-05, + "loss": 0.0793, + "step": 14036 + }, + { + "epoch": 5.708418056120374, + "grad_norm": 0.29143172084863894, + "learning_rate": 1.2433809671524606e-05, + "loss": 0.0036, + "step": 14037 + }, + { + "epoch": 5.70882472549817, + "grad_norm": 2.8722398975531305, + "learning_rate": 1.2432826936619331e-05, + "loss": 0.0447, + "step": 14038 + }, + { + "epoch": 5.709231394875966, + "grad_norm": 5.130125694055085, + "learning_rate": 1.2431844176739862e-05, + "loss": 0.1507, + "step": 14039 + }, + { + "epoch": 5.709638064253761, + "grad_norm": 6.096726741552669, + "learning_rate": 1.2430861391896291e-05, + "loss": 0.1659, + "step": 14040 + }, + { + "epoch": 5.710044733631557, + "grad_norm": 0.6404564340595708, + "learning_rate": 1.2429878582098703e-05, + "loss": 0.0096, + 
"step": 14041 + }, + { + "epoch": 5.710451403009353, + "grad_norm": 5.457752940372776, + "learning_rate": 1.2428895747357187e-05, + "loss": 0.3143, + "step": 14042 + }, + { + "epoch": 5.71085807238715, + "grad_norm": 1.5403549092446875, + "learning_rate": 1.2427912887681836e-05, + "loss": 0.0164, + "step": 14043 + }, + { + "epoch": 5.711264741764945, + "grad_norm": 5.013407844409262, + "learning_rate": 1.2426930003082734e-05, + "loss": 0.2, + "step": 14044 + }, + { + "epoch": 5.711671411142741, + "grad_norm": 4.400061767060806, + "learning_rate": 1.2425947093569977e-05, + "loss": 0.0767, + "step": 14045 + }, + { + "epoch": 5.712078080520537, + "grad_norm": 4.4183561587783595, + "learning_rate": 1.2424964159153652e-05, + "loss": 0.0831, + "step": 14046 + }, + { + "epoch": 5.712484749898333, + "grad_norm": 0.6132132995447062, + "learning_rate": 1.2423981199843849e-05, + "loss": 0.0102, + "step": 14047 + }, + { + "epoch": 5.712891419276128, + "grad_norm": 7.982394112867012, + "learning_rate": 1.2422998215650659e-05, + "loss": 0.3715, + "step": 14048 + }, + { + "epoch": 5.713298088653924, + "grad_norm": 3.316549970323162, + "learning_rate": 1.2422015206584175e-05, + "loss": 0.0465, + "step": 14049 + }, + { + "epoch": 5.71370475803172, + "grad_norm": 2.323658313971607, + "learning_rate": 1.242103217265448e-05, + "loss": 0.0473, + "step": 14050 + }, + { + "epoch": 5.714111427409516, + "grad_norm": 7.403129103991259, + "learning_rate": 1.2420049113871678e-05, + "loss": 0.3715, + "step": 14051 + }, + { + "epoch": 5.714518096787312, + "grad_norm": 1.511381745816875, + "learning_rate": 1.241906603024585e-05, + "loss": 0.0191, + "step": 14052 + }, + { + "epoch": 5.714924766165108, + "grad_norm": 4.234913659506723, + "learning_rate": 1.2418082921787091e-05, + "loss": 0.1093, + "step": 14053 + }, + { + "epoch": 5.715331435542904, + "grad_norm": 0.49298886624938626, + "learning_rate": 1.2417099788505495e-05, + "loss": 0.0078, + "step": 14054 + }, + { + "epoch": 5.7157381049207, + "grad_norm": 7.762451545006254, + "learning_rate": 1.2416116630411154e-05, + "loss": 0.1915, + "step": 14055 + }, + { + "epoch": 5.716144774298495, + "grad_norm": 3.4637580113107935, + "learning_rate": 1.2415133447514158e-05, + "loss": 0.0667, + "step": 14056 + }, + { + "epoch": 5.716551443676291, + "grad_norm": 5.495487607533483, + "learning_rate": 1.2414150239824603e-05, + "loss": 0.2067, + "step": 14057 + }, + { + "epoch": 5.716958113054087, + "grad_norm": 13.850630478181156, + "learning_rate": 1.2413167007352583e-05, + "loss": 0.9125, + "step": 14058 + }, + { + "epoch": 5.717364782431883, + "grad_norm": 3.5665099476480324, + "learning_rate": 1.2412183750108185e-05, + "loss": 0.1124, + "step": 14059 + }, + { + "epoch": 5.7177714518096785, + "grad_norm": 7.117965854057599, + "learning_rate": 1.2411200468101507e-05, + "loss": 0.3431, + "step": 14060 + }, + { + "epoch": 5.718178121187474, + "grad_norm": 8.715761588884522, + "learning_rate": 1.2410217161342646e-05, + "loss": 0.3375, + "step": 14061 + }, + { + "epoch": 5.71858479056527, + "grad_norm": 0.19537517598087903, + "learning_rate": 1.2409233829841692e-05, + "loss": 0.0032, + "step": 14062 + }, + { + "epoch": 5.718991459943066, + "grad_norm": 0.30919584847982073, + "learning_rate": 1.2408250473608736e-05, + "loss": 0.0039, + "step": 14063 + }, + { + "epoch": 5.7193981293208624, + "grad_norm": 0.9777322134671967, + "learning_rate": 1.240726709265388e-05, + "loss": 0.015, + "step": 14064 + }, + { + "epoch": 5.719804798698658, + "grad_norm": 2.3752660436043835, + 
"learning_rate": 1.2406283686987217e-05, + "loss": 0.0384, + "step": 14065 + }, + { + "epoch": 5.720211468076454, + "grad_norm": 0.657207315541828, + "learning_rate": 1.2405300256618838e-05, + "loss": 0.0099, + "step": 14066 + }, + { + "epoch": 5.72061813745425, + "grad_norm": 2.2079118495727816, + "learning_rate": 1.2404316801558845e-05, + "loss": 0.0392, + "step": 14067 + }, + { + "epoch": 5.7210248068320455, + "grad_norm": 7.293052736196161, + "learning_rate": 1.2403333321817328e-05, + "loss": 0.2175, + "step": 14068 + }, + { + "epoch": 5.721431476209841, + "grad_norm": 8.31576212220171, + "learning_rate": 1.2402349817404382e-05, + "loss": 0.1883, + "step": 14069 + }, + { + "epoch": 5.721838145587637, + "grad_norm": 3.4753622946823084, + "learning_rate": 1.240136628833011e-05, + "loss": 0.0684, + "step": 14070 + }, + { + "epoch": 5.722244814965433, + "grad_norm": 4.7667775057653365, + "learning_rate": 1.2400382734604604e-05, + "loss": 0.0969, + "step": 14071 + }, + { + "epoch": 5.722651484343229, + "grad_norm": 1.9789332852068033, + "learning_rate": 1.239939915623796e-05, + "loss": 0.0313, + "step": 14072 + }, + { + "epoch": 5.723058153721025, + "grad_norm": 5.955727794265436, + "learning_rate": 1.2398415553240275e-05, + "loss": 0.1342, + "step": 14073 + }, + { + "epoch": 5.723464823098821, + "grad_norm": 3.4053791925140855, + "learning_rate": 1.239743192562165e-05, + "loss": 0.0745, + "step": 14074 + }, + { + "epoch": 5.723871492476617, + "grad_norm": 7.731719573202364, + "learning_rate": 1.239644827339218e-05, + "loss": 0.5222, + "step": 14075 + }, + { + "epoch": 5.7242781618544125, + "grad_norm": 4.730646563408185, + "learning_rate": 1.2395464596561959e-05, + "loss": 0.3971, + "step": 14076 + }, + { + "epoch": 5.724684831232208, + "grad_norm": 5.536406219351407, + "learning_rate": 1.239448089514109e-05, + "loss": 0.1638, + "step": 14077 + }, + { + "epoch": 5.725091500610004, + "grad_norm": 4.493821553843709, + "learning_rate": 1.239349716913967e-05, + "loss": 0.1155, + "step": 14078 + }, + { + "epoch": 5.7254981699878, + "grad_norm": 4.665526888022408, + "learning_rate": 1.2392513418567795e-05, + "loss": 0.1474, + "step": 14079 + }, + { + "epoch": 5.725904839365596, + "grad_norm": 0.48297276441493175, + "learning_rate": 1.2391529643435569e-05, + "loss": 0.0083, + "step": 14080 + }, + { + "epoch": 5.726311508743391, + "grad_norm": 9.624601094712123, + "learning_rate": 1.2390545843753086e-05, + "loss": 0.2964, + "step": 14081 + }, + { + "epoch": 5.726718178121187, + "grad_norm": 3.960568553108238, + "learning_rate": 1.2389562019530445e-05, + "loss": 0.0711, + "step": 14082 + }, + { + "epoch": 5.727124847498983, + "grad_norm": 4.337820123641975, + "learning_rate": 1.238857817077775e-05, + "loss": 0.262, + "step": 14083 + }, + { + "epoch": 5.7275315168767795, + "grad_norm": 13.837451614457196, + "learning_rate": 1.2387594297505096e-05, + "loss": 0.5336, + "step": 14084 + }, + { + "epoch": 5.727938186254575, + "grad_norm": 5.871082665786064, + "learning_rate": 1.2386610399722588e-05, + "loss": 0.1361, + "step": 14085 + }, + { + "epoch": 5.728344855632371, + "grad_norm": 9.89452086960918, + "learning_rate": 1.2385626477440317e-05, + "loss": 0.256, + "step": 14086 + }, + { + "epoch": 5.728751525010167, + "grad_norm": 7.568899523871897, + "learning_rate": 1.2384642530668394e-05, + "loss": 0.2288, + "step": 14087 + }, + { + "epoch": 5.729158194387963, + "grad_norm": 0.0744894244381167, + "learning_rate": 1.2383658559416916e-05, + "loss": 0.0013, + "step": 14088 + }, + { + "epoch": 
5.729564863765758, + "grad_norm": 6.190982485451633, + "learning_rate": 1.238267456369598e-05, + "loss": 0.2373, + "step": 14089 + }, + { + "epoch": 5.729971533143554, + "grad_norm": 3.8687767280603227, + "learning_rate": 1.2381690543515692e-05, + "loss": 0.0978, + "step": 14090 + }, + { + "epoch": 5.73037820252135, + "grad_norm": 2.77047738476053, + "learning_rate": 1.2380706498886153e-05, + "loss": 0.0371, + "step": 14091 + }, + { + "epoch": 5.730784871899146, + "grad_norm": 6.190672395187521, + "learning_rate": 1.237972242981746e-05, + "loss": 0.0832, + "step": 14092 + }, + { + "epoch": 5.731191541276942, + "grad_norm": 2.7753565783747893, + "learning_rate": 1.2378738336319722e-05, + "loss": 0.0387, + "step": 14093 + }, + { + "epoch": 5.731598210654738, + "grad_norm": 8.883780779776519, + "learning_rate": 1.2377754218403037e-05, + "loss": 0.1024, + "step": 14094 + }, + { + "epoch": 5.732004880032534, + "grad_norm": 11.11761553537465, + "learning_rate": 1.2376770076077507e-05, + "loss": 0.123, + "step": 14095 + }, + { + "epoch": 5.73241154941033, + "grad_norm": 5.815844378991824, + "learning_rate": 1.2375785909353239e-05, + "loss": 0.3533, + "step": 14096 + }, + { + "epoch": 5.732818218788125, + "grad_norm": 0.6946192237708139, + "learning_rate": 1.2374801718240334e-05, + "loss": 0.0103, + "step": 14097 + }, + { + "epoch": 5.733224888165921, + "grad_norm": 5.522705575071885, + "learning_rate": 1.2373817502748887e-05, + "loss": 0.403, + "step": 14098 + }, + { + "epoch": 5.733631557543717, + "grad_norm": 7.082055083138476, + "learning_rate": 1.2372833262889015e-05, + "loss": 0.473, + "step": 14099 + }, + { + "epoch": 5.734038226921513, + "grad_norm": 4.2180727441002785, + "learning_rate": 1.2371848998670815e-05, + "loss": 0.0879, + "step": 14100 + }, + { + "epoch": 5.734444896299308, + "grad_norm": 0.75452565450484, + "learning_rate": 1.2370864710104386e-05, + "loss": 0.0121, + "step": 14101 + }, + { + "epoch": 5.734851565677104, + "grad_norm": 0.27029789396711473, + "learning_rate": 1.2369880397199843e-05, + "loss": 0.0039, + "step": 14102 + }, + { + "epoch": 5.7352582350549, + "grad_norm": 7.425186286024931, + "learning_rate": 1.2368896059967283e-05, + "loss": 0.265, + "step": 14103 + }, + { + "epoch": 5.735664904432696, + "grad_norm": 15.24322163501752, + "learning_rate": 1.2367911698416813e-05, + "loss": 0.8072, + "step": 14104 + }, + { + "epoch": 5.736071573810492, + "grad_norm": 3.1340379052834706, + "learning_rate": 1.2366927312558536e-05, + "loss": 0.0583, + "step": 14105 + }, + { + "epoch": 5.736478243188288, + "grad_norm": 6.484926427739541, + "learning_rate": 1.236594290240256e-05, + "loss": 0.2054, + "step": 14106 + }, + { + "epoch": 5.736884912566084, + "grad_norm": 6.090345906954652, + "learning_rate": 1.2364958467958992e-05, + "loss": 0.2375, + "step": 14107 + }, + { + "epoch": 5.73729158194388, + "grad_norm": 10.68153382166152, + "learning_rate": 1.236397400923793e-05, + "loss": 0.2331, + "step": 14108 + }, + { + "epoch": 5.737698251321675, + "grad_norm": 3.7509335738506704, + "learning_rate": 1.236298952624949e-05, + "loss": 0.2141, + "step": 14109 + }, + { + "epoch": 5.738104920699471, + "grad_norm": 7.629872437943295, + "learning_rate": 1.236200501900377e-05, + "loss": 0.3824, + "step": 14110 + }, + { + "epoch": 5.738511590077267, + "grad_norm": 4.535067950686156, + "learning_rate": 1.236102048751088e-05, + "loss": 0.1193, + "step": 14111 + }, + { + "epoch": 5.738918259455063, + "grad_norm": 13.780011341975463, + "learning_rate": 1.2360035931780927e-05, + "loss": 0.4811, 
+ "step": 14112 + }, + { + "epoch": 5.739324928832859, + "grad_norm": 0.16682342620775598, + "learning_rate": 1.235905135182402e-05, + "loss": 0.003, + "step": 14113 + }, + { + "epoch": 5.739731598210655, + "grad_norm": 0.21260406824069902, + "learning_rate": 1.235806674765026e-05, + "loss": 0.003, + "step": 14114 + }, + { + "epoch": 5.740138267588451, + "grad_norm": 0.04589592521398748, + "learning_rate": 1.2357082119269759e-05, + "loss": 0.0011, + "step": 14115 + }, + { + "epoch": 5.740544936966247, + "grad_norm": 0.32819964787033934, + "learning_rate": 1.2356097466692623e-05, + "loss": 0.0056, + "step": 14116 + }, + { + "epoch": 5.7409516063440424, + "grad_norm": 8.77888482392397, + "learning_rate": 1.2355112789928963e-05, + "loss": 0.2295, + "step": 14117 + }, + { + "epoch": 5.741358275721838, + "grad_norm": 1.2644353772020975, + "learning_rate": 1.235412808898888e-05, + "loss": 0.0229, + "step": 14118 + }, + { + "epoch": 5.741764945099634, + "grad_norm": 2.3063770324020103, + "learning_rate": 1.2353143363882494e-05, + "loss": 0.0841, + "step": 14119 + }, + { + "epoch": 5.74217161447743, + "grad_norm": 4.100984627285232, + "learning_rate": 1.2352158614619905e-05, + "loss": 0.1232, + "step": 14120 + }, + { + "epoch": 5.7425782838552255, + "grad_norm": 10.398336734613093, + "learning_rate": 1.235117384121122e-05, + "loss": 0.6919, + "step": 14121 + }, + { + "epoch": 5.742984953233021, + "grad_norm": 0.6948903345148258, + "learning_rate": 1.2350189043666557e-05, + "loss": 0.0146, + "step": 14122 + }, + { + "epoch": 5.743391622610817, + "grad_norm": 5.213916210666448, + "learning_rate": 1.2349204221996019e-05, + "loss": 0.1337, + "step": 14123 + }, + { + "epoch": 5.743798291988613, + "grad_norm": 9.121540189861182, + "learning_rate": 1.2348219376209714e-05, + "loss": 0.4159, + "step": 14124 + }, + { + "epoch": 5.7442049613664095, + "grad_norm": 0.6696613479955447, + "learning_rate": 1.234723450631776e-05, + "loss": 0.0142, + "step": 14125 + }, + { + "epoch": 5.744611630744205, + "grad_norm": 4.7557761883412235, + "learning_rate": 1.2346249612330261e-05, + "loss": 0.2522, + "step": 14126 + }, + { + "epoch": 5.745018300122001, + "grad_norm": 4.631789105599779, + "learning_rate": 1.2345264694257326e-05, + "loss": 0.197, + "step": 14127 + }, + { + "epoch": 5.745424969499797, + "grad_norm": 4.915134247649829, + "learning_rate": 1.2344279752109072e-05, + "loss": 0.1041, + "step": 14128 + }, + { + "epoch": 5.7458316388775925, + "grad_norm": 0.4254565845156415, + "learning_rate": 1.2343294785895608e-05, + "loss": 0.0065, + "step": 14129 + }, + { + "epoch": 5.746238308255388, + "grad_norm": 0.5541387023948563, + "learning_rate": 1.2342309795627041e-05, + "loss": 0.0089, + "step": 14130 + }, + { + "epoch": 5.746644977633184, + "grad_norm": 8.740802221552475, + "learning_rate": 1.2341324781313484e-05, + "loss": 0.3689, + "step": 14131 + }, + { + "epoch": 5.74705164701098, + "grad_norm": 3.053109501908065, + "learning_rate": 1.234033974296505e-05, + "loss": 0.0466, + "step": 14132 + }, + { + "epoch": 5.747458316388776, + "grad_norm": 0.23123106009109587, + "learning_rate": 1.2339354680591856e-05, + "loss": 0.0043, + "step": 14133 + }, + { + "epoch": 5.747864985766572, + "grad_norm": 3.715912645797019, + "learning_rate": 1.2338369594204003e-05, + "loss": 0.0651, + "step": 14134 + }, + { + "epoch": 5.748271655144368, + "grad_norm": 10.68140350308085, + "learning_rate": 1.233738448381161e-05, + "loss": 0.3483, + "step": 14135 + }, + { + "epoch": 5.748678324522164, + "grad_norm": 2.7924122774523097, + 
"learning_rate": 1.233639934942479e-05, + "loss": 0.0833, + "step": 14136 + }, + { + "epoch": 5.7490849938999595, + "grad_norm": 12.989834267117303, + "learning_rate": 1.2335414191053654e-05, + "loss": 0.4931, + "step": 14137 + }, + { + "epoch": 5.749491663277755, + "grad_norm": 5.483744293610564, + "learning_rate": 1.2334429008708318e-05, + "loss": 0.0999, + "step": 14138 + }, + { + "epoch": 5.749898332655551, + "grad_norm": 5.480416787121076, + "learning_rate": 1.2333443802398892e-05, + "loss": 0.1985, + "step": 14139 + }, + { + "epoch": 5.750305002033347, + "grad_norm": 6.124445505733937, + "learning_rate": 1.2332458572135488e-05, + "loss": 0.0835, + "step": 14140 + }, + { + "epoch": 5.750711671411143, + "grad_norm": 4.264708726237095, + "learning_rate": 1.2331473317928226e-05, + "loss": 0.0881, + "step": 14141 + }, + { + "epoch": 5.751118340788938, + "grad_norm": 7.197499387434122, + "learning_rate": 1.2330488039787218e-05, + "loss": 0.3544, + "step": 14142 + }, + { + "epoch": 5.751525010166734, + "grad_norm": 3.6865833848538063, + "learning_rate": 1.2329502737722576e-05, + "loss": 0.2081, + "step": 14143 + }, + { + "epoch": 5.75193167954453, + "grad_norm": 3.12168820041149, + "learning_rate": 1.2328517411744417e-05, + "loss": 0.0943, + "step": 14144 + }, + { + "epoch": 5.752338348922326, + "grad_norm": 1.277912018168478, + "learning_rate": 1.2327532061862852e-05, + "loss": 0.0168, + "step": 14145 + }, + { + "epoch": 5.752745018300122, + "grad_norm": 8.657264608683091, + "learning_rate": 1.2326546688088003e-05, + "loss": 0.1713, + "step": 14146 + }, + { + "epoch": 5.753151687677918, + "grad_norm": 0.09261378533649693, + "learning_rate": 1.2325561290429979e-05, + "loss": 0.0015, + "step": 14147 + }, + { + "epoch": 5.753558357055714, + "grad_norm": 0.43758323229746054, + "learning_rate": 1.23245758688989e-05, + "loss": 0.0041, + "step": 14148 + }, + { + "epoch": 5.75396502643351, + "grad_norm": 0.18010696691367137, + "learning_rate": 1.2323590423504877e-05, + "loss": 0.0029, + "step": 14149 + }, + { + "epoch": 5.754371695811305, + "grad_norm": 6.985805587063977, + "learning_rate": 1.2322604954258029e-05, + "loss": 0.1656, + "step": 14150 + }, + { + "epoch": 5.754778365189101, + "grad_norm": 8.107467814815955, + "learning_rate": 1.2321619461168475e-05, + "loss": 0.2222, + "step": 14151 + }, + { + "epoch": 5.755185034566897, + "grad_norm": 2.3220679365530636, + "learning_rate": 1.2320633944246328e-05, + "loss": 0.0552, + "step": 14152 + }, + { + "epoch": 5.755591703944693, + "grad_norm": 4.9522293419342205, + "learning_rate": 1.2319648403501701e-05, + "loss": 0.0889, + "step": 14153 + }, + { + "epoch": 5.755998373322489, + "grad_norm": 6.839644339090274, + "learning_rate": 1.231866283894472e-05, + "loss": 0.1859, + "step": 14154 + }, + { + "epoch": 5.756405042700285, + "grad_norm": 5.312337212378142, + "learning_rate": 1.2317677250585498e-05, + "loss": 0.1064, + "step": 14155 + }, + { + "epoch": 5.756811712078081, + "grad_norm": 9.32017714860998, + "learning_rate": 1.231669163843415e-05, + "loss": 0.304, + "step": 14156 + }, + { + "epoch": 5.757218381455877, + "grad_norm": 0.2147739088645788, + "learning_rate": 1.2315706002500798e-05, + "loss": 0.0036, + "step": 14157 + }, + { + "epoch": 5.757625050833672, + "grad_norm": 12.806109430333787, + "learning_rate": 1.2314720342795559e-05, + "loss": 0.461, + "step": 14158 + }, + { + "epoch": 5.758031720211468, + "grad_norm": 3.8136504129071875, + "learning_rate": 1.2313734659328549e-05, + "loss": 0.083, + "step": 14159 + }, + { + "epoch": 
5.758438389589264, + "grad_norm": 3.5732568101070283, + "learning_rate": 1.231274895210989e-05, + "loss": 0.0969, + "step": 14160 + }, + { + "epoch": 5.75884505896706, + "grad_norm": 8.96239780040388, + "learning_rate": 1.23117632211497e-05, + "loss": 0.3671, + "step": 14161 + }, + { + "epoch": 5.759251728344855, + "grad_norm": 7.322294479155803, + "learning_rate": 1.2310777466458093e-05, + "loss": 0.2713, + "step": 14162 + }, + { + "epoch": 5.759658397722651, + "grad_norm": 0.058218377516116186, + "learning_rate": 1.2309791688045194e-05, + "loss": 0.001, + "step": 14163 + }, + { + "epoch": 5.760065067100447, + "grad_norm": 0.6471399893381656, + "learning_rate": 1.2308805885921123e-05, + "loss": 0.008, + "step": 14164 + }, + { + "epoch": 5.760471736478243, + "grad_norm": 3.4768653487247354, + "learning_rate": 1.2307820060095996e-05, + "loss": 0.0704, + "step": 14165 + }, + { + "epoch": 5.760878405856039, + "grad_norm": 2.6161645463642604, + "learning_rate": 1.2306834210579935e-05, + "loss": 0.0627, + "step": 14166 + }, + { + "epoch": 5.761285075233835, + "grad_norm": 6.363150079132182, + "learning_rate": 1.230584833738306e-05, + "loss": 0.16, + "step": 14167 + }, + { + "epoch": 5.761691744611631, + "grad_norm": 4.36856293866602, + "learning_rate": 1.230486244051549e-05, + "loss": 0.185, + "step": 14168 + }, + { + "epoch": 5.762098413989427, + "grad_norm": 0.1524480216201598, + "learning_rate": 1.2303876519987348e-05, + "loss": 0.002, + "step": 14169 + }, + { + "epoch": 5.7625050833672224, + "grad_norm": 7.016118873987804, + "learning_rate": 1.2302890575808752e-05, + "loss": 0.1178, + "step": 14170 + }, + { + "epoch": 5.762911752745018, + "grad_norm": 1.337880755043517, + "learning_rate": 1.230190460798983e-05, + "loss": 0.0475, + "step": 14171 + }, + { + "epoch": 5.763318422122814, + "grad_norm": 1.3574464258078827, + "learning_rate": 1.2300918616540693e-05, + "loss": 0.0279, + "step": 14172 + }, + { + "epoch": 5.76372509150061, + "grad_norm": 0.08852148218363935, + "learning_rate": 1.2299932601471472e-05, + "loss": 0.0014, + "step": 14173 + }, + { + "epoch": 5.7641317608784055, + "grad_norm": 3.779573094823416, + "learning_rate": 1.2298946562792286e-05, + "loss": 0.0508, + "step": 14174 + }, + { + "epoch": 5.764538430256202, + "grad_norm": 8.947032811875433, + "learning_rate": 1.2297960500513255e-05, + "loss": 0.4512, + "step": 14175 + }, + { + "epoch": 5.764945099633998, + "grad_norm": 0.1471479182285179, + "learning_rate": 1.2296974414644503e-05, + "loss": 0.0027, + "step": 14176 + }, + { + "epoch": 5.765351769011794, + "grad_norm": 4.618812949734176, + "learning_rate": 1.2295988305196152e-05, + "loss": 0.0927, + "step": 14177 + }, + { + "epoch": 5.7657584383895895, + "grad_norm": 2.8055423115312372, + "learning_rate": 1.2295002172178326e-05, + "loss": 0.0617, + "step": 14178 + }, + { + "epoch": 5.766165107767385, + "grad_norm": 5.006381853639776, + "learning_rate": 1.2294016015601147e-05, + "loss": 0.1077, + "step": 14179 + }, + { + "epoch": 5.766571777145181, + "grad_norm": 2.005515623956607, + "learning_rate": 1.2293029835474739e-05, + "loss": 0.0313, + "step": 14180 + }, + { + "epoch": 5.766978446522977, + "grad_norm": 14.000462620725322, + "learning_rate": 1.2292043631809227e-05, + "loss": 0.1176, + "step": 14181 + }, + { + "epoch": 5.7673851159007725, + "grad_norm": 2.4854815821466425, + "learning_rate": 1.2291057404614732e-05, + "loss": 0.035, + "step": 14182 + }, + { + "epoch": 5.767791785278568, + "grad_norm": 2.4152149341944686, + "learning_rate": 1.2290071153901382e-05, + 
"loss": 0.0295, + "step": 14183 + }, + { + "epoch": 5.768198454656364, + "grad_norm": 8.101404902636807, + "learning_rate": 1.2289084879679298e-05, + "loss": 0.2915, + "step": 14184 + }, + { + "epoch": 5.76860512403416, + "grad_norm": 3.5378043193028526, + "learning_rate": 1.2288098581958602e-05, + "loss": 0.1784, + "step": 14185 + }, + { + "epoch": 5.7690117934119565, + "grad_norm": 0.1755543037760226, + "learning_rate": 1.2287112260749427e-05, + "loss": 0.0023, + "step": 14186 + }, + { + "epoch": 5.769418462789752, + "grad_norm": 4.984764428036204, + "learning_rate": 1.2286125916061893e-05, + "loss": 0.3533, + "step": 14187 + }, + { + "epoch": 5.769825132167548, + "grad_norm": 0.40105665931791484, + "learning_rate": 1.2285139547906127e-05, + "loss": 0.0051, + "step": 14188 + }, + { + "epoch": 5.770231801545344, + "grad_norm": 10.172879669716036, + "learning_rate": 1.228415315629225e-05, + "loss": 0.059, + "step": 14189 + }, + { + "epoch": 5.7706384709231395, + "grad_norm": 0.33139295889980785, + "learning_rate": 1.2283166741230392e-05, + "loss": 0.005, + "step": 14190 + }, + { + "epoch": 5.771045140300935, + "grad_norm": 0.08873084136737398, + "learning_rate": 1.2282180302730683e-05, + "loss": 0.0011, + "step": 14191 + }, + { + "epoch": 5.771451809678731, + "grad_norm": 1.3999939734019744, + "learning_rate": 1.2281193840803236e-05, + "loss": 0.0272, + "step": 14192 + }, + { + "epoch": 5.771858479056527, + "grad_norm": 2.5650713850623883, + "learning_rate": 1.2280207355458193e-05, + "loss": 0.0326, + "step": 14193 + }, + { + "epoch": 5.772265148434323, + "grad_norm": 4.705327839319359, + "learning_rate": 1.2279220846705673e-05, + "loss": 0.0688, + "step": 14194 + }, + { + "epoch": 5.772671817812119, + "grad_norm": 0.04555982888902627, + "learning_rate": 1.2278234314555802e-05, + "loss": 0.0006, + "step": 14195 + }, + { + "epoch": 5.773078487189915, + "grad_norm": 16.8549967764928, + "learning_rate": 1.2277247759018711e-05, + "loss": 0.1616, + "step": 14196 + }, + { + "epoch": 5.773485156567711, + "grad_norm": 8.34967463490152, + "learning_rate": 1.2276261180104525e-05, + "loss": 0.3653, + "step": 14197 + }, + { + "epoch": 5.7738918259455065, + "grad_norm": 5.876062944438852, + "learning_rate": 1.227527457782337e-05, + "loss": 0.222, + "step": 14198 + }, + { + "epoch": 5.774298495323302, + "grad_norm": 0.10177647671649691, + "learning_rate": 1.2274287952185379e-05, + "loss": 0.0022, + "step": 14199 + }, + { + "epoch": 5.774705164701098, + "grad_norm": 24.422792892144585, + "learning_rate": 1.2273301303200677e-05, + "loss": 0.7667, + "step": 14200 + }, + { + "epoch": 5.775111834078894, + "grad_norm": 6.189753608381727, + "learning_rate": 1.2272314630879391e-05, + "loss": 0.2495, + "step": 14201 + }, + { + "epoch": 5.77551850345669, + "grad_norm": 8.637820122902303, + "learning_rate": 1.2271327935231657e-05, + "loss": 0.2044, + "step": 14202 + }, + { + "epoch": 5.775925172834485, + "grad_norm": 3.7439916551979393, + "learning_rate": 1.2270341216267593e-05, + "loss": 0.0479, + "step": 14203 + }, + { + "epoch": 5.776331842212281, + "grad_norm": 0.07128166178361989, + "learning_rate": 1.2269354473997334e-05, + "loss": 0.0007, + "step": 14204 + }, + { + "epoch": 5.776738511590077, + "grad_norm": 0.06725923112584788, + "learning_rate": 1.2268367708431012e-05, + "loss": 0.0009, + "step": 14205 + }, + { + "epoch": 5.777145180967873, + "grad_norm": 0.9008481123812145, + "learning_rate": 1.2267380919578753e-05, + "loss": 0.0114, + "step": 14206 + }, + { + "epoch": 5.777551850345669, + "grad_norm": 
0.4997494884073462, + "learning_rate": 1.2266394107450688e-05, + "loss": 0.0084, + "step": 14207 + }, + { + "epoch": 5.777958519723465, + "grad_norm": 1.938447013745076, + "learning_rate": 1.2265407272056946e-05, + "loss": 0.0121, + "step": 14208 + }, + { + "epoch": 5.778365189101261, + "grad_norm": 1.3196412654491239, + "learning_rate": 1.2264420413407661e-05, + "loss": 0.0269, + "step": 14209 + }, + { + "epoch": 5.778771858479057, + "grad_norm": 4.288521272722713, + "learning_rate": 1.2263433531512958e-05, + "loss": 0.1748, + "step": 14210 + }, + { + "epoch": 5.779178527856852, + "grad_norm": 3.2320360737886316, + "learning_rate": 1.2262446626382969e-05, + "loss": 0.068, + "step": 14211 + }, + { + "epoch": 5.779585197234648, + "grad_norm": 7.156848790994974, + "learning_rate": 1.226145969802783e-05, + "loss": 0.1503, + "step": 14212 + }, + { + "epoch": 5.779991866612444, + "grad_norm": 0.07195140024037344, + "learning_rate": 1.2260472746457669e-05, + "loss": 0.0014, + "step": 14213 + }, + { + "epoch": 5.78039853599024, + "grad_norm": 3.0904720483071895, + "learning_rate": 1.2259485771682613e-05, + "loss": 0.0561, + "step": 14214 + }, + { + "epoch": 5.780805205368035, + "grad_norm": 10.012485296944623, + "learning_rate": 1.2258498773712804e-05, + "loss": 0.2319, + "step": 14215 + }, + { + "epoch": 5.781211874745832, + "grad_norm": 0.03441606747567221, + "learning_rate": 1.2257511752558366e-05, + "loss": 0.0006, + "step": 14216 + }, + { + "epoch": 5.781618544123628, + "grad_norm": 11.310072416769671, + "learning_rate": 1.2256524708229432e-05, + "loss": 0.3655, + "step": 14217 + }, + { + "epoch": 5.782025213501424, + "grad_norm": 7.307102909417879, + "learning_rate": 1.225553764073614e-05, + "loss": 0.1085, + "step": 14218 + }, + { + "epoch": 5.782431882879219, + "grad_norm": 0.2397955766374174, + "learning_rate": 1.2254550550088615e-05, + "loss": 0.0035, + "step": 14219 + }, + { + "epoch": 5.782838552257015, + "grad_norm": 10.930074383621282, + "learning_rate": 1.2253563436296996e-05, + "loss": 0.313, + "step": 14220 + }, + { + "epoch": 5.783245221634811, + "grad_norm": 13.408177445228029, + "learning_rate": 1.2252576299371415e-05, + "loss": 0.5212, + "step": 14221 + }, + { + "epoch": 5.783651891012607, + "grad_norm": 0.27168748324564657, + "learning_rate": 1.2251589139322002e-05, + "loss": 0.0049, + "step": 14222 + }, + { + "epoch": 5.7840585603904024, + "grad_norm": 6.835214081462723, + "learning_rate": 1.2250601956158896e-05, + "loss": 0.321, + "step": 14223 + }, + { + "epoch": 5.784465229768198, + "grad_norm": 2.896728083397101, + "learning_rate": 1.2249614749892228e-05, + "loss": 0.0925, + "step": 14224 + }, + { + "epoch": 5.784871899145994, + "grad_norm": 3.4686954345726018, + "learning_rate": 1.224862752053213e-05, + "loss": 0.0744, + "step": 14225 + }, + { + "epoch": 5.78527856852379, + "grad_norm": 6.764989152645968, + "learning_rate": 1.224764026808874e-05, + "loss": 0.2656, + "step": 14226 + }, + { + "epoch": 5.785685237901586, + "grad_norm": 0.599424490277465, + "learning_rate": 1.2246652992572193e-05, + "loss": 0.0136, + "step": 14227 + }, + { + "epoch": 5.786091907279382, + "grad_norm": 0.27764459317504697, + "learning_rate": 1.224566569399262e-05, + "loss": 0.0052, + "step": 14228 + }, + { + "epoch": 5.786498576657178, + "grad_norm": 1.4058221795816226, + "learning_rate": 1.2244678372360159e-05, + "loss": 0.0201, + "step": 14229 + }, + { + "epoch": 5.786905246034974, + "grad_norm": 4.933128088044318, + "learning_rate": 1.2243691027684945e-05, + "loss": 0.2187, + "step": 
14230 + }, + { + "epoch": 5.7873119154127695, + "grad_norm": 1.1933088085087442, + "learning_rate": 1.2242703659977114e-05, + "loss": 0.0193, + "step": 14231 + }, + { + "epoch": 5.787718584790565, + "grad_norm": 3.350855118148993, + "learning_rate": 1.2241716269246802e-05, + "loss": 0.0567, + "step": 14232 + }, + { + "epoch": 5.788125254168361, + "grad_norm": 0.16881953955851223, + "learning_rate": 1.2240728855504142e-05, + "loss": 0.0027, + "step": 14233 + }, + { + "epoch": 5.788531923546157, + "grad_norm": 0.057154722089789764, + "learning_rate": 1.2239741418759272e-05, + "loss": 0.0009, + "step": 14234 + }, + { + "epoch": 5.7889385929239525, + "grad_norm": 2.782680581724554, + "learning_rate": 1.2238753959022331e-05, + "loss": 0.0541, + "step": 14235 + }, + { + "epoch": 5.789345262301749, + "grad_norm": 2.2243499114995458, + "learning_rate": 1.2237766476303457e-05, + "loss": 0.0429, + "step": 14236 + }, + { + "epoch": 5.789751931679545, + "grad_norm": 6.785965079028191, + "learning_rate": 1.2236778970612777e-05, + "loss": 0.2818, + "step": 14237 + }, + { + "epoch": 5.790158601057341, + "grad_norm": 3.0269181563028438, + "learning_rate": 1.2235791441960441e-05, + "loss": 0.0626, + "step": 14238 + }, + { + "epoch": 5.7905652704351365, + "grad_norm": 5.28186135291052, + "learning_rate": 1.2234803890356577e-05, + "loss": 0.1448, + "step": 14239 + }, + { + "epoch": 5.790971939812932, + "grad_norm": 0.027595176569729048, + "learning_rate": 1.2233816315811327e-05, + "loss": 0.0003, + "step": 14240 + }, + { + "epoch": 5.791378609190728, + "grad_norm": 6.675138696002999, + "learning_rate": 1.2232828718334828e-05, + "loss": 0.2943, + "step": 14241 + }, + { + "epoch": 5.791785278568524, + "grad_norm": 5.529649560920604, + "learning_rate": 1.223184109793722e-05, + "loss": 0.1204, + "step": 14242 + }, + { + "epoch": 5.7921919479463195, + "grad_norm": 5.324002886576218, + "learning_rate": 1.2230853454628637e-05, + "loss": 0.1126, + "step": 14243 + }, + { + "epoch": 5.792598617324115, + "grad_norm": 0.4670930366308039, + "learning_rate": 1.2229865788419223e-05, + "loss": 0.0081, + "step": 14244 + }, + { + "epoch": 5.793005286701911, + "grad_norm": 13.03577368648315, + "learning_rate": 1.2228878099319115e-05, + "loss": 0.3785, + "step": 14245 + }, + { + "epoch": 5.793411956079707, + "grad_norm": 2.1837790781333024, + "learning_rate": 1.2227890387338453e-05, + "loss": 0.0487, + "step": 14246 + }, + { + "epoch": 5.793818625457503, + "grad_norm": 7.608500861750423, + "learning_rate": 1.222690265248737e-05, + "loss": 0.1842, + "step": 14247 + }, + { + "epoch": 5.794225294835299, + "grad_norm": 1.766107986180142, + "learning_rate": 1.2225914894776016e-05, + "loss": 0.0309, + "step": 14248 + }, + { + "epoch": 5.794631964213095, + "grad_norm": 0.45116188790626244, + "learning_rate": 1.222492711421452e-05, + "loss": 0.007, + "step": 14249 + }, + { + "epoch": 5.795038633590891, + "grad_norm": 5.020102284935504, + "learning_rate": 1.2223939310813031e-05, + "loss": 0.0491, + "step": 14250 + }, + { + "epoch": 5.7954453029686865, + "grad_norm": 0.015858633156953673, + "learning_rate": 1.2222951484581685e-05, + "loss": 0.0004, + "step": 14251 + }, + { + "epoch": 5.795851972346482, + "grad_norm": 0.1423008403047656, + "learning_rate": 1.222196363553062e-05, + "loss": 0.0021, + "step": 14252 + }, + { + "epoch": 5.796258641724278, + "grad_norm": 0.2678208410678241, + "learning_rate": 1.2220975763669984e-05, + "loss": 0.0036, + "step": 14253 + }, + { + "epoch": 5.796665311102074, + "grad_norm": 3.2049187922304814, + 
"learning_rate": 1.2219987869009913e-05, + "loss": 0.1795, + "step": 14254 + }, + { + "epoch": 5.79707198047987, + "grad_norm": 15.939157799909944, + "learning_rate": 1.221899995156055e-05, + "loss": 0.4132, + "step": 14255 + }, + { + "epoch": 5.797478649857665, + "grad_norm": 5.180461821532042, + "learning_rate": 1.2218012011332032e-05, + "loss": 0.1179, + "step": 14256 + }, + { + "epoch": 5.797885319235462, + "grad_norm": 3.4379168608530937, + "learning_rate": 1.2217024048334508e-05, + "loss": 0.1082, + "step": 14257 + }, + { + "epoch": 5.798291988613258, + "grad_norm": 2.6407832810224816, + "learning_rate": 1.2216036062578116e-05, + "loss": 0.0349, + "step": 14258 + }, + { + "epoch": 5.7986986579910536, + "grad_norm": 6.502795885001038, + "learning_rate": 1.2215048054072995e-05, + "loss": 0.0873, + "step": 14259 + }, + { + "epoch": 5.799105327368849, + "grad_norm": 7.430615070375528, + "learning_rate": 1.2214060022829296e-05, + "loss": 0.275, + "step": 14260 + }, + { + "epoch": 5.799511996746645, + "grad_norm": 15.562524220053275, + "learning_rate": 1.2213071968857153e-05, + "loss": 0.4953, + "step": 14261 + }, + { + "epoch": 5.799918666124441, + "grad_norm": 1.4430179876481604, + "learning_rate": 1.2212083892166713e-05, + "loss": 0.0602, + "step": 14262 + }, + { + "epoch": 5.800325335502237, + "grad_norm": 1.3806225663065481, + "learning_rate": 1.2211095792768119e-05, + "loss": 0.0247, + "step": 14263 + }, + { + "epoch": 5.800732004880032, + "grad_norm": 7.494214795180284, + "learning_rate": 1.2210107670671515e-05, + "loss": 0.2288, + "step": 14264 + }, + { + "epoch": 5.801138674257828, + "grad_norm": 15.012456923294161, + "learning_rate": 1.2209119525887041e-05, + "loss": 0.6272, + "step": 14265 + }, + { + "epoch": 5.801545343635624, + "grad_norm": 8.887000728996219, + "learning_rate": 1.2208131358424847e-05, + "loss": 0.4235, + "step": 14266 + }, + { + "epoch": 5.80195201301342, + "grad_norm": 1.8205773119052169, + "learning_rate": 1.220714316829507e-05, + "loss": 0.0246, + "step": 14267 + }, + { + "epoch": 5.802358682391216, + "grad_norm": 1.6556090637056788, + "learning_rate": 1.2206154955507858e-05, + "loss": 0.0247, + "step": 14268 + }, + { + "epoch": 5.802765351769012, + "grad_norm": 3.3209927373290276, + "learning_rate": 1.2205166720073356e-05, + "loss": 0.0737, + "step": 14269 + }, + { + "epoch": 5.803172021146808, + "grad_norm": 0.5688685898346719, + "learning_rate": 1.2204178462001708e-05, + "loss": 0.0053, + "step": 14270 + }, + { + "epoch": 5.803578690524604, + "grad_norm": 5.663099379045796, + "learning_rate": 1.2203190181303057e-05, + "loss": 0.0679, + "step": 14271 + }, + { + "epoch": 5.803985359902399, + "grad_norm": 2.328734241366753, + "learning_rate": 1.2202201877987549e-05, + "loss": 0.0143, + "step": 14272 + }, + { + "epoch": 5.804392029280195, + "grad_norm": 8.232531554734765, + "learning_rate": 1.2201213552065334e-05, + "loss": 0.2283, + "step": 14273 + }, + { + "epoch": 5.804798698657991, + "grad_norm": 2.4738961271702276, + "learning_rate": 1.2200225203546555e-05, + "loss": 0.0826, + "step": 14274 + }, + { + "epoch": 5.805205368035787, + "grad_norm": 7.305390090799887, + "learning_rate": 1.2199236832441353e-05, + "loss": 0.3097, + "step": 14275 + }, + { + "epoch": 5.8056120374135824, + "grad_norm": 0.2303216710819587, + "learning_rate": 1.219824843875988e-05, + "loss": 0.0051, + "step": 14276 + }, + { + "epoch": 5.806018706791379, + "grad_norm": 2.1183176511899253, + "learning_rate": 1.2197260022512279e-05, + "loss": 0.0592, + "step": 14277 + }, + { + "epoch": 
5.806425376169175, + "grad_norm": 6.495022522553787, + "learning_rate": 1.21962715837087e-05, + "loss": 0.0349, + "step": 14278 + }, + { + "epoch": 5.806832045546971, + "grad_norm": 11.127544223324282, + "learning_rate": 1.2195283122359285e-05, + "loss": 0.3131, + "step": 14279 + }, + { + "epoch": 5.807238714924766, + "grad_norm": 19.210591885163705, + "learning_rate": 1.2194294638474187e-05, + "loss": 0.144, + "step": 14280 + }, + { + "epoch": 5.807645384302562, + "grad_norm": 0.1942523856886311, + "learning_rate": 1.219330613206355e-05, + "loss": 0.0039, + "step": 14281 + }, + { + "epoch": 5.808052053680358, + "grad_norm": 9.749417930278389, + "learning_rate": 1.2192317603137518e-05, + "loss": 0.3859, + "step": 14282 + }, + { + "epoch": 5.808458723058154, + "grad_norm": 4.409023358234033, + "learning_rate": 1.2191329051706244e-05, + "loss": 0.0925, + "step": 14283 + }, + { + "epoch": 5.8088653924359495, + "grad_norm": 2.6829121921823034, + "learning_rate": 1.2190340477779876e-05, + "loss": 0.0486, + "step": 14284 + }, + { + "epoch": 5.809272061813745, + "grad_norm": 4.528534232566309, + "learning_rate": 1.218935188136856e-05, + "loss": 0.1813, + "step": 14285 + }, + { + "epoch": 5.809678731191541, + "grad_norm": 1.7196340761126476, + "learning_rate": 1.2188363262482446e-05, + "loss": 0.0107, + "step": 14286 + }, + { + "epoch": 5.810085400569337, + "grad_norm": 0.07127509352939218, + "learning_rate": 1.218737462113168e-05, + "loss": 0.0012, + "step": 14287 + }, + { + "epoch": 5.8104920699471325, + "grad_norm": 16.33935780843386, + "learning_rate": 1.2186385957326411e-05, + "loss": 0.551, + "step": 14288 + }, + { + "epoch": 5.810898739324929, + "grad_norm": 0.3261712984717793, + "learning_rate": 1.2185397271076793e-05, + "loss": 0.0041, + "step": 14289 + }, + { + "epoch": 5.811305408702725, + "grad_norm": 1.58977455638014, + "learning_rate": 1.2184408562392973e-05, + "loss": 0.0254, + "step": 14290 + }, + { + "epoch": 5.811712078080521, + "grad_norm": 0.23955854407974797, + "learning_rate": 1.2183419831285097e-05, + "loss": 0.0042, + "step": 14291 + }, + { + "epoch": 5.8121187474583165, + "grad_norm": 5.697221330301157, + "learning_rate": 1.2182431077763317e-05, + "loss": 0.2368, + "step": 14292 + }, + { + "epoch": 5.812525416836112, + "grad_norm": 0.1723455350875994, + "learning_rate": 1.2181442301837787e-05, + "loss": 0.003, + "step": 14293 + }, + { + "epoch": 5.812932086213908, + "grad_norm": 10.143565619262553, + "learning_rate": 1.2180453503518652e-05, + "loss": 0.3361, + "step": 14294 + }, + { + "epoch": 5.813338755591704, + "grad_norm": 4.661987377453516, + "learning_rate": 1.2179464682816062e-05, + "loss": 0.2498, + "step": 14295 + }, + { + "epoch": 5.8137454249694995, + "grad_norm": 1.7592061962685006, + "learning_rate": 1.2178475839740175e-05, + "loss": 0.0249, + "step": 14296 + }, + { + "epoch": 5.814152094347295, + "grad_norm": 6.042502467414037, + "learning_rate": 1.2177486974301133e-05, + "loss": 0.1004, + "step": 14297 + }, + { + "epoch": 5.814558763725092, + "grad_norm": 0.6785031410026391, + "learning_rate": 1.2176498086509095e-05, + "loss": 0.013, + "step": 14298 + }, + { + "epoch": 5.814965433102888, + "grad_norm": 0.4821355931549478, + "learning_rate": 1.2175509176374206e-05, + "loss": 0.0066, + "step": 14299 + }, + { + "epoch": 5.8153721024806835, + "grad_norm": 7.539385633263881, + "learning_rate": 1.2174520243906622e-05, + "loss": 0.2341, + "step": 14300 + }, + { + "epoch": 5.815778771858479, + "grad_norm": 0.1662959792487708, + "learning_rate": 
1.2173531289116491e-05, + "loss": 0.0031, + "step": 14301 + }, + { + "epoch": 5.816185441236275, + "grad_norm": 4.496844574091043, + "learning_rate": 1.2172542312013969e-05, + "loss": 0.0812, + "step": 14302 + }, + { + "epoch": 5.816592110614071, + "grad_norm": 5.3055414631607665, + "learning_rate": 1.2171553312609207e-05, + "loss": 0.1525, + "step": 14303 + }, + { + "epoch": 5.8169987799918665, + "grad_norm": 0.10486907159693923, + "learning_rate": 1.2170564290912354e-05, + "loss": 0.0011, + "step": 14304 + }, + { + "epoch": 5.817405449369662, + "grad_norm": 8.754605184090112, + "learning_rate": 1.2169575246933571e-05, + "loss": 0.4148, + "step": 14305 + }, + { + "epoch": 5.817812118747458, + "grad_norm": 3.4058081523061134, + "learning_rate": 1.2168586180683004e-05, + "loss": 0.0623, + "step": 14306 + }, + { + "epoch": 5.818218788125254, + "grad_norm": 6.306497448735496, + "learning_rate": 1.2167597092170809e-05, + "loss": 0.2733, + "step": 14307 + }, + { + "epoch": 5.81862545750305, + "grad_norm": 7.739189683873042, + "learning_rate": 1.2166607981407137e-05, + "loss": 0.1641, + "step": 14308 + }, + { + "epoch": 5.819032126880846, + "grad_norm": 11.037043092010025, + "learning_rate": 1.2165618848402146e-05, + "loss": 0.5869, + "step": 14309 + }, + { + "epoch": 5.819438796258642, + "grad_norm": 5.798821665450932, + "learning_rate": 1.2164629693165985e-05, + "loss": 0.1257, + "step": 14310 + }, + { + "epoch": 5.819845465636438, + "grad_norm": 1.1794446888625767, + "learning_rate": 1.2163640515708815e-05, + "loss": 0.013, + "step": 14311 + }, + { + "epoch": 5.8202521350142336, + "grad_norm": 5.273158334566021, + "learning_rate": 1.2162651316040783e-05, + "loss": 0.2074, + "step": 14312 + }, + { + "epoch": 5.820658804392029, + "grad_norm": 6.605358576099003, + "learning_rate": 1.216166209417205e-05, + "loss": 0.3448, + "step": 14313 + }, + { + "epoch": 5.821065473769825, + "grad_norm": 11.177574667821984, + "learning_rate": 1.2160672850112763e-05, + "loss": 0.3619, + "step": 14314 + }, + { + "epoch": 5.821472143147621, + "grad_norm": 5.694875286500983, + "learning_rate": 1.2159683583873084e-05, + "loss": 0.1446, + "step": 14315 + }, + { + "epoch": 5.821878812525417, + "grad_norm": 8.446192393398624, + "learning_rate": 1.2158694295463166e-05, + "loss": 0.3887, + "step": 14316 + }, + { + "epoch": 5.822285481903212, + "grad_norm": 6.203664018092476, + "learning_rate": 1.2157704984893161e-05, + "loss": 0.1678, + "step": 14317 + }, + { + "epoch": 5.822692151281009, + "grad_norm": 0.04658008242062702, + "learning_rate": 1.2156715652173234e-05, + "loss": 0.0008, + "step": 14318 + }, + { + "epoch": 5.823098820658805, + "grad_norm": 12.518725845686893, + "learning_rate": 1.2155726297313532e-05, + "loss": 0.3545, + "step": 14319 + }, + { + "epoch": 5.823505490036601, + "grad_norm": 8.226099317209744, + "learning_rate": 1.2154736920324215e-05, + "loss": 0.2211, + "step": 14320 + }, + { + "epoch": 5.823912159414396, + "grad_norm": 5.674163035030335, + "learning_rate": 1.215374752121544e-05, + "loss": 0.0567, + "step": 14321 + }, + { + "epoch": 5.824318828792192, + "grad_norm": 1.6206953766750343, + "learning_rate": 1.2152758099997364e-05, + "loss": 0.0263, + "step": 14322 + }, + { + "epoch": 5.824725498169988, + "grad_norm": 4.705188730918426, + "learning_rate": 1.2151768656680141e-05, + "loss": 0.0626, + "step": 14323 + }, + { + "epoch": 5.825132167547784, + "grad_norm": 9.452305262664057, + "learning_rate": 1.2150779191273929e-05, + "loss": 0.2622, + "step": 14324 + }, + { + "epoch": 
5.825538836925579, + "grad_norm": 2.9907610046886446, + "learning_rate": 1.2149789703788888e-05, + "loss": 0.0724, + "step": 14325 + }, + { + "epoch": 5.825945506303375, + "grad_norm": 1.640938099080602, + "learning_rate": 1.2148800194235173e-05, + "loss": 0.0287, + "step": 14326 + }, + { + "epoch": 5.826352175681171, + "grad_norm": 17.900558341864393, + "learning_rate": 1.2147810662622941e-05, + "loss": 0.8725, + "step": 14327 + }, + { + "epoch": 5.826758845058967, + "grad_norm": 10.382267486477545, + "learning_rate": 1.2146821108962355e-05, + "loss": 0.2758, + "step": 14328 + }, + { + "epoch": 5.8271655144367625, + "grad_norm": 8.500556577839475, + "learning_rate": 1.2145831533263568e-05, + "loss": 0.2966, + "step": 14329 + }, + { + "epoch": 5.827572183814559, + "grad_norm": 8.259745850497659, + "learning_rate": 1.2144841935536738e-05, + "loss": 0.353, + "step": 14330 + }, + { + "epoch": 5.827978853192355, + "grad_norm": 3.6157921721931965, + "learning_rate": 1.2143852315792029e-05, + "loss": 0.2091, + "step": 14331 + }, + { + "epoch": 5.828385522570151, + "grad_norm": 9.284253670086551, + "learning_rate": 1.2142862674039596e-05, + "loss": 0.4556, + "step": 14332 + }, + { + "epoch": 5.828792191947946, + "grad_norm": 1.765045100651165, + "learning_rate": 1.21418730102896e-05, + "loss": 0.0199, + "step": 14333 + }, + { + "epoch": 5.829198861325742, + "grad_norm": 8.244195276389666, + "learning_rate": 1.21408833245522e-05, + "loss": 0.1886, + "step": 14334 + }, + { + "epoch": 5.829605530703538, + "grad_norm": 0.11796071592958365, + "learning_rate": 1.2139893616837552e-05, + "loss": 0.0016, + "step": 14335 + }, + { + "epoch": 5.830012200081334, + "grad_norm": 10.186868206370544, + "learning_rate": 1.2138903887155824e-05, + "loss": 0.3448, + "step": 14336 + }, + { + "epoch": 5.8304188694591295, + "grad_norm": 9.848622394985577, + "learning_rate": 1.2137914135517167e-05, + "loss": 0.3942, + "step": 14337 + }, + { + "epoch": 5.830825538836925, + "grad_norm": 4.173694115095295, + "learning_rate": 1.2136924361931745e-05, + "loss": 0.1555, + "step": 14338 + }, + { + "epoch": 5.831232208214722, + "grad_norm": 0.732574418372745, + "learning_rate": 1.2135934566409722e-05, + "loss": 0.0071, + "step": 14339 + }, + { + "epoch": 5.831638877592518, + "grad_norm": 3.5081774802099432, + "learning_rate": 1.2134944748961253e-05, + "loss": 0.0914, + "step": 14340 + }, + { + "epoch": 5.832045546970313, + "grad_norm": 7.815777556625608, + "learning_rate": 1.2133954909596503e-05, + "loss": 0.272, + "step": 14341 + }, + { + "epoch": 5.832452216348109, + "grad_norm": 10.271482906678743, + "learning_rate": 1.2132965048325633e-05, + "loss": 0.5575, + "step": 14342 + }, + { + "epoch": 5.832858885725905, + "grad_norm": 0.3854093240317162, + "learning_rate": 1.21319751651588e-05, + "loss": 0.0055, + "step": 14343 + }, + { + "epoch": 5.833265555103701, + "grad_norm": 10.458265629905986, + "learning_rate": 1.2130985260106175e-05, + "loss": 0.7593, + "step": 14344 + }, + { + "epoch": 5.8336722244814965, + "grad_norm": 13.53522111373486, + "learning_rate": 1.2129995333177911e-05, + "loss": 0.5288, + "step": 14345 + }, + { + "epoch": 5.834078893859292, + "grad_norm": 3.9690283879854955, + "learning_rate": 1.2129005384384169e-05, + "loss": 0.0737, + "step": 14346 + }, + { + "epoch": 5.834485563237088, + "grad_norm": 6.060944236708664, + "learning_rate": 1.212801541373512e-05, + "loss": 0.2807, + "step": 14347 + }, + { + "epoch": 5.834892232614884, + "grad_norm": 4.336933632857815, + "learning_rate": 1.2127025421240919e-05, + 
"loss": 0.165, + "step": 14348 + }, + { + "epoch": 5.8352989019926795, + "grad_norm": 12.569616454657984, + "learning_rate": 1.2126035406911732e-05, + "loss": 0.5211, + "step": 14349 + }, + { + "epoch": 5.835705571370476, + "grad_norm": 16.30928913301434, + "learning_rate": 1.2125045370757722e-05, + "loss": 0.5636, + "step": 14350 + }, + { + "epoch": 5.836112240748272, + "grad_norm": 8.669733617560107, + "learning_rate": 1.2124055312789052e-05, + "loss": 0.4362, + "step": 14351 + }, + { + "epoch": 5.836518910126068, + "grad_norm": 9.108638313493067, + "learning_rate": 1.2123065233015882e-05, + "loss": 0.2064, + "step": 14352 + }, + { + "epoch": 5.8369255795038635, + "grad_norm": 10.216040353362578, + "learning_rate": 1.2122075131448384e-05, + "loss": 0.3128, + "step": 14353 + }, + { + "epoch": 5.837332248881659, + "grad_norm": 5.198765336460142, + "learning_rate": 1.2121085008096714e-05, + "loss": 0.1032, + "step": 14354 + }, + { + "epoch": 5.837738918259455, + "grad_norm": 4.410545158280317, + "learning_rate": 1.2120094862971039e-05, + "loss": 0.077, + "step": 14355 + }, + { + "epoch": 5.838145587637251, + "grad_norm": 4.566020150704459, + "learning_rate": 1.2119104696081523e-05, + "loss": 0.1409, + "step": 14356 + }, + { + "epoch": 5.8385522570150465, + "grad_norm": 0.21194364551471342, + "learning_rate": 1.2118114507438331e-05, + "loss": 0.0028, + "step": 14357 + }, + { + "epoch": 5.838958926392842, + "grad_norm": 0.21606514117028983, + "learning_rate": 1.211712429705163e-05, + "loss": 0.0044, + "step": 14358 + }, + { + "epoch": 5.839365595770639, + "grad_norm": 1.085289148983302, + "learning_rate": 1.211613406493158e-05, + "loss": 0.0235, + "step": 14359 + }, + { + "epoch": 5.839772265148435, + "grad_norm": 2.3164628298975454, + "learning_rate": 1.2115143811088348e-05, + "loss": 0.0476, + "step": 14360 + }, + { + "epoch": 5.8401789345262305, + "grad_norm": 1.012039042205743, + "learning_rate": 1.2114153535532103e-05, + "loss": 0.0239, + "step": 14361 + }, + { + "epoch": 5.840585603904026, + "grad_norm": 0.3205439708822, + "learning_rate": 1.2113163238273004e-05, + "loss": 0.0042, + "step": 14362 + }, + { + "epoch": 5.840992273281822, + "grad_norm": 0.6095498543742176, + "learning_rate": 1.2112172919321223e-05, + "loss": 0.0131, + "step": 14363 + }, + { + "epoch": 5.841398942659618, + "grad_norm": 2.6795118878677457, + "learning_rate": 1.2111182578686927e-05, + "loss": 0.0377, + "step": 14364 + }, + { + "epoch": 5.8418056120374136, + "grad_norm": 6.065947629844863, + "learning_rate": 1.2110192216380275e-05, + "loss": 0.1899, + "step": 14365 + }, + { + "epoch": 5.842212281415209, + "grad_norm": 1.754356960445891, + "learning_rate": 1.210920183241144e-05, + "loss": 0.0508, + "step": 14366 + }, + { + "epoch": 5.842618950793005, + "grad_norm": 9.07666488557865, + "learning_rate": 1.2108211426790585e-05, + "loss": 0.2801, + "step": 14367 + }, + { + "epoch": 5.843025620170801, + "grad_norm": 13.425380961644345, + "learning_rate": 1.2107220999527878e-05, + "loss": 1.0311, + "step": 14368 + }, + { + "epoch": 5.843432289548597, + "grad_norm": 5.978325214861919, + "learning_rate": 1.210623055063349e-05, + "loss": 0.2064, + "step": 14369 + }, + { + "epoch": 5.843838958926392, + "grad_norm": 3.276330993689781, + "learning_rate": 1.2105240080117584e-05, + "loss": 0.0545, + "step": 14370 + }, + { + "epoch": 5.844245628304189, + "grad_norm": 7.77059837567553, + "learning_rate": 1.2104249587990329e-05, + "loss": 0.1456, + "step": 14371 + }, + { + "epoch": 5.844652297681985, + "grad_norm": 
1.8942780256022835, + "learning_rate": 1.210325907426189e-05, + "loss": 0.0255, + "step": 14372 + }, + { + "epoch": 5.845058967059781, + "grad_norm": 2.568238064932039, + "learning_rate": 1.2102268538942443e-05, + "loss": 0.065, + "step": 14373 + }, + { + "epoch": 5.845465636437576, + "grad_norm": 6.002276930591635, + "learning_rate": 1.210127798204215e-05, + "loss": 0.2615, + "step": 14374 + }, + { + "epoch": 5.845872305815372, + "grad_norm": 0.9575352277730687, + "learning_rate": 1.2100287403571178e-05, + "loss": 0.0154, + "step": 14375 + }, + { + "epoch": 5.846278975193168, + "grad_norm": 0.5959582992680539, + "learning_rate": 1.2099296803539701e-05, + "loss": 0.0078, + "step": 14376 + }, + { + "epoch": 5.846685644570964, + "grad_norm": 8.648990985100387, + "learning_rate": 1.2098306181957889e-05, + "loss": 0.4628, + "step": 14377 + }, + { + "epoch": 5.847092313948759, + "grad_norm": 7.929406420771621, + "learning_rate": 1.20973155388359e-05, + "loss": 0.1547, + "step": 14378 + }, + { + "epoch": 5.847498983326555, + "grad_norm": 10.370624356863527, + "learning_rate": 1.2096324874183918e-05, + "loss": 0.2047, + "step": 14379 + }, + { + "epoch": 5.847905652704352, + "grad_norm": 0.749115616419187, + "learning_rate": 1.2095334188012107e-05, + "loss": 0.0093, + "step": 14380 + }, + { + "epoch": 5.848312322082148, + "grad_norm": 0.15244145127398076, + "learning_rate": 1.209434348033063e-05, + "loss": 0.0036, + "step": 14381 + }, + { + "epoch": 5.848718991459943, + "grad_norm": 0.4797049265059254, + "learning_rate": 1.2093352751149667e-05, + "loss": 0.0053, + "step": 14382 + }, + { + "epoch": 5.849125660837739, + "grad_norm": 1.0384356332583133, + "learning_rate": 1.2092362000479386e-05, + "loss": 0.0193, + "step": 14383 + }, + { + "epoch": 5.849532330215535, + "grad_norm": 6.492135633566714, + "learning_rate": 1.2091371228329953e-05, + "loss": 0.1578, + "step": 14384 + }, + { + "epoch": 5.849938999593331, + "grad_norm": 7.742219540888659, + "learning_rate": 1.2090380434711541e-05, + "loss": 0.1635, + "step": 14385 + }, + { + "epoch": 5.850345668971126, + "grad_norm": 2.1105720576472553, + "learning_rate": 1.2089389619634325e-05, + "loss": 0.0566, + "step": 14386 + }, + { + "epoch": 5.850752338348922, + "grad_norm": 3.590316560694529, + "learning_rate": 1.208839878310847e-05, + "loss": 0.0752, + "step": 14387 + }, + { + "epoch": 5.851159007726718, + "grad_norm": 0.2386780742976254, + "learning_rate": 1.2087407925144152e-05, + "loss": 0.0036, + "step": 14388 + }, + { + "epoch": 5.851565677104514, + "grad_norm": 0.32726107401316495, + "learning_rate": 1.2086417045751541e-05, + "loss": 0.007, + "step": 14389 + }, + { + "epoch": 5.8519723464823095, + "grad_norm": 3.166426622099088, + "learning_rate": 1.208542614494081e-05, + "loss": 0.0481, + "step": 14390 + }, + { + "epoch": 5.852379015860106, + "grad_norm": 4.964923932371947, + "learning_rate": 1.2084435222722126e-05, + "loss": 0.0224, + "step": 14391 + }, + { + "epoch": 5.852785685237902, + "grad_norm": 5.331291603011947, + "learning_rate": 1.208344427910567e-05, + "loss": 0.1859, + "step": 14392 + }, + { + "epoch": 5.853192354615698, + "grad_norm": 4.03157579528099, + "learning_rate": 1.2082453314101607e-05, + "loss": 0.0728, + "step": 14393 + }, + { + "epoch": 5.853599023993493, + "grad_norm": 2.131483477047874, + "learning_rate": 1.2081462327720114e-05, + "loss": 0.0334, + "step": 14394 + }, + { + "epoch": 5.854005693371289, + "grad_norm": 6.268206215819447, + "learning_rate": 1.2080471319971366e-05, + "loss": 0.2282, + "step": 14395 + }, + 
{ + "epoch": 5.854412362749085, + "grad_norm": 4.0915302522051755, + "learning_rate": 1.207948029086553e-05, + "loss": 0.2633, + "step": 14396 + }, + { + "epoch": 5.854819032126881, + "grad_norm": 9.665047528511904, + "learning_rate": 1.2078489240412778e-05, + "loss": 0.4126, + "step": 14397 + }, + { + "epoch": 5.8552257015046765, + "grad_norm": 0.23604477843740584, + "learning_rate": 1.2077498168623291e-05, + "loss": 0.0038, + "step": 14398 + }, + { + "epoch": 5.855632370882472, + "grad_norm": 1.01270914377976, + "learning_rate": 1.2076507075507242e-05, + "loss": 0.0192, + "step": 14399 + }, + { + "epoch": 5.856039040260269, + "grad_norm": 18.857181327071842, + "learning_rate": 1.20755159610748e-05, + "loss": 0.7618, + "step": 14400 + }, + { + "epoch": 5.856445709638065, + "grad_norm": 1.3889316720865044, + "learning_rate": 1.2074524825336142e-05, + "loss": 0.0248, + "step": 14401 + }, + { + "epoch": 5.85685237901586, + "grad_norm": 6.3520076151162685, + "learning_rate": 1.2073533668301444e-05, + "loss": 0.1974, + "step": 14402 + }, + { + "epoch": 5.857259048393656, + "grad_norm": 11.763741194631477, + "learning_rate": 1.207254248998088e-05, + "loss": 0.49, + "step": 14403 + }, + { + "epoch": 5.857665717771452, + "grad_norm": 7.004085723568855, + "learning_rate": 1.2071551290384621e-05, + "loss": 0.135, + "step": 14404 + }, + { + "epoch": 5.858072387149248, + "grad_norm": 2.5096162674115887, + "learning_rate": 1.207056006952285e-05, + "loss": 0.0617, + "step": 14405 + }, + { + "epoch": 5.8584790565270435, + "grad_norm": 0.3896502709313854, + "learning_rate": 1.2069568827405735e-05, + "loss": 0.0057, + "step": 14406 + }, + { + "epoch": 5.858885725904839, + "grad_norm": 3.874984200050268, + "learning_rate": 1.2068577564043453e-05, + "loss": 0.0704, + "step": 14407 + }, + { + "epoch": 5.859292395282635, + "grad_norm": 1.4239971993012073, + "learning_rate": 1.2067586279446184e-05, + "loss": 0.0303, + "step": 14408 + }, + { + "epoch": 5.859699064660431, + "grad_norm": 2.0823596593349682, + "learning_rate": 1.2066594973624099e-05, + "loss": 0.0473, + "step": 14409 + }, + { + "epoch": 5.8601057340382265, + "grad_norm": 4.677779602437392, + "learning_rate": 1.2065603646587378e-05, + "loss": 0.0484, + "step": 14410 + }, + { + "epoch": 5.860512403416022, + "grad_norm": 0.5505459223269714, + "learning_rate": 1.2064612298346196e-05, + "loss": 0.0097, + "step": 14411 + }, + { + "epoch": 5.860919072793819, + "grad_norm": 7.059286231078115, + "learning_rate": 1.2063620928910729e-05, + "loss": 0.1868, + "step": 14412 + }, + { + "epoch": 5.861325742171615, + "grad_norm": 0.8896804706168809, + "learning_rate": 1.2062629538291154e-05, + "loss": 0.0142, + "step": 14413 + }, + { + "epoch": 5.8617324115494105, + "grad_norm": 5.998988779496645, + "learning_rate": 1.206163812649765e-05, + "loss": 0.1173, + "step": 14414 + }, + { + "epoch": 5.862139080927206, + "grad_norm": 2.874328616025243, + "learning_rate": 1.2060646693540394e-05, + "loss": 0.0701, + "step": 14415 + }, + { + "epoch": 5.862545750305002, + "grad_norm": 6.726543509559541, + "learning_rate": 1.2059655239429562e-05, + "loss": 0.1944, + "step": 14416 + }, + { + "epoch": 5.862952419682798, + "grad_norm": 10.8896065797464, + "learning_rate": 1.205866376417533e-05, + "loss": 0.858, + "step": 14417 + }, + { + "epoch": 5.8633590890605936, + "grad_norm": 1.1879167061401696, + "learning_rate": 1.2057672267787879e-05, + "loss": 0.0171, + "step": 14418 + }, + { + "epoch": 5.863765758438389, + "grad_norm": 0.20580103542951744, + "learning_rate": 
1.2056680750277388e-05, + "loss": 0.0037, + "step": 14419 + }, + { + "epoch": 5.864172427816185, + "grad_norm": 5.2007812992774705, + "learning_rate": 1.2055689211654032e-05, + "loss": 0.2102, + "step": 14420 + }, + { + "epoch": 5.864579097193982, + "grad_norm": 1.7593138059531934, + "learning_rate": 1.2054697651927993e-05, + "loss": 0.0317, + "step": 14421 + }, + { + "epoch": 5.8649857665717775, + "grad_norm": 3.1893767400577886, + "learning_rate": 1.205370607110945e-05, + "loss": 0.0544, + "step": 14422 + }, + { + "epoch": 5.865392435949573, + "grad_norm": 0.9439062054402791, + "learning_rate": 1.2052714469208577e-05, + "loss": 0.0217, + "step": 14423 + }, + { + "epoch": 5.865799105327369, + "grad_norm": 7.632950223586302, + "learning_rate": 1.2051722846235561e-05, + "loss": 0.3271, + "step": 14424 + }, + { + "epoch": 5.866205774705165, + "grad_norm": 0.8166843575276145, + "learning_rate": 1.2050731202200576e-05, + "loss": 0.0107, + "step": 14425 + }, + { + "epoch": 5.866612444082961, + "grad_norm": 3.4112577801963577, + "learning_rate": 1.2049739537113801e-05, + "loss": 0.0737, + "step": 14426 + }, + { + "epoch": 5.867019113460756, + "grad_norm": 0.563268550527381, + "learning_rate": 1.2048747850985419e-05, + "loss": 0.0096, + "step": 14427 + }, + { + "epoch": 5.867425782838552, + "grad_norm": 6.646924034059354, + "learning_rate": 1.2047756143825611e-05, + "loss": 0.2371, + "step": 14428 + }, + { + "epoch": 5.867832452216348, + "grad_norm": 3.399755283798184, + "learning_rate": 1.2046764415644554e-05, + "loss": 0.0758, + "step": 14429 + }, + { + "epoch": 5.868239121594144, + "grad_norm": 10.079627317449384, + "learning_rate": 1.2045772666452429e-05, + "loss": 0.6574, + "step": 14430 + }, + { + "epoch": 5.868645790971939, + "grad_norm": 6.608301721381367, + "learning_rate": 1.2044780896259421e-05, + "loss": 0.1861, + "step": 14431 + }, + { + "epoch": 5.869052460349736, + "grad_norm": 2.328021049729032, + "learning_rate": 1.2043789105075706e-05, + "loss": 0.0329, + "step": 14432 + }, + { + "epoch": 5.869459129727532, + "grad_norm": 6.945896799399314, + "learning_rate": 1.2042797292911465e-05, + "loss": 0.1693, + "step": 14433 + }, + { + "epoch": 5.869865799105328, + "grad_norm": 9.85948828558064, + "learning_rate": 1.2041805459776886e-05, + "loss": 0.1441, + "step": 14434 + }, + { + "epoch": 5.870272468483123, + "grad_norm": 3.7857275016053853, + "learning_rate": 1.2040813605682145e-05, + "loss": 0.129, + "step": 14435 + }, + { + "epoch": 5.870679137860919, + "grad_norm": 0.3976076006978608, + "learning_rate": 1.2039821730637424e-05, + "loss": 0.0084, + "step": 14436 + }, + { + "epoch": 5.871085807238715, + "grad_norm": 6.9713990984695124, + "learning_rate": 1.2038829834652907e-05, + "loss": 0.1741, + "step": 14437 + }, + { + "epoch": 5.871492476616511, + "grad_norm": 6.961378551033161, + "learning_rate": 1.2037837917738777e-05, + "loss": 0.2785, + "step": 14438 + }, + { + "epoch": 5.871899145994306, + "grad_norm": 2.496365027746517, + "learning_rate": 1.2036845979905211e-05, + "loss": 0.1206, + "step": 14439 + }, + { + "epoch": 5.872305815372102, + "grad_norm": 11.426134585109448, + "learning_rate": 1.20358540211624e-05, + "loss": 0.5037, + "step": 14440 + }, + { + "epoch": 5.872712484749899, + "grad_norm": 15.335032247613052, + "learning_rate": 1.2034862041520523e-05, + "loss": 0.2776, + "step": 14441 + }, + { + "epoch": 5.873119154127695, + "grad_norm": 8.744988440306573, + "learning_rate": 1.2033870040989762e-05, + "loss": 0.2975, + "step": 14442 + }, + { + "epoch": 5.87352582350549, 
+ "grad_norm": 7.491115725549608, + "learning_rate": 1.20328780195803e-05, + "loss": 0.3285, + "step": 14443 + }, + { + "epoch": 5.873932492883286, + "grad_norm": 0.1141742278048473, + "learning_rate": 1.2031885977302322e-05, + "loss": 0.0017, + "step": 14444 + }, + { + "epoch": 5.874339162261082, + "grad_norm": 4.4262022323668235, + "learning_rate": 1.2030893914166015e-05, + "loss": 0.1072, + "step": 14445 + }, + { + "epoch": 5.874745831638878, + "grad_norm": 5.685540718938452, + "learning_rate": 1.2029901830181558e-05, + "loss": 0.1567, + "step": 14446 + }, + { + "epoch": 5.875152501016673, + "grad_norm": 0.10603370808091232, + "learning_rate": 1.2028909725359137e-05, + "loss": 0.0014, + "step": 14447 + }, + { + "epoch": 5.875559170394469, + "grad_norm": 5.766637074444349, + "learning_rate": 1.2027917599708937e-05, + "loss": 0.1169, + "step": 14448 + }, + { + "epoch": 5.875965839772265, + "grad_norm": 10.049515031001116, + "learning_rate": 1.202692545324114e-05, + "loss": 0.4691, + "step": 14449 + }, + { + "epoch": 5.876372509150061, + "grad_norm": 13.058373522976847, + "learning_rate": 1.2025933285965936e-05, + "loss": 1.0546, + "step": 14450 + }, + { + "epoch": 5.8767791785278565, + "grad_norm": 2.118822882465434, + "learning_rate": 1.2024941097893505e-05, + "loss": 0.0396, + "step": 14451 + }, + { + "epoch": 5.877185847905652, + "grad_norm": 5.110573596676008, + "learning_rate": 1.2023948889034035e-05, + "loss": 0.1106, + "step": 14452 + }, + { + "epoch": 5.877592517283449, + "grad_norm": 2.4180644332292953, + "learning_rate": 1.202295665939771e-05, + "loss": 0.0537, + "step": 14453 + }, + { + "epoch": 5.877999186661245, + "grad_norm": 0.5328027230841235, + "learning_rate": 1.202196440899472e-05, + "loss": 0.0087, + "step": 14454 + }, + { + "epoch": 5.87840585603904, + "grad_norm": 3.15560391885334, + "learning_rate": 1.2020972137835245e-05, + "loss": 0.0944, + "step": 14455 + }, + { + "epoch": 5.878812525416836, + "grad_norm": 7.025093526466605, + "learning_rate": 1.2019979845929474e-05, + "loss": 0.1719, + "step": 14456 + }, + { + "epoch": 5.879219194794632, + "grad_norm": 9.997678858367857, + "learning_rate": 1.2018987533287596e-05, + "loss": 0.3899, + "step": 14457 + }, + { + "epoch": 5.879625864172428, + "grad_norm": 6.688108386320192, + "learning_rate": 1.201799519991979e-05, + "loss": 0.1628, + "step": 14458 + }, + { + "epoch": 5.8800325335502235, + "grad_norm": 5.769380345692295, + "learning_rate": 1.2017002845836251e-05, + "loss": 0.1736, + "step": 14459 + }, + { + "epoch": 5.880439202928019, + "grad_norm": 0.09437156345694851, + "learning_rate": 1.2016010471047164e-05, + "loss": 0.0015, + "step": 14460 + }, + { + "epoch": 5.880845872305815, + "grad_norm": 3.124197128543743, + "learning_rate": 1.2015018075562712e-05, + "loss": 0.0605, + "step": 14461 + }, + { + "epoch": 5.881252541683612, + "grad_norm": 3.7077176667568996, + "learning_rate": 1.2014025659393085e-05, + "loss": 0.095, + "step": 14462 + }, + { + "epoch": 5.881659211061407, + "grad_norm": 1.7887205450068042, + "learning_rate": 1.2013033222548473e-05, + "loss": 0.0338, + "step": 14463 + }, + { + "epoch": 5.882065880439203, + "grad_norm": 4.782234757042271, + "learning_rate": 1.201204076503906e-05, + "loss": 0.1442, + "step": 14464 + }, + { + "epoch": 5.882472549816999, + "grad_norm": 8.719489192305812, + "learning_rate": 1.2011048286875038e-05, + "loss": 0.2643, + "step": 14465 + }, + { + "epoch": 5.882879219194795, + "grad_norm": 1.082375024365848, + "learning_rate": 1.2010055788066592e-05, + "loss": 0.0167, + 
"step": 14466 + }, + { + "epoch": 5.8832858885725905, + "grad_norm": 10.983682785125085, + "learning_rate": 1.2009063268623913e-05, + "loss": 0.7606, + "step": 14467 + }, + { + "epoch": 5.883692557950386, + "grad_norm": 11.471895905601377, + "learning_rate": 1.2008070728557186e-05, + "loss": 0.5596, + "step": 14468 + }, + { + "epoch": 5.884099227328182, + "grad_norm": 3.694926234153098, + "learning_rate": 1.2007078167876603e-05, + "loss": 0.0654, + "step": 14469 + }, + { + "epoch": 5.884505896705978, + "grad_norm": 2.7387062192700267, + "learning_rate": 1.2006085586592353e-05, + "loss": 0.0704, + "step": 14470 + }, + { + "epoch": 5.8849125660837736, + "grad_norm": 4.07198955980342, + "learning_rate": 1.2005092984714625e-05, + "loss": 0.0578, + "step": 14471 + }, + { + "epoch": 5.885319235461569, + "grad_norm": 4.030092638562168, + "learning_rate": 1.2004100362253609e-05, + "loss": 0.086, + "step": 14472 + }, + { + "epoch": 5.885725904839366, + "grad_norm": 2.5114864754051167, + "learning_rate": 1.2003107719219493e-05, + "loss": 0.0447, + "step": 14473 + }, + { + "epoch": 5.886132574217162, + "grad_norm": 4.729566609522533, + "learning_rate": 1.200211505562247e-05, + "loss": 0.1398, + "step": 14474 + }, + { + "epoch": 5.8865392435949575, + "grad_norm": 8.959280822885622, + "learning_rate": 1.2001122371472724e-05, + "loss": 0.601, + "step": 14475 + }, + { + "epoch": 5.886945912972753, + "grad_norm": 0.9423995460311813, + "learning_rate": 1.2000129666780453e-05, + "loss": 0.0294, + "step": 14476 + }, + { + "epoch": 5.887352582350549, + "grad_norm": 4.387615348578428, + "learning_rate": 1.1999136941555844e-05, + "loss": 0.0963, + "step": 14477 + }, + { + "epoch": 5.887759251728345, + "grad_norm": 5.180539628708794, + "learning_rate": 1.1998144195809087e-05, + "loss": 0.1314, + "step": 14478 + }, + { + "epoch": 5.888165921106141, + "grad_norm": 6.6916839912331145, + "learning_rate": 1.1997151429550376e-05, + "loss": 0.1962, + "step": 14479 + }, + { + "epoch": 5.888572590483936, + "grad_norm": 2.9477588593256687, + "learning_rate": 1.19961586427899e-05, + "loss": 0.0381, + "step": 14480 + }, + { + "epoch": 5.888979259861732, + "grad_norm": 0.09640961653957472, + "learning_rate": 1.199516583553785e-05, + "loss": 0.0016, + "step": 14481 + }, + { + "epoch": 5.889385929239529, + "grad_norm": 1.1085887690640825, + "learning_rate": 1.199417300780442e-05, + "loss": 0.018, + "step": 14482 + }, + { + "epoch": 5.8897925986173245, + "grad_norm": 0.9603501433720004, + "learning_rate": 1.19931801595998e-05, + "loss": 0.0159, + "step": 14483 + }, + { + "epoch": 5.89019926799512, + "grad_norm": 2.83691681268365, + "learning_rate": 1.199218729093418e-05, + "loss": 0.0355, + "step": 14484 + }, + { + "epoch": 5.890605937372916, + "grad_norm": 4.4878392547118935, + "learning_rate": 1.1991194401817758e-05, + "loss": 0.2056, + "step": 14485 + }, + { + "epoch": 5.891012606750712, + "grad_norm": 0.9317130613207341, + "learning_rate": 1.1990201492260723e-05, + "loss": 0.0135, + "step": 14486 + }, + { + "epoch": 5.891419276128508, + "grad_norm": 6.95832613166681, + "learning_rate": 1.1989208562273266e-05, + "loss": 0.3494, + "step": 14487 + }, + { + "epoch": 5.891825945506303, + "grad_norm": 7.045103504370751, + "learning_rate": 1.1988215611865584e-05, + "loss": 0.2159, + "step": 14488 + }, + { + "epoch": 5.892232614884099, + "grad_norm": 8.35203489130912, + "learning_rate": 1.1987222641047866e-05, + "loss": 0.2031, + "step": 14489 + }, + { + "epoch": 5.892639284261895, + "grad_norm": 10.601838780092532, + 
"learning_rate": 1.1986229649830309e-05, + "loss": 0.4195, + "step": 14490 + }, + { + "epoch": 5.893045953639691, + "grad_norm": 8.260786383574091, + "learning_rate": 1.1985236638223104e-05, + "loss": 0.1361, + "step": 14491 + }, + { + "epoch": 5.893452623017486, + "grad_norm": 0.4072858375289836, + "learning_rate": 1.1984243606236448e-05, + "loss": 0.0059, + "step": 14492 + }, + { + "epoch": 5.893859292395282, + "grad_norm": 7.192732623086941, + "learning_rate": 1.198325055388053e-05, + "loss": 0.1938, + "step": 14493 + }, + { + "epoch": 5.894265961773079, + "grad_norm": 4.685087676303404, + "learning_rate": 1.1982257481165547e-05, + "loss": 0.1198, + "step": 14494 + }, + { + "epoch": 5.894672631150875, + "grad_norm": 2.6128117896480223, + "learning_rate": 1.1981264388101697e-05, + "loss": 0.0526, + "step": 14495 + }, + { + "epoch": 5.89507930052867, + "grad_norm": 1.6920408495954478, + "learning_rate": 1.1980271274699168e-05, + "loss": 0.0255, + "step": 14496 + }, + { + "epoch": 5.895485969906466, + "grad_norm": 3.3384808634551364, + "learning_rate": 1.1979278140968158e-05, + "loss": 0.0366, + "step": 14497 + }, + { + "epoch": 5.895892639284262, + "grad_norm": 4.6772120671894335, + "learning_rate": 1.1978284986918863e-05, + "loss": 0.2085, + "step": 14498 + }, + { + "epoch": 5.896299308662058, + "grad_norm": 8.535323378805613, + "learning_rate": 1.1977291812561476e-05, + "loss": 0.2685, + "step": 14499 + }, + { + "epoch": 5.896705978039853, + "grad_norm": 0.43987375579062393, + "learning_rate": 1.197629861790619e-05, + "loss": 0.006, + "step": 14500 + }, + { + "epoch": 5.897112647417649, + "grad_norm": 7.42123120025318, + "learning_rate": 1.1975305402963208e-05, + "loss": 0.1264, + "step": 14501 + }, + { + "epoch": 5.897519316795445, + "grad_norm": 11.453715839992824, + "learning_rate": 1.1974312167742723e-05, + "loss": 0.5778, + "step": 14502 + }, + { + "epoch": 5.897925986173242, + "grad_norm": 9.718558524612904, + "learning_rate": 1.1973318912254925e-05, + "loss": 0.5199, + "step": 14503 + }, + { + "epoch": 5.898332655551037, + "grad_norm": 2.8433945099662044, + "learning_rate": 1.197232563651002e-05, + "loss": 0.0806, + "step": 14504 + }, + { + "epoch": 5.898739324928833, + "grad_norm": 8.020760051035927, + "learning_rate": 1.1971332340518198e-05, + "loss": 0.2285, + "step": 14505 + }, + { + "epoch": 5.899145994306629, + "grad_norm": 9.793202744768909, + "learning_rate": 1.1970339024289657e-05, + "loss": 0.5526, + "step": 14506 + }, + { + "epoch": 5.899552663684425, + "grad_norm": 12.823697663735004, + "learning_rate": 1.1969345687834593e-05, + "loss": 0.645, + "step": 14507 + }, + { + "epoch": 5.89995933306222, + "grad_norm": 6.7024178671037555, + "learning_rate": 1.1968352331163207e-05, + "loss": 0.1946, + "step": 14508 + }, + { + "epoch": 5.900366002440016, + "grad_norm": 6.199050940278446, + "learning_rate": 1.1967358954285692e-05, + "loss": 0.21, + "step": 14509 + }, + { + "epoch": 5.900772671817812, + "grad_norm": 5.495513800691557, + "learning_rate": 1.1966365557212248e-05, + "loss": 0.0917, + "step": 14510 + }, + { + "epoch": 5.901179341195608, + "grad_norm": 0.21569978381231844, + "learning_rate": 1.1965372139953071e-05, + "loss": 0.0036, + "step": 14511 + }, + { + "epoch": 5.9015860105734035, + "grad_norm": 1.5470164001031448, + "learning_rate": 1.1964378702518362e-05, + "loss": 0.0292, + "step": 14512 + }, + { + "epoch": 5.901992679951199, + "grad_norm": 5.607477911805685, + "learning_rate": 1.1963385244918313e-05, + "loss": 0.1859, + "step": 14513 + }, + { + "epoch": 
5.902399349328996, + "grad_norm": 4.264660769406102, + "learning_rate": 1.196239176716313e-05, + "loss": 0.0785, + "step": 14514 + }, + { + "epoch": 5.902806018706792, + "grad_norm": 19.121318936337047, + "learning_rate": 1.1961398269263007e-05, + "loss": 1.1832, + "step": 14515 + }, + { + "epoch": 5.903212688084587, + "grad_norm": 8.145842330707929, + "learning_rate": 1.1960404751228141e-05, + "loss": 0.2102, + "step": 14516 + }, + { + "epoch": 5.903619357462383, + "grad_norm": 0.4177581371139985, + "learning_rate": 1.1959411213068737e-05, + "loss": 0.0057, + "step": 14517 + }, + { + "epoch": 5.904026026840179, + "grad_norm": 0.14969912503101598, + "learning_rate": 1.195841765479499e-05, + "loss": 0.003, + "step": 14518 + }, + { + "epoch": 5.904432696217975, + "grad_norm": 1.4918840806222131, + "learning_rate": 1.19574240764171e-05, + "loss": 0.032, + "step": 14519 + }, + { + "epoch": 5.9048393655957705, + "grad_norm": 6.26421554375022, + "learning_rate": 1.1956430477945264e-05, + "loss": 0.1654, + "step": 14520 + }, + { + "epoch": 5.905246034973566, + "grad_norm": 2.27480297750264, + "learning_rate": 1.1955436859389688e-05, + "loss": 0.0384, + "step": 14521 + }, + { + "epoch": 5.905652704351362, + "grad_norm": 3.351341760162814, + "learning_rate": 1.1954443220760567e-05, + "loss": 0.0722, + "step": 14522 + }, + { + "epoch": 5.906059373729159, + "grad_norm": 5.482951651064217, + "learning_rate": 1.1953449562068103e-05, + "loss": 0.1376, + "step": 14523 + }, + { + "epoch": 5.9064660431069544, + "grad_norm": 6.147898523680505, + "learning_rate": 1.1952455883322496e-05, + "loss": 0.2862, + "step": 14524 + }, + { + "epoch": 5.90687271248475, + "grad_norm": 3.1828219001374998, + "learning_rate": 1.1951462184533949e-05, + "loss": 0.0637, + "step": 14525 + }, + { + "epoch": 5.907279381862546, + "grad_norm": 0.10907634224072506, + "learning_rate": 1.1950468465712658e-05, + "loss": 0.0011, + "step": 14526 + }, + { + "epoch": 5.907686051240342, + "grad_norm": 5.062474310183448, + "learning_rate": 1.1949474726868827e-05, + "loss": 0.1275, + "step": 14527 + }, + { + "epoch": 5.9080927206181375, + "grad_norm": 7.749262298287019, + "learning_rate": 1.1948480968012657e-05, + "loss": 0.1383, + "step": 14528 + }, + { + "epoch": 5.908499389995933, + "grad_norm": 0.45403789312169257, + "learning_rate": 1.1947487189154346e-05, + "loss": 0.0079, + "step": 14529 + }, + { + "epoch": 5.908906059373729, + "grad_norm": 2.5137298037486757, + "learning_rate": 1.19464933903041e-05, + "loss": 0.0356, + "step": 14530 + }, + { + "epoch": 5.909312728751525, + "grad_norm": 0.4193650218218792, + "learning_rate": 1.1945499571472125e-05, + "loss": 0.0049, + "step": 14531 + }, + { + "epoch": 5.909719398129321, + "grad_norm": 5.647107007860659, + "learning_rate": 1.1944505732668615e-05, + "loss": 0.0654, + "step": 14532 + }, + { + "epoch": 5.910126067507116, + "grad_norm": 1.3975386188794119, + "learning_rate": 1.1943511873903772e-05, + "loss": 0.0216, + "step": 14533 + }, + { + "epoch": 5.910532736884912, + "grad_norm": 7.708986605718986, + "learning_rate": 1.1942517995187803e-05, + "loss": 0.2217, + "step": 14534 + }, + { + "epoch": 5.910939406262709, + "grad_norm": 8.966578577932792, + "learning_rate": 1.1941524096530912e-05, + "loss": 0.2315, + "step": 14535 + }, + { + "epoch": 5.9113460756405045, + "grad_norm": 5.788514919612785, + "learning_rate": 1.1940530177943295e-05, + "loss": 0.1426, + "step": 14536 + }, + { + "epoch": 5.9117527450183, + "grad_norm": 7.984768440683149, + "learning_rate": 1.193953623943516e-05, + 
"loss": 0.1635, + "step": 14537 + }, + { + "epoch": 5.912159414396096, + "grad_norm": 8.61229749914575, + "learning_rate": 1.1938542281016712e-05, + "loss": 0.432, + "step": 14538 + }, + { + "epoch": 5.912566083773892, + "grad_norm": 0.15053740702437352, + "learning_rate": 1.193754830269815e-05, + "loss": 0.0019, + "step": 14539 + }, + { + "epoch": 5.912972753151688, + "grad_norm": 6.691325707268902, + "learning_rate": 1.1936554304489679e-05, + "loss": 0.2186, + "step": 14540 + }, + { + "epoch": 5.913379422529483, + "grad_norm": 1.03523354070985, + "learning_rate": 1.1935560286401504e-05, + "loss": 0.0161, + "step": 14541 + }, + { + "epoch": 5.913786091907279, + "grad_norm": 0.5565694988718712, + "learning_rate": 1.1934566248443828e-05, + "loss": 0.0085, + "step": 14542 + }, + { + "epoch": 5.914192761285075, + "grad_norm": 1.7245294262053177, + "learning_rate": 1.1933572190626856e-05, + "loss": 0.0578, + "step": 14543 + }, + { + "epoch": 5.9145994306628715, + "grad_norm": 0.03334256354993791, + "learning_rate": 1.1932578112960793e-05, + "loss": 0.0006, + "step": 14544 + }, + { + "epoch": 5.915006100040667, + "grad_norm": 2.3966691216807994, + "learning_rate": 1.1931584015455843e-05, + "loss": 0.0319, + "step": 14545 + }, + { + "epoch": 5.915412769418463, + "grad_norm": 4.643508744438724, + "learning_rate": 1.1930589898122211e-05, + "loss": 0.2174, + "step": 14546 + }, + { + "epoch": 5.915819438796259, + "grad_norm": 10.148028207987378, + "learning_rate": 1.1929595760970103e-05, + "loss": 0.2562, + "step": 14547 + }, + { + "epoch": 5.916226108174055, + "grad_norm": 6.754025758978837, + "learning_rate": 1.1928601604009722e-05, + "loss": 0.18, + "step": 14548 + }, + { + "epoch": 5.91663277755185, + "grad_norm": 1.803596733619911, + "learning_rate": 1.1927607427251276e-05, + "loss": 0.0238, + "step": 14549 + }, + { + "epoch": 5.917039446929646, + "grad_norm": 8.259991447682502, + "learning_rate": 1.1926613230704969e-05, + "loss": 0.3914, + "step": 14550 + }, + { + "epoch": 5.917446116307442, + "grad_norm": 10.128966653300232, + "learning_rate": 1.1925619014381007e-05, + "loss": 0.3596, + "step": 14551 + }, + { + "epoch": 5.917852785685238, + "grad_norm": 5.587242015487532, + "learning_rate": 1.1924624778289595e-05, + "loss": 0.348, + "step": 14552 + }, + { + "epoch": 5.918259455063033, + "grad_norm": 2.091129386899112, + "learning_rate": 1.1923630522440943e-05, + "loss": 0.0396, + "step": 14553 + }, + { + "epoch": 5.918666124440829, + "grad_norm": 8.903310694474364, + "learning_rate": 1.1922636246845255e-05, + "loss": 0.3072, + "step": 14554 + }, + { + "epoch": 5.919072793818626, + "grad_norm": 1.3552159369197558, + "learning_rate": 1.1921641951512737e-05, + "loss": 0.0165, + "step": 14555 + }, + { + "epoch": 5.919479463196422, + "grad_norm": 5.09318498482883, + "learning_rate": 1.19206476364536e-05, + "loss": 0.0997, + "step": 14556 + }, + { + "epoch": 5.919886132574217, + "grad_norm": 11.85111385846649, + "learning_rate": 1.1919653301678047e-05, + "loss": 0.518, + "step": 14557 + }, + { + "epoch": 5.920292801952013, + "grad_norm": 2.901391838449939, + "learning_rate": 1.1918658947196287e-05, + "loss": 0.078, + "step": 14558 + }, + { + "epoch": 5.920699471329809, + "grad_norm": 10.816329799421874, + "learning_rate": 1.1917664573018528e-05, + "loss": 0.6575, + "step": 14559 + }, + { + "epoch": 5.921106140707605, + "grad_norm": 13.274012999943544, + "learning_rate": 1.1916670179154977e-05, + "loss": 0.3953, + "step": 14560 + }, + { + "epoch": 5.9215128100854, + "grad_norm": 0.47972965393568273, 
+ "learning_rate": 1.1915675765615837e-05, + "loss": 0.0071, + "step": 14561 + }, + { + "epoch": 5.921919479463196, + "grad_norm": 7.093670579911022, + "learning_rate": 1.1914681332411327e-05, + "loss": 0.4385, + "step": 14562 + }, + { + "epoch": 5.922326148840992, + "grad_norm": 6.514637054870296, + "learning_rate": 1.191368687955165e-05, + "loss": 0.1035, + "step": 14563 + }, + { + "epoch": 5.922732818218789, + "grad_norm": 8.205376642558832, + "learning_rate": 1.191269240704701e-05, + "loss": 0.1858, + "step": 14564 + }, + { + "epoch": 5.923139487596584, + "grad_norm": 0.8447854229128952, + "learning_rate": 1.191169791490762e-05, + "loss": 0.0128, + "step": 14565 + }, + { + "epoch": 5.92354615697438, + "grad_norm": 0.10391177525706027, + "learning_rate": 1.191070340314369e-05, + "loss": 0.002, + "step": 14566 + }, + { + "epoch": 5.923952826352176, + "grad_norm": 0.1279346695291001, + "learning_rate": 1.190970887176543e-05, + "loss": 0.0024, + "step": 14567 + }, + { + "epoch": 5.924359495729972, + "grad_norm": 2.0339952591064, + "learning_rate": 1.1908714320783046e-05, + "loss": 0.0278, + "step": 14568 + }, + { + "epoch": 5.924766165107767, + "grad_norm": 7.6402814893695075, + "learning_rate": 1.1907719750206747e-05, + "loss": 0.3093, + "step": 14569 + }, + { + "epoch": 5.925172834485563, + "grad_norm": 4.576824902990316, + "learning_rate": 1.1906725160046747e-05, + "loss": 0.0904, + "step": 14570 + }, + { + "epoch": 5.925579503863359, + "grad_norm": 0.6229867025016963, + "learning_rate": 1.1905730550313253e-05, + "loss": 0.0104, + "step": 14571 + }, + { + "epoch": 5.925986173241155, + "grad_norm": 1.894502344814251, + "learning_rate": 1.1904735921016475e-05, + "loss": 0.0293, + "step": 14572 + }, + { + "epoch": 5.9263928426189505, + "grad_norm": 1.5220944969777623, + "learning_rate": 1.1903741272166624e-05, + "loss": 0.0222, + "step": 14573 + }, + { + "epoch": 5.926799511996746, + "grad_norm": 5.802915398885477, + "learning_rate": 1.1902746603773912e-05, + "loss": 0.1614, + "step": 14574 + }, + { + "epoch": 5.927206181374542, + "grad_norm": 3.1481121915106463, + "learning_rate": 1.1901751915848548e-05, + "loss": 0.0669, + "step": 14575 + }, + { + "epoch": 5.927612850752339, + "grad_norm": 15.539942815709487, + "learning_rate": 1.1900757208400741e-05, + "loss": 0.6549, + "step": 14576 + }, + { + "epoch": 5.9280195201301344, + "grad_norm": 0.12114305043210431, + "learning_rate": 1.1899762481440708e-05, + "loss": 0.0019, + "step": 14577 + }, + { + "epoch": 5.92842618950793, + "grad_norm": 6.951645592153579, + "learning_rate": 1.1898767734978655e-05, + "loss": 0.1428, + "step": 14578 + }, + { + "epoch": 5.928832858885726, + "grad_norm": 11.956923879996875, + "learning_rate": 1.1897772969024797e-05, + "loss": 0.9375, + "step": 14579 + }, + { + "epoch": 5.929239528263522, + "grad_norm": 3.7610366731095155, + "learning_rate": 1.1896778183589346e-05, + "loss": 0.0692, + "step": 14580 + }, + { + "epoch": 5.9296461976413175, + "grad_norm": 8.432432785569187, + "learning_rate": 1.1895783378682509e-05, + "loss": 0.2684, + "step": 14581 + }, + { + "epoch": 5.930052867019113, + "grad_norm": 11.314013698637527, + "learning_rate": 1.1894788554314502e-05, + "loss": 0.4172, + "step": 14582 + }, + { + "epoch": 5.930459536396909, + "grad_norm": 4.0932396974088405, + "learning_rate": 1.1893793710495538e-05, + "loss": 0.1148, + "step": 14583 + }, + { + "epoch": 5.930866205774705, + "grad_norm": 0.564731991275368, + "learning_rate": 1.1892798847235827e-05, + "loss": 0.0118, + "step": 14584 + }, + { + "epoch": 
5.9312728751525015, + "grad_norm": 1.7169541676475268, + "learning_rate": 1.1891803964545582e-05, + "loss": 0.0384, + "step": 14585 + }, + { + "epoch": 5.931679544530297, + "grad_norm": 3.642747341053019, + "learning_rate": 1.1890809062435023e-05, + "loss": 0.053, + "step": 14586 + }, + { + "epoch": 5.932086213908093, + "grad_norm": 14.272335820101526, + "learning_rate": 1.188981414091435e-05, + "loss": 0.2663, + "step": 14587 + }, + { + "epoch": 5.932492883285889, + "grad_norm": 12.391363089166239, + "learning_rate": 1.1888819199993788e-05, + "loss": 0.4645, + "step": 14588 + }, + { + "epoch": 5.9328995526636845, + "grad_norm": 3.9465210265170283, + "learning_rate": 1.1887824239683548e-05, + "loss": 0.0686, + "step": 14589 + }, + { + "epoch": 5.93330622204148, + "grad_norm": 13.363171676525496, + "learning_rate": 1.188682925999384e-05, + "loss": 0.1862, + "step": 14590 + }, + { + "epoch": 5.933712891419276, + "grad_norm": 2.6417984362023272, + "learning_rate": 1.1885834260934876e-05, + "loss": 0.0333, + "step": 14591 + }, + { + "epoch": 5.934119560797072, + "grad_norm": 1.18793595356453, + "learning_rate": 1.188483924251688e-05, + "loss": 0.0211, + "step": 14592 + }, + { + "epoch": 5.934526230174868, + "grad_norm": 22.864661404332296, + "learning_rate": 1.1883844204750059e-05, + "loss": 0.5679, + "step": 14593 + }, + { + "epoch": 5.934932899552663, + "grad_norm": 7.390369432664882, + "learning_rate": 1.1882849147644626e-05, + "loss": 0.2089, + "step": 14594 + }, + { + "epoch": 5.935339568930459, + "grad_norm": 4.52868962681744, + "learning_rate": 1.1881854071210805e-05, + "loss": 0.1493, + "step": 14595 + }, + { + "epoch": 5.935746238308256, + "grad_norm": 1.7040080750031161, + "learning_rate": 1.1880858975458802e-05, + "loss": 0.0483, + "step": 14596 + }, + { + "epoch": 5.9361529076860515, + "grad_norm": 5.135501734535266, + "learning_rate": 1.1879863860398832e-05, + "loss": 0.1631, + "step": 14597 + }, + { + "epoch": 5.936559577063847, + "grad_norm": 9.909496940458304, + "learning_rate": 1.1878868726041118e-05, + "loss": 0.3422, + "step": 14598 + }, + { + "epoch": 5.936966246441643, + "grad_norm": 9.481609533618302, + "learning_rate": 1.1877873572395868e-05, + "loss": 0.3894, + "step": 14599 + }, + { + "epoch": 5.937372915819439, + "grad_norm": 1.6378829270189375, + "learning_rate": 1.1876878399473299e-05, + "loss": 0.0401, + "step": 14600 + }, + { + "epoch": 5.937779585197235, + "grad_norm": 0.4579293911697854, + "learning_rate": 1.1875883207283632e-05, + "loss": 0.0066, + "step": 14601 + }, + { + "epoch": 5.93818625457503, + "grad_norm": 8.676280240261551, + "learning_rate": 1.187488799583708e-05, + "loss": 0.2872, + "step": 14602 + }, + { + "epoch": 5.938592923952826, + "grad_norm": 1.0861893133997245, + "learning_rate": 1.1873892765143855e-05, + "loss": 0.0039, + "step": 14603 + }, + { + "epoch": 5.938999593330622, + "grad_norm": 0.8634971767933407, + "learning_rate": 1.1872897515214181e-05, + "loss": 0.0226, + "step": 14604 + }, + { + "epoch": 5.9394062627084185, + "grad_norm": 4.409879286024641, + "learning_rate": 1.1871902246058271e-05, + "loss": 0.2038, + "step": 14605 + }, + { + "epoch": 5.939812932086214, + "grad_norm": 5.945713937632214, + "learning_rate": 1.1870906957686341e-05, + "loss": 0.158, + "step": 14606 + }, + { + "epoch": 5.94021960146401, + "grad_norm": 8.421737134157583, + "learning_rate": 1.1869911650108612e-05, + "loss": 0.2329, + "step": 14607 + }, + { + "epoch": 5.940626270841806, + "grad_norm": 0.7380360989148853, + "learning_rate": 1.1868916323335298e-05, + 
"loss": 0.0109, + "step": 14608 + }, + { + "epoch": 5.941032940219602, + "grad_norm": 1.5412179981876546, + "learning_rate": 1.1867920977376617e-05, + "loss": 0.0247, + "step": 14609 + }, + { + "epoch": 5.941439609597397, + "grad_norm": 0.1192749652828199, + "learning_rate": 1.1866925612242785e-05, + "loss": 0.0018, + "step": 14610 + }, + { + "epoch": 5.941846278975193, + "grad_norm": 4.819656473107657, + "learning_rate": 1.1865930227944025e-05, + "loss": 0.0919, + "step": 14611 + }, + { + "epoch": 5.942252948352989, + "grad_norm": 0.7460840545081753, + "learning_rate": 1.186493482449055e-05, + "loss": 0.0144, + "step": 14612 + }, + { + "epoch": 5.942659617730785, + "grad_norm": 0.0956427629846373, + "learning_rate": 1.1863939401892581e-05, + "loss": 0.0019, + "step": 14613 + }, + { + "epoch": 5.94306628710858, + "grad_norm": 6.498366267642124, + "learning_rate": 1.1862943960160335e-05, + "loss": 0.1683, + "step": 14614 + }, + { + "epoch": 5.943472956486376, + "grad_norm": 1.0431362313024448, + "learning_rate": 1.1861948499304034e-05, + "loss": 0.0157, + "step": 14615 + }, + { + "epoch": 5.943879625864172, + "grad_norm": 9.340764792312715, + "learning_rate": 1.1860953019333893e-05, + "loss": 0.6121, + "step": 14616 + }, + { + "epoch": 5.944286295241969, + "grad_norm": 3.76117483537651, + "learning_rate": 1.1859957520260132e-05, + "loss": 0.0698, + "step": 14617 + }, + { + "epoch": 5.944692964619764, + "grad_norm": 5.963088658329299, + "learning_rate": 1.1858962002092973e-05, + "loss": 0.1345, + "step": 14618 + }, + { + "epoch": 5.94509963399756, + "grad_norm": 9.115168141495712, + "learning_rate": 1.1857966464842631e-05, + "loss": 0.2521, + "step": 14619 + }, + { + "epoch": 5.945506303375356, + "grad_norm": 5.625723408997788, + "learning_rate": 1.1856970908519331e-05, + "loss": 0.1572, + "step": 14620 + }, + { + "epoch": 5.945912972753152, + "grad_norm": 1.0843743610389858, + "learning_rate": 1.1855975333133289e-05, + "loss": 0.0172, + "step": 14621 + }, + { + "epoch": 5.946319642130947, + "grad_norm": 0.3476509994056618, + "learning_rate": 1.1854979738694728e-05, + "loss": 0.0043, + "step": 14622 + }, + { + "epoch": 5.946726311508743, + "grad_norm": 6.920660714854244, + "learning_rate": 1.1853984125213862e-05, + "loss": 0.143, + "step": 14623 + }, + { + "epoch": 5.947132980886539, + "grad_norm": 5.304579554758346, + "learning_rate": 1.1852988492700919e-05, + "loss": 0.1939, + "step": 14624 + }, + { + "epoch": 5.947539650264335, + "grad_norm": 4.4849202974253455, + "learning_rate": 1.1851992841166115e-05, + "loss": 0.0746, + "step": 14625 + }, + { + "epoch": 5.947946319642131, + "grad_norm": 8.039124739519973, + "learning_rate": 1.1850997170619672e-05, + "loss": 0.2392, + "step": 14626 + }, + { + "epoch": 5.948352989019927, + "grad_norm": 8.149727216158505, + "learning_rate": 1.1850001481071813e-05, + "loss": 0.2353, + "step": 14627 + }, + { + "epoch": 5.948759658397723, + "grad_norm": 3.059082341403443, + "learning_rate": 1.1849005772532757e-05, + "loss": 0.0674, + "step": 14628 + }, + { + "epoch": 5.949166327775519, + "grad_norm": 2.4374337917222597, + "learning_rate": 1.1848010045012725e-05, + "loss": 0.0386, + "step": 14629 + }, + { + "epoch": 5.9495729971533144, + "grad_norm": 0.5236618057478867, + "learning_rate": 1.1847014298521941e-05, + "loss": 0.0082, + "step": 14630 + }, + { + "epoch": 5.94997966653111, + "grad_norm": 4.231907446025537, + "learning_rate": 1.1846018533070626e-05, + "loss": 0.0838, + "step": 14631 + }, + { + "epoch": 5.950386335908906, + "grad_norm": 
9.710097023367965, + "learning_rate": 1.1845022748669001e-05, + "loss": 0.5114, + "step": 14632 + }, + { + "epoch": 5.950793005286702, + "grad_norm": 3.926390427251682, + "learning_rate": 1.1844026945327289e-05, + "loss": 0.081, + "step": 14633 + }, + { + "epoch": 5.9511996746644975, + "grad_norm": 3.565154453119106, + "learning_rate": 1.1843031123055715e-05, + "loss": 0.0608, + "step": 14634 + }, + { + "epoch": 5.951606344042293, + "grad_norm": 0.8634039480360644, + "learning_rate": 1.1842035281864496e-05, + "loss": 0.0124, + "step": 14635 + }, + { + "epoch": 5.952013013420089, + "grad_norm": 5.9873335138739785, + "learning_rate": 1.1841039421763858e-05, + "loss": 0.169, + "step": 14636 + }, + { + "epoch": 5.952419682797886, + "grad_norm": 8.034086191169592, + "learning_rate": 1.1840043542764024e-05, + "loss": 0.3313, + "step": 14637 + }, + { + "epoch": 5.9528263521756815, + "grad_norm": 7.855260416347934, + "learning_rate": 1.1839047644875219e-05, + "loss": 0.3016, + "step": 14638 + }, + { + "epoch": 5.953233021553477, + "grad_norm": 0.023517845508213297, + "learning_rate": 1.183805172810766e-05, + "loss": 0.0004, + "step": 14639 + }, + { + "epoch": 5.953639690931273, + "grad_norm": 0.6372632450738499, + "learning_rate": 1.1837055792471577e-05, + "loss": 0.0109, + "step": 14640 + }, + { + "epoch": 5.954046360309069, + "grad_norm": 5.687982384987041, + "learning_rate": 1.1836059837977194e-05, + "loss": 0.1356, + "step": 14641 + }, + { + "epoch": 5.9544530296868645, + "grad_norm": 3.4499913518951013, + "learning_rate": 1.1835063864634729e-05, + "loss": 0.0698, + "step": 14642 + }, + { + "epoch": 5.95485969906466, + "grad_norm": 9.138402545704988, + "learning_rate": 1.183406787245441e-05, + "loss": 0.4798, + "step": 14643 + }, + { + "epoch": 5.955266368442456, + "grad_norm": 2.584547440855682, + "learning_rate": 1.1833071861446467e-05, + "loss": 0.0782, + "step": 14644 + }, + { + "epoch": 5.955673037820252, + "grad_norm": 5.1599396484917985, + "learning_rate": 1.1832075831621113e-05, + "loss": 0.0856, + "step": 14645 + }, + { + "epoch": 5.9560797071980485, + "grad_norm": 12.902422334848627, + "learning_rate": 1.1831079782988581e-05, + "loss": 0.3431, + "step": 14646 + }, + { + "epoch": 5.956486376575844, + "grad_norm": 1.5060910115640442, + "learning_rate": 1.1830083715559093e-05, + "loss": 0.0283, + "step": 14647 + }, + { + "epoch": 5.95689304595364, + "grad_norm": 2.1662987385838925, + "learning_rate": 1.1829087629342872e-05, + "loss": 0.0454, + "step": 14648 + }, + { + "epoch": 5.957299715331436, + "grad_norm": 2.4251621810977753, + "learning_rate": 1.1828091524350149e-05, + "loss": 0.1278, + "step": 14649 + }, + { + "epoch": 5.9577063847092315, + "grad_norm": 7.811294123806468, + "learning_rate": 1.1827095400591146e-05, + "loss": 0.2243, + "step": 14650 + }, + { + "epoch": 5.958113054087027, + "grad_norm": 0.10959766415888587, + "learning_rate": 1.182609925807609e-05, + "loss": 0.0022, + "step": 14651 + }, + { + "epoch": 5.958519723464823, + "grad_norm": 5.145562257870536, + "learning_rate": 1.1825103096815203e-05, + "loss": 0.1217, + "step": 14652 + }, + { + "epoch": 5.958926392842619, + "grad_norm": 1.139589786297197, + "learning_rate": 1.1824106916818718e-05, + "loss": 0.0262, + "step": 14653 + }, + { + "epoch": 5.959333062220415, + "grad_norm": 1.3225806581518111, + "learning_rate": 1.1823110718096854e-05, + "loss": 0.0215, + "step": 14654 + }, + { + "epoch": 5.95973973159821, + "grad_norm": 0.4119100342493623, + "learning_rate": 1.1822114500659843e-05, + "loss": 0.0078, + "step": 
14655 + }, + { + "epoch": 5.960146400976006, + "grad_norm": 0.09707700345968799, + "learning_rate": 1.1821118264517909e-05, + "loss": 0.0013, + "step": 14656 + }, + { + "epoch": 5.960553070353802, + "grad_norm": 0.9704879749364413, + "learning_rate": 1.1820122009681279e-05, + "loss": 0.016, + "step": 14657 + }, + { + "epoch": 5.9609597397315985, + "grad_norm": 1.2585979627399118, + "learning_rate": 1.181912573616018e-05, + "loss": 0.026, + "step": 14658 + }, + { + "epoch": 5.961366409109394, + "grad_norm": 7.619098709174988, + "learning_rate": 1.181812944396484e-05, + "loss": 0.1669, + "step": 14659 + }, + { + "epoch": 5.96177307848719, + "grad_norm": 0.514979613898919, + "learning_rate": 1.181713313310549e-05, + "loss": 0.007, + "step": 14660 + }, + { + "epoch": 5.962179747864986, + "grad_norm": 25.09897855251364, + "learning_rate": 1.1816136803592348e-05, + "loss": 0.2553, + "step": 14661 + }, + { + "epoch": 5.962586417242782, + "grad_norm": 8.64723145241272, + "learning_rate": 1.1815140455435649e-05, + "loss": 0.2818, + "step": 14662 + }, + { + "epoch": 5.962993086620577, + "grad_norm": 0.1979020001245677, + "learning_rate": 1.1814144088645623e-05, + "loss": 0.0035, + "step": 14663 + }, + { + "epoch": 5.963399755998373, + "grad_norm": 1.6182531893492549, + "learning_rate": 1.1813147703232493e-05, + "loss": 0.0289, + "step": 14664 + }, + { + "epoch": 5.963806425376169, + "grad_norm": 6.0183921317264275, + "learning_rate": 1.1812151299206489e-05, + "loss": 0.1292, + "step": 14665 + }, + { + "epoch": 5.964213094753965, + "grad_norm": 6.774758833181433, + "learning_rate": 1.1811154876577842e-05, + "loss": 0.2026, + "step": 14666 + }, + { + "epoch": 5.964619764131761, + "grad_norm": 7.781194776758942, + "learning_rate": 1.1810158435356775e-05, + "loss": 0.172, + "step": 14667 + }, + { + "epoch": 5.965026433509557, + "grad_norm": 0.40103247037017187, + "learning_rate": 1.1809161975553523e-05, + "loss": 0.0061, + "step": 14668 + }, + { + "epoch": 5.965433102887353, + "grad_norm": 7.045002507014591, + "learning_rate": 1.1808165497178312e-05, + "loss": 0.4242, + "step": 14669 + }, + { + "epoch": 5.965839772265149, + "grad_norm": 7.764598789123526, + "learning_rate": 1.1807169000241372e-05, + "loss": 0.2118, + "step": 14670 + }, + { + "epoch": 5.966246441642944, + "grad_norm": 1.2786681101982282, + "learning_rate": 1.1806172484752931e-05, + "loss": 0.0237, + "step": 14671 + }, + { + "epoch": 5.96665311102074, + "grad_norm": 7.3196045816069315, + "learning_rate": 1.1805175950723224e-05, + "loss": 0.2418, + "step": 14672 + }, + { + "epoch": 5.967059780398536, + "grad_norm": 5.027999832525858, + "learning_rate": 1.1804179398162476e-05, + "loss": 0.2706, + "step": 14673 + }, + { + "epoch": 5.967466449776332, + "grad_norm": 3.552140608315915, + "learning_rate": 1.1803182827080917e-05, + "loss": 0.0822, + "step": 14674 + }, + { + "epoch": 5.967873119154127, + "grad_norm": 8.484877256351298, + "learning_rate": 1.1802186237488782e-05, + "loss": 0.2636, + "step": 14675 + }, + { + "epoch": 5.968279788531923, + "grad_norm": 2.445409443795275, + "learning_rate": 1.1801189629396296e-05, + "loss": 0.0428, + "step": 14676 + }, + { + "epoch": 5.968686457909719, + "grad_norm": 22.7352445911306, + "learning_rate": 1.1800193002813691e-05, + "loss": 0.3656, + "step": 14677 + }, + { + "epoch": 5.969093127287516, + "grad_norm": 9.040617022789377, + "learning_rate": 1.1799196357751199e-05, + "loss": 0.4277, + "step": 14678 + }, + { + "epoch": 5.969499796665311, + "grad_norm": 6.6573295725188615, + "learning_rate": 
1.1798199694219053e-05, + "loss": 0.08, + "step": 14679 + }, + { + "epoch": 5.969906466043107, + "grad_norm": 0.39642603623124384, + "learning_rate": 1.1797203012227482e-05, + "loss": 0.007, + "step": 14680 + }, + { + "epoch": 5.970313135420903, + "grad_norm": 0.8582537255334248, + "learning_rate": 1.1796206311786716e-05, + "loss": 0.0121, + "step": 14681 + }, + { + "epoch": 5.970719804798699, + "grad_norm": 0.6514222565630788, + "learning_rate": 1.179520959290699e-05, + "loss": 0.0142, + "step": 14682 + }, + { + "epoch": 5.9711264741764944, + "grad_norm": 11.131797467856419, + "learning_rate": 1.1794212855598533e-05, + "loss": 0.1005, + "step": 14683 + }, + { + "epoch": 5.97153314355429, + "grad_norm": 9.248538409849255, + "learning_rate": 1.1793216099871576e-05, + "loss": 0.2703, + "step": 14684 + }, + { + "epoch": 5.971939812932086, + "grad_norm": 7.5314013063547876, + "learning_rate": 1.1792219325736355e-05, + "loss": 0.1549, + "step": 14685 + }, + { + "epoch": 5.972346482309882, + "grad_norm": 2.3133894820227017, + "learning_rate": 1.1791222533203102e-05, + "loss": 0.0604, + "step": 14686 + }, + { + "epoch": 5.972753151687678, + "grad_norm": 0.9158381301761409, + "learning_rate": 1.1790225722282044e-05, + "loss": 0.0119, + "step": 14687 + }, + { + "epoch": 5.973159821065474, + "grad_norm": 9.796484520181332, + "learning_rate": 1.1789228892983421e-05, + "loss": 0.2691, + "step": 14688 + }, + { + "epoch": 5.97356649044327, + "grad_norm": 0.07626351638922622, + "learning_rate": 1.1788232045317465e-05, + "loss": 0.0012, + "step": 14689 + }, + { + "epoch": 5.973973159821066, + "grad_norm": 1.2479407503411033, + "learning_rate": 1.1787235179294403e-05, + "loss": 0.0199, + "step": 14690 + }, + { + "epoch": 5.9743798291988615, + "grad_norm": 3.6393034789181042, + "learning_rate": 1.1786238294924475e-05, + "loss": 0.0664, + "step": 14691 + }, + { + "epoch": 5.974786498576657, + "grad_norm": 0.5296282256134369, + "learning_rate": 1.1785241392217909e-05, + "loss": 0.0078, + "step": 14692 + }, + { + "epoch": 5.975193167954453, + "grad_norm": 7.433370166013061, + "learning_rate": 1.1784244471184947e-05, + "loss": 0.3784, + "step": 14693 + }, + { + "epoch": 5.975599837332249, + "grad_norm": 0.4272260118005359, + "learning_rate": 1.1783247531835814e-05, + "loss": 0.0061, + "step": 14694 + }, + { + "epoch": 5.9760065067100445, + "grad_norm": 2.7217215602856237, + "learning_rate": 1.1782250574180749e-05, + "loss": 0.038, + "step": 14695 + }, + { + "epoch": 5.97641317608784, + "grad_norm": 2.80198871587563, + "learning_rate": 1.1781253598229982e-05, + "loss": 0.035, + "step": 14696 + }, + { + "epoch": 5.976819845465636, + "grad_norm": 9.436324666111013, + "learning_rate": 1.178025660399375e-05, + "loss": 0.2, + "step": 14697 + }, + { + "epoch": 5.977226514843432, + "grad_norm": 1.567560439873886, + "learning_rate": 1.1779259591482293e-05, + "loss": 0.0281, + "step": 14698 + }, + { + "epoch": 5.9776331842212285, + "grad_norm": 4.553607412798688, + "learning_rate": 1.1778262560705836e-05, + "loss": 0.1575, + "step": 14699 + }, + { + "epoch": 5.978039853599024, + "grad_norm": 12.500385211731103, + "learning_rate": 1.177726551167462e-05, + "loss": 0.5707, + "step": 14700 + }, + { + "epoch": 5.97844652297682, + "grad_norm": 14.033976177658923, + "learning_rate": 1.1776268444398882e-05, + "loss": 1.0162, + "step": 14701 + }, + { + "epoch": 5.978853192354616, + "grad_norm": 0.07789928066846269, + "learning_rate": 1.1775271358888852e-05, + "loss": 0.0012, + "step": 14702 + }, + { + "epoch": 5.9792598617324115, 
+ "grad_norm": 8.629008946599853, + "learning_rate": 1.1774274255154768e-05, + "loss": 0.0805, + "step": 14703 + }, + { + "epoch": 5.979666531110207, + "grad_norm": 3.5546670901975426, + "learning_rate": 1.1773277133206866e-05, + "loss": 0.0832, + "step": 14704 + }, + { + "epoch": 5.980073200488003, + "grad_norm": 6.764728045810859, + "learning_rate": 1.1772279993055383e-05, + "loss": 0.119, + "step": 14705 + }, + { + "epoch": 5.980479869865799, + "grad_norm": 0.09362535842477332, + "learning_rate": 1.1771282834710548e-05, + "loss": 0.0017, + "step": 14706 + }, + { + "epoch": 5.980886539243595, + "grad_norm": 7.682486788917247, + "learning_rate": 1.177028565818261e-05, + "loss": 0.3969, + "step": 14707 + }, + { + "epoch": 5.981293208621391, + "grad_norm": 7.146605784476195, + "learning_rate": 1.1769288463481797e-05, + "loss": 0.1752, + "step": 14708 + }, + { + "epoch": 5.981699877999187, + "grad_norm": 4.307991670596957, + "learning_rate": 1.1768291250618344e-05, + "loss": 0.0664, + "step": 14709 + }, + { + "epoch": 5.982106547376983, + "grad_norm": 8.912042280216701, + "learning_rate": 1.1767294019602496e-05, + "loss": 0.1061, + "step": 14710 + }, + { + "epoch": 5.9825132167547785, + "grad_norm": 8.14898281202029, + "learning_rate": 1.1766296770444486e-05, + "loss": 0.2336, + "step": 14711 + }, + { + "epoch": 5.982919886132574, + "grad_norm": 0.3556843988958416, + "learning_rate": 1.1765299503154548e-05, + "loss": 0.0061, + "step": 14712 + }, + { + "epoch": 5.98332655551037, + "grad_norm": 5.662742061110852, + "learning_rate": 1.1764302217742922e-05, + "loss": 0.2099, + "step": 14713 + }, + { + "epoch": 5.983733224888166, + "grad_norm": 3.4744092391865355, + "learning_rate": 1.1763304914219847e-05, + "loss": 0.055, + "step": 14714 + }, + { + "epoch": 5.984139894265962, + "grad_norm": 4.3605223048517985, + "learning_rate": 1.176230759259556e-05, + "loss": 0.2336, + "step": 14715 + }, + { + "epoch": 5.984546563643757, + "grad_norm": 8.019973960366686, + "learning_rate": 1.1761310252880293e-05, + "loss": 0.1629, + "step": 14716 + }, + { + "epoch": 5.984953233021553, + "grad_norm": 3.441405985331011, + "learning_rate": 1.1760312895084296e-05, + "loss": 0.0462, + "step": 14717 + }, + { + "epoch": 5.985359902399349, + "grad_norm": 3.660925125377505, + "learning_rate": 1.1759315519217802e-05, + "loss": 0.0729, + "step": 14718 + }, + { + "epoch": 5.9857665717771456, + "grad_norm": 2.123244490609395, + "learning_rate": 1.1758318125291043e-05, + "loss": 0.0286, + "step": 14719 + }, + { + "epoch": 5.986173241154941, + "grad_norm": 8.821215972715839, + "learning_rate": 1.1757320713314268e-05, + "loss": 0.4452, + "step": 14720 + }, + { + "epoch": 5.986579910532737, + "grad_norm": 7.052269448464384, + "learning_rate": 1.175632328329771e-05, + "loss": 0.1422, + "step": 14721 + }, + { + "epoch": 5.986986579910533, + "grad_norm": 0.653620702971979, + "learning_rate": 1.175532583525161e-05, + "loss": 0.0137, + "step": 14722 + }, + { + "epoch": 5.987393249288329, + "grad_norm": 0.026808388612940232, + "learning_rate": 1.1754328369186207e-05, + "loss": 0.0003, + "step": 14723 + }, + { + "epoch": 5.987799918666124, + "grad_norm": 4.934682387634988, + "learning_rate": 1.175333088511174e-05, + "loss": 0.1126, + "step": 14724 + }, + { + "epoch": 5.98820658804392, + "grad_norm": 0.5393834044602198, + "learning_rate": 1.175233338303845e-05, + "loss": 0.0079, + "step": 14725 + }, + { + "epoch": 5.988613257421716, + "grad_norm": 12.939127711396747, + "learning_rate": 1.1751335862976573e-05, + "loss": 0.3196, + 
"step": 14726 + }, + { + "epoch": 5.989019926799512, + "grad_norm": 0.37661347111187726, + "learning_rate": 1.1750338324936354e-05, + "loss": 0.006, + "step": 14727 + }, + { + "epoch": 5.989426596177308, + "grad_norm": 2.611420025453073, + "learning_rate": 1.1749340768928033e-05, + "loss": 0.0442, + "step": 14728 + }, + { + "epoch": 5.989833265555104, + "grad_norm": 7.831566020057118, + "learning_rate": 1.1748343194961844e-05, + "loss": 0.1892, + "step": 14729 + }, + { + "epoch": 5.9902399349329, + "grad_norm": 0.2056264640421164, + "learning_rate": 1.1747345603048037e-05, + "loss": 0.0027, + "step": 14730 + }, + { + "epoch": 5.990646604310696, + "grad_norm": 8.091876047880984, + "learning_rate": 1.1746347993196844e-05, + "loss": 0.2456, + "step": 14731 + }, + { + "epoch": 5.991053273688491, + "grad_norm": 0.9325347588448208, + "learning_rate": 1.1745350365418514e-05, + "loss": 0.0147, + "step": 14732 + }, + { + "epoch": 5.991459943066287, + "grad_norm": 12.047583679773226, + "learning_rate": 1.174435271972328e-05, + "loss": 0.5451, + "step": 14733 + }, + { + "epoch": 5.991866612444083, + "grad_norm": 4.1603503348601585, + "learning_rate": 1.174335505612139e-05, + "loss": 0.1536, + "step": 14734 + }, + { + "epoch": 5.992273281821879, + "grad_norm": 1.7953707079162664, + "learning_rate": 1.1742357374623081e-05, + "loss": 0.0449, + "step": 14735 + }, + { + "epoch": 5.9926799511996744, + "grad_norm": 7.894771848194661, + "learning_rate": 1.17413596752386e-05, + "loss": 0.224, + "step": 14736 + }, + { + "epoch": 5.99308662057747, + "grad_norm": 1.8569702325815418, + "learning_rate": 1.1740361957978184e-05, + "loss": 0.0324, + "step": 14737 + }, + { + "epoch": 5.993493289955266, + "grad_norm": 8.658620628300415, + "learning_rate": 1.1739364222852076e-05, + "loss": 0.1726, + "step": 14738 + }, + { + "epoch": 5.993899959333062, + "grad_norm": 1.7309001714194072, + "learning_rate": 1.1738366469870517e-05, + "loss": 0.0379, + "step": 14739 + }, + { + "epoch": 5.994306628710858, + "grad_norm": 0.8295247589326418, + "learning_rate": 1.1737368699043755e-05, + "loss": 0.0127, + "step": 14740 + }, + { + "epoch": 5.994713298088654, + "grad_norm": 6.415321322691978, + "learning_rate": 1.1736370910382026e-05, + "loss": 0.1141, + "step": 14741 + }, + { + "epoch": 5.99511996746645, + "grad_norm": 1.655410413750203, + "learning_rate": 1.1735373103895575e-05, + "loss": 0.0233, + "step": 14742 + }, + { + "epoch": 5.995526636844246, + "grad_norm": 10.791966809310756, + "learning_rate": 1.1734375279594648e-05, + "loss": 0.262, + "step": 14743 + }, + { + "epoch": 5.9959333062220415, + "grad_norm": 9.090969386893093, + "learning_rate": 1.1733377437489485e-05, + "loss": 0.2839, + "step": 14744 + }, + { + "epoch": 5.996339975599837, + "grad_norm": 4.992020033464266, + "learning_rate": 1.173237957759033e-05, + "loss": 0.0869, + "step": 14745 + }, + { + "epoch": 5.996746644977633, + "grad_norm": 0.2320718769197406, + "learning_rate": 1.173138169990743e-05, + "loss": 0.0019, + "step": 14746 + }, + { + "epoch": 5.997153314355429, + "grad_norm": 0.6008418288199809, + "learning_rate": 1.1730383804451023e-05, + "loss": 0.0117, + "step": 14747 + }, + { + "epoch": 5.9975599837332245, + "grad_norm": 1.5307825820587087, + "learning_rate": 1.1729385891231352e-05, + "loss": 0.0177, + "step": 14748 + }, + { + "epoch": 5.997966653111021, + "grad_norm": 8.60955894835349, + "learning_rate": 1.172838796025867e-05, + "loss": 0.3009, + "step": 14749 + }, + { + "epoch": 5.998373322488817, + "grad_norm": 4.460244573399168, + 
"learning_rate": 1.1727390011543214e-05, + "loss": 0.0822, + "step": 14750 + }, + { + "epoch": 5.998779991866613, + "grad_norm": 8.236072044991488, + "learning_rate": 1.1726392045095228e-05, + "loss": 0.4073, + "step": 14751 + }, + { + "epoch": 5.9991866612444085, + "grad_norm": 7.079991801360475, + "learning_rate": 1.1725394060924959e-05, + "loss": 0.2231, + "step": 14752 + }, + { + "epoch": 5.999593330622204, + "grad_norm": 3.510630003299984, + "learning_rate": 1.1724396059042655e-05, + "loss": 0.0508, + "step": 14753 + }, + { + "epoch": 6.0, + "grad_norm": 3.121441984361696, + "learning_rate": 1.1723398039458554e-05, + "loss": 0.0389, + "step": 14754 + }, + { + "epoch": 6.000406669377796, + "grad_norm": 0.1736416863609083, + "learning_rate": 1.1722400002182905e-05, + "loss": 0.0025, + "step": 14755 + }, + { + "epoch": 6.0008133387555915, + "grad_norm": 7.985686576116519, + "learning_rate": 1.1721401947225956e-05, + "loss": 0.1829, + "step": 14756 + }, + { + "epoch": 6.001220008133387, + "grad_norm": 7.4874497821621215, + "learning_rate": 1.1720403874597946e-05, + "loss": 0.2514, + "step": 14757 + }, + { + "epoch": 6.001626677511183, + "grad_norm": 12.976720627826632, + "learning_rate": 1.1719405784309128e-05, + "loss": 0.4923, + "step": 14758 + }, + { + "epoch": 6.002033346888979, + "grad_norm": 1.229922486166521, + "learning_rate": 1.171840767636974e-05, + "loss": 0.023, + "step": 14759 + }, + { + "epoch": 6.0024400162667755, + "grad_norm": 3.048488365855642, + "learning_rate": 1.1717409550790037e-05, + "loss": 0.0456, + "step": 14760 + }, + { + "epoch": 6.002846685644571, + "grad_norm": 0.42164342896638907, + "learning_rate": 1.1716411407580257e-05, + "loss": 0.0062, + "step": 14761 + }, + { + "epoch": 6.003253355022367, + "grad_norm": 4.922304334723665, + "learning_rate": 1.1715413246750654e-05, + "loss": 0.0889, + "step": 14762 + }, + { + "epoch": 6.003660024400163, + "grad_norm": 5.299300737031811, + "learning_rate": 1.1714415068311466e-05, + "loss": 0.0898, + "step": 14763 + }, + { + "epoch": 6.0040666937779585, + "grad_norm": 14.809367970512792, + "learning_rate": 1.1713416872272948e-05, + "loss": 0.4641, + "step": 14764 + }, + { + "epoch": 6.004473363155754, + "grad_norm": 0.02192223768552765, + "learning_rate": 1.1712418658645341e-05, + "loss": 0.0003, + "step": 14765 + }, + { + "epoch": 6.00488003253355, + "grad_norm": 6.045831152483473, + "learning_rate": 1.1711420427438896e-05, + "loss": 0.1398, + "step": 14766 + }, + { + "epoch": 6.005286701911346, + "grad_norm": 5.983366498586397, + "learning_rate": 1.1710422178663859e-05, + "loss": 0.1074, + "step": 14767 + }, + { + "epoch": 6.005693371289142, + "grad_norm": 0.04973392103847244, + "learning_rate": 1.1709423912330475e-05, + "loss": 0.0013, + "step": 14768 + }, + { + "epoch": 6.006100040666937, + "grad_norm": 7.608684232265142, + "learning_rate": 1.1708425628448998e-05, + "loss": 0.2669, + "step": 14769 + }, + { + "epoch": 6.006506710044734, + "grad_norm": 4.128382793527065, + "learning_rate": 1.170742732702967e-05, + "loss": 0.0641, + "step": 14770 + }, + { + "epoch": 6.00691337942253, + "grad_norm": 3.0219312024725453, + "learning_rate": 1.170642900808274e-05, + "loss": 0.0329, + "step": 14771 + }, + { + "epoch": 6.0073200488003256, + "grad_norm": 7.389438716034287, + "learning_rate": 1.170543067161846e-05, + "loss": 0.4464, + "step": 14772 + }, + { + "epoch": 6.007726718178121, + "grad_norm": 0.46319697040998803, + "learning_rate": 1.1704432317647076e-05, + "loss": 0.0058, + "step": 14773 + }, + { + "epoch": 
6.008133387555917, + "grad_norm": 20.77000039757991, + "learning_rate": 1.1703433946178835e-05, + "loss": 0.1787, + "step": 14774 + }, + { + "epoch": 6.008540056933713, + "grad_norm": 0.087833159920883, + "learning_rate": 1.1702435557223988e-05, + "loss": 0.0015, + "step": 14775 + }, + { + "epoch": 6.008946726311509, + "grad_norm": 2.1236098741066396, + "learning_rate": 1.1701437150792782e-05, + "loss": 0.058, + "step": 14776 + }, + { + "epoch": 6.009353395689304, + "grad_norm": 4.021827146906631, + "learning_rate": 1.1700438726895467e-05, + "loss": 0.1504, + "step": 14777 + }, + { + "epoch": 6.0097600650671, + "grad_norm": 0.136265700132986, + "learning_rate": 1.1699440285542294e-05, + "loss": 0.0024, + "step": 14778 + }, + { + "epoch": 6.010166734444896, + "grad_norm": 1.603560892219309, + "learning_rate": 1.1698441826743512e-05, + "loss": 0.0218, + "step": 14779 + }, + { + "epoch": 6.010573403822693, + "grad_norm": 4.442883442224732, + "learning_rate": 1.1697443350509367e-05, + "loss": 0.1209, + "step": 14780 + }, + { + "epoch": 6.010980073200488, + "grad_norm": 17.477374356835465, + "learning_rate": 1.1696444856850114e-05, + "loss": 0.4175, + "step": 14781 + }, + { + "epoch": 6.011386742578284, + "grad_norm": 9.900903400462898, + "learning_rate": 1.1695446345776e-05, + "loss": 0.2063, + "step": 14782 + }, + { + "epoch": 6.01179341195608, + "grad_norm": 0.1028704028224768, + "learning_rate": 1.1694447817297276e-05, + "loss": 0.0015, + "step": 14783 + }, + { + "epoch": 6.012200081333876, + "grad_norm": 5.261996705888133, + "learning_rate": 1.1693449271424193e-05, + "loss": 0.178, + "step": 14784 + }, + { + "epoch": 6.012606750711671, + "grad_norm": 0.48390277256772674, + "learning_rate": 1.1692450708167e-05, + "loss": 0.0049, + "step": 14785 + }, + { + "epoch": 6.013013420089467, + "grad_norm": 0.09647915164275275, + "learning_rate": 1.1691452127535949e-05, + "loss": 0.0023, + "step": 14786 + }, + { + "epoch": 6.013420089467263, + "grad_norm": 7.368408580790257, + "learning_rate": 1.1690453529541291e-05, + "loss": 0.188, + "step": 14787 + }, + { + "epoch": 6.013826758845059, + "grad_norm": 0.3261098813153166, + "learning_rate": 1.1689454914193276e-05, + "loss": 0.0052, + "step": 14788 + }, + { + "epoch": 6.0142334282228544, + "grad_norm": 0.14506768110180993, + "learning_rate": 1.1688456281502157e-05, + "loss": 0.0028, + "step": 14789 + }, + { + "epoch": 6.014640097600651, + "grad_norm": 0.9111172217571968, + "learning_rate": 1.1687457631478184e-05, + "loss": 0.0128, + "step": 14790 + }, + { + "epoch": 6.015046766978447, + "grad_norm": 4.014195800901868, + "learning_rate": 1.1686458964131609e-05, + "loss": 0.1798, + "step": 14791 + }, + { + "epoch": 6.015453436356243, + "grad_norm": 0.27566160266927275, + "learning_rate": 1.1685460279472684e-05, + "loss": 0.0023, + "step": 14792 + }, + { + "epoch": 6.015860105734038, + "grad_norm": 0.09126470245148323, + "learning_rate": 1.1684461577511659e-05, + "loss": 0.0011, + "step": 14793 + }, + { + "epoch": 6.016266775111834, + "grad_norm": 1.035113974598668, + "learning_rate": 1.1683462858258794e-05, + "loss": 0.0135, + "step": 14794 + }, + { + "epoch": 6.01667344448963, + "grad_norm": 4.251038325608367, + "learning_rate": 1.168246412172433e-05, + "loss": 0.1163, + "step": 14795 + }, + { + "epoch": 6.017080113867426, + "grad_norm": 4.3533083256992855, + "learning_rate": 1.1681465367918528e-05, + "loss": 0.0849, + "step": 14796 + }, + { + "epoch": 6.0174867832452215, + "grad_norm": 11.242584507144041, + "learning_rate": 1.1680466596851635e-05, + 
"loss": 0.193, + "step": 14797 + }, + { + "epoch": 6.017893452623017, + "grad_norm": 0.9613036642263796, + "learning_rate": 1.167946780853391e-05, + "loss": 0.0158, + "step": 14798 + }, + { + "epoch": 6.018300122000813, + "grad_norm": 0.3046510083723974, + "learning_rate": 1.1678469002975598e-05, + "loss": 0.0061, + "step": 14799 + }, + { + "epoch": 6.018706791378609, + "grad_norm": 8.309889409901475, + "learning_rate": 1.167747018018696e-05, + "loss": 0.2081, + "step": 14800 + }, + { + "epoch": 6.019113460756405, + "grad_norm": 3.7912074447835065, + "learning_rate": 1.1676471340178248e-05, + "loss": 0.058, + "step": 14801 + }, + { + "epoch": 6.019520130134201, + "grad_norm": 1.673390241648875, + "learning_rate": 1.1675472482959708e-05, + "loss": 0.0231, + "step": 14802 + }, + { + "epoch": 6.019926799511997, + "grad_norm": 1.4890342945190491, + "learning_rate": 1.1674473608541606e-05, + "loss": 0.0279, + "step": 14803 + }, + { + "epoch": 6.020333468889793, + "grad_norm": 0.16637196382758637, + "learning_rate": 1.1673474716934187e-05, + "loss": 0.0029, + "step": 14804 + }, + { + "epoch": 6.0207401382675885, + "grad_norm": 3.6483329474878365, + "learning_rate": 1.167247580814771e-05, + "loss": 0.079, + "step": 14805 + }, + { + "epoch": 6.021146807645384, + "grad_norm": 0.47719573169598356, + "learning_rate": 1.1671476882192424e-05, + "loss": 0.0072, + "step": 14806 + }, + { + "epoch": 6.02155347702318, + "grad_norm": 0.257890743226016, + "learning_rate": 1.1670477939078588e-05, + "loss": 0.0035, + "step": 14807 + }, + { + "epoch": 6.021960146400976, + "grad_norm": 0.1673460701908196, + "learning_rate": 1.1669478978816455e-05, + "loss": 0.0022, + "step": 14808 + }, + { + "epoch": 6.0223668157787715, + "grad_norm": 5.013628567429442, + "learning_rate": 1.1668480001416279e-05, + "loss": 0.2054, + "step": 14809 + }, + { + "epoch": 6.022773485156567, + "grad_norm": 0.0176986723431124, + "learning_rate": 1.1667481006888317e-05, + "loss": 0.0003, + "step": 14810 + }, + { + "epoch": 6.023180154534364, + "grad_norm": 3.129554043610693, + "learning_rate": 1.1666481995242824e-05, + "loss": 0.0623, + "step": 14811 + }, + { + "epoch": 6.02358682391216, + "grad_norm": 9.719868700039687, + "learning_rate": 1.166548296649005e-05, + "loss": 0.5182, + "step": 14812 + }, + { + "epoch": 6.0239934932899555, + "grad_norm": 5.7697035954467255, + "learning_rate": 1.1664483920640261e-05, + "loss": 0.0855, + "step": 14813 + }, + { + "epoch": 6.024400162667751, + "grad_norm": 2.4404911249834695, + "learning_rate": 1.1663484857703706e-05, + "loss": 0.0786, + "step": 14814 + }, + { + "epoch": 6.024806832045547, + "grad_norm": 0.7093532097063395, + "learning_rate": 1.166248577769064e-05, + "loss": 0.0095, + "step": 14815 + }, + { + "epoch": 6.025213501423343, + "grad_norm": 0.8260732789589127, + "learning_rate": 1.166148668061132e-05, + "loss": 0.012, + "step": 14816 + }, + { + "epoch": 6.0256201708011385, + "grad_norm": 1.4027721015762495, + "learning_rate": 1.1660487566476007e-05, + "loss": 0.0242, + "step": 14817 + }, + { + "epoch": 6.026026840178934, + "grad_norm": 5.497888011144804, + "learning_rate": 1.165948843529495e-05, + "loss": 0.2055, + "step": 14818 + }, + { + "epoch": 6.02643350955673, + "grad_norm": 0.6514957265986607, + "learning_rate": 1.1658489287078409e-05, + "loss": 0.0087, + "step": 14819 + }, + { + "epoch": 6.026840178934526, + "grad_norm": 4.802328609177988, + "learning_rate": 1.1657490121836644e-05, + "loss": 0.0704, + "step": 14820 + }, + { + "epoch": 6.0272468483123225, + "grad_norm": 
3.086805685795678, + "learning_rate": 1.1656490939579905e-05, + "loss": 0.088, + "step": 14821 + }, + { + "epoch": 6.027653517690118, + "grad_norm": 3.739985873953601, + "learning_rate": 1.1655491740318452e-05, + "loss": 0.0975, + "step": 14822 + }, + { + "epoch": 6.028060187067914, + "grad_norm": 0.09463042916107217, + "learning_rate": 1.1654492524062547e-05, + "loss": 0.002, + "step": 14823 + }, + { + "epoch": 6.02846685644571, + "grad_norm": 2.6802636971477742, + "learning_rate": 1.1653493290822442e-05, + "loss": 0.0305, + "step": 14824 + }, + { + "epoch": 6.0288735258235056, + "grad_norm": 0.025122588824658003, + "learning_rate": 1.1652494040608393e-05, + "loss": 0.0003, + "step": 14825 + }, + { + "epoch": 6.029280195201301, + "grad_norm": 4.099388343972402, + "learning_rate": 1.1651494773430665e-05, + "loss": 0.0753, + "step": 14826 + }, + { + "epoch": 6.029686864579097, + "grad_norm": 0.0653513047939616, + "learning_rate": 1.1650495489299511e-05, + "loss": 0.0011, + "step": 14827 + }, + { + "epoch": 6.030093533956893, + "grad_norm": 2.3522464822434763, + "learning_rate": 1.1649496188225191e-05, + "loss": 0.0273, + "step": 14828 + }, + { + "epoch": 6.030500203334689, + "grad_norm": 2.510863659986738, + "learning_rate": 1.164849687021796e-05, + "loss": 0.0338, + "step": 14829 + }, + { + "epoch": 6.030906872712484, + "grad_norm": 0.3753242788357553, + "learning_rate": 1.1647497535288082e-05, + "loss": 0.0048, + "step": 14830 + }, + { + "epoch": 6.031313542090281, + "grad_norm": 1.6717065724995877, + "learning_rate": 1.164649818344581e-05, + "loss": 0.0275, + "step": 14831 + }, + { + "epoch": 6.031720211468077, + "grad_norm": 0.07113168093496268, + "learning_rate": 1.1645498814701407e-05, + "loss": 0.0009, + "step": 14832 + }, + { + "epoch": 6.032126880845873, + "grad_norm": 1.1144073510156338, + "learning_rate": 1.164449942906513e-05, + "loss": 0.0151, + "step": 14833 + }, + { + "epoch": 6.032533550223668, + "grad_norm": 1.1031442246141416, + "learning_rate": 1.1643500026547239e-05, + "loss": 0.0121, + "step": 14834 + }, + { + "epoch": 6.032940219601464, + "grad_norm": 1.7253651668317476, + "learning_rate": 1.1642500607157991e-05, + "loss": 0.0314, + "step": 14835 + }, + { + "epoch": 6.03334688897926, + "grad_norm": 8.331585374536552, + "learning_rate": 1.164150117090765e-05, + "loss": 0.7214, + "step": 14836 + }, + { + "epoch": 6.033753558357056, + "grad_norm": 9.40789334383297, + "learning_rate": 1.1640501717806474e-05, + "loss": 0.1872, + "step": 14837 + }, + { + "epoch": 6.034160227734851, + "grad_norm": 0.3603259704874239, + "learning_rate": 1.163950224786472e-05, + "loss": 0.0045, + "step": 14838 + }, + { + "epoch": 6.034566897112647, + "grad_norm": 7.082227476785649, + "learning_rate": 1.1638502761092652e-05, + "loss": 0.1455, + "step": 14839 + }, + { + "epoch": 6.034973566490443, + "grad_norm": 5.140446173194765, + "learning_rate": 1.1637503257500526e-05, + "loss": 0.1018, + "step": 14840 + }, + { + "epoch": 6.035380235868239, + "grad_norm": 5.581411128413831, + "learning_rate": 1.1636503737098607e-05, + "loss": 0.0966, + "step": 14841 + }, + { + "epoch": 6.035786905246035, + "grad_norm": 2.8552098374251695, + "learning_rate": 1.1635504199897155e-05, + "loss": 0.0517, + "step": 14842 + }, + { + "epoch": 6.036193574623831, + "grad_norm": 6.820602875529032, + "learning_rate": 1.1634504645906425e-05, + "loss": 0.1193, + "step": 14843 + }, + { + "epoch": 6.036600244001627, + "grad_norm": 0.16969063898936917, + "learning_rate": 1.1633505075136686e-05, + "loss": 0.0027, + "step": 
14844 + }, + { + "epoch": 6.037006913379423, + "grad_norm": 0.3271837786378177, + "learning_rate": 1.1632505487598195e-05, + "loss": 0.0058, + "step": 14845 + }, + { + "epoch": 6.037413582757218, + "grad_norm": 0.21200839009470882, + "learning_rate": 1.1631505883301213e-05, + "loss": 0.0033, + "step": 14846 + }, + { + "epoch": 6.037820252135014, + "grad_norm": 11.73773792742223, + "learning_rate": 1.1630506262256001e-05, + "loss": 0.2435, + "step": 14847 + }, + { + "epoch": 6.03822692151281, + "grad_norm": 7.167835679495136, + "learning_rate": 1.1629506624472822e-05, + "loss": 0.2235, + "step": 14848 + }, + { + "epoch": 6.038633590890606, + "grad_norm": 5.253373746803715, + "learning_rate": 1.1628506969961936e-05, + "loss": 0.0974, + "step": 14849 + }, + { + "epoch": 6.0390402602684015, + "grad_norm": 0.739660499223113, + "learning_rate": 1.1627507298733609e-05, + "loss": 0.0095, + "step": 14850 + }, + { + "epoch": 6.039446929646197, + "grad_norm": 5.141174105547907, + "learning_rate": 1.16265076107981e-05, + "loss": 0.0625, + "step": 14851 + }, + { + "epoch": 6.039853599023994, + "grad_norm": 0.026591444277488975, + "learning_rate": 1.1625507906165669e-05, + "loss": 0.0004, + "step": 14852 + }, + { + "epoch": 6.04026026840179, + "grad_norm": 3.5577717457438296, + "learning_rate": 1.1624508184846586e-05, + "loss": 0.0298, + "step": 14853 + }, + { + "epoch": 6.040666937779585, + "grad_norm": 6.417782280236498, + "learning_rate": 1.1623508446851105e-05, + "loss": 0.2378, + "step": 14854 + }, + { + "epoch": 6.041073607157381, + "grad_norm": 1.0200309809055954, + "learning_rate": 1.1622508692189493e-05, + "loss": 0.0137, + "step": 14855 + }, + { + "epoch": 6.041480276535177, + "grad_norm": 0.1341149682161515, + "learning_rate": 1.1621508920872015e-05, + "loss": 0.0024, + "step": 14856 + }, + { + "epoch": 6.041886945912973, + "grad_norm": 8.16833172903344, + "learning_rate": 1.1620509132908929e-05, + "loss": 0.1628, + "step": 14857 + }, + { + "epoch": 6.0422936152907685, + "grad_norm": 2.695776012032559, + "learning_rate": 1.1619509328310503e-05, + "loss": 0.0543, + "step": 14858 + }, + { + "epoch": 6.042700284668564, + "grad_norm": 7.963166031532087, + "learning_rate": 1.1618509507087e-05, + "loss": 0.336, + "step": 14859 + }, + { + "epoch": 6.04310695404636, + "grad_norm": 0.8755973195784463, + "learning_rate": 1.1617509669248677e-05, + "loss": 0.0127, + "step": 14860 + }, + { + "epoch": 6.043513623424156, + "grad_norm": 5.281863536173421, + "learning_rate": 1.1616509814805808e-05, + "loss": 0.145, + "step": 14861 + }, + { + "epoch": 6.043920292801952, + "grad_norm": 3.818958495472444, + "learning_rate": 1.161550994376865e-05, + "loss": 0.072, + "step": 14862 + }, + { + "epoch": 6.044326962179748, + "grad_norm": 2.8844968954346695, + "learning_rate": 1.1614510056147472e-05, + "loss": 0.0563, + "step": 14863 + }, + { + "epoch": 6.044733631557544, + "grad_norm": 10.134880480627325, + "learning_rate": 1.1613510151952533e-05, + "loss": 0.068, + "step": 14864 + }, + { + "epoch": 6.04514030093534, + "grad_norm": 7.653543101790837, + "learning_rate": 1.16125102311941e-05, + "loss": 0.2734, + "step": 14865 + }, + { + "epoch": 6.0455469703131355, + "grad_norm": 6.154977227572886, + "learning_rate": 1.161151029388244e-05, + "loss": 0.1185, + "step": 14866 + }, + { + "epoch": 6.045953639690931, + "grad_norm": 0.7462782592647919, + "learning_rate": 1.1610510340027815e-05, + "loss": 0.0275, + "step": 14867 + }, + { + "epoch": 6.046360309068727, + "grad_norm": 6.546388131727144, + "learning_rate": 
1.1609510369640491e-05, + "loss": 0.2734, + "step": 14868 + }, + { + "epoch": 6.046766978446523, + "grad_norm": 0.08816554718432443, + "learning_rate": 1.1608510382730732e-05, + "loss": 0.0013, + "step": 14869 + }, + { + "epoch": 6.0471736478243185, + "grad_norm": 10.380227352830076, + "learning_rate": 1.1607510379308804e-05, + "loss": 0.2698, + "step": 14870 + }, + { + "epoch": 6.047580317202114, + "grad_norm": 2.794838156002753, + "learning_rate": 1.1606510359384975e-05, + "loss": 0.0567, + "step": 14871 + }, + { + "epoch": 6.047986986579911, + "grad_norm": 6.209120306001231, + "learning_rate": 1.1605510322969509e-05, + "loss": 0.2034, + "step": 14872 + }, + { + "epoch": 6.048393655957707, + "grad_norm": 8.668811065443963, + "learning_rate": 1.1604510270072668e-05, + "loss": 0.2895, + "step": 14873 + }, + { + "epoch": 6.0488003253355025, + "grad_norm": 9.749690087028391, + "learning_rate": 1.1603510200704723e-05, + "loss": 0.3716, + "step": 14874 + }, + { + "epoch": 6.049206994713298, + "grad_norm": 5.52528852325958, + "learning_rate": 1.160251011487594e-05, + "loss": 0.1354, + "step": 14875 + }, + { + "epoch": 6.049613664091094, + "grad_norm": 2.6705767084379604, + "learning_rate": 1.1601510012596584e-05, + "loss": 0.064, + "step": 14876 + }, + { + "epoch": 6.05002033346889, + "grad_norm": 0.26956499226256636, + "learning_rate": 1.1600509893876922e-05, + "loss": 0.0044, + "step": 14877 + }, + { + "epoch": 6.0504270028466856, + "grad_norm": 0.08757890858230158, + "learning_rate": 1.1599509758727219e-05, + "loss": 0.0013, + "step": 14878 + }, + { + "epoch": 6.050833672224481, + "grad_norm": 7.08560961094707, + "learning_rate": 1.1598509607157744e-05, + "loss": 0.1917, + "step": 14879 + }, + { + "epoch": 6.051240341602277, + "grad_norm": 0.16377989630611567, + "learning_rate": 1.1597509439178762e-05, + "loss": 0.002, + "step": 14880 + }, + { + "epoch": 6.051647010980073, + "grad_norm": 2.461472843426287, + "learning_rate": 1.1596509254800542e-05, + "loss": 0.043, + "step": 14881 + }, + { + "epoch": 6.052053680357869, + "grad_norm": 4.754518560378634, + "learning_rate": 1.1595509054033354e-05, + "loss": 0.1309, + "step": 14882 + }, + { + "epoch": 6.052460349735665, + "grad_norm": 7.256316419482483, + "learning_rate": 1.1594508836887458e-05, + "loss": 0.3306, + "step": 14883 + }, + { + "epoch": 6.052867019113461, + "grad_norm": 1.2163232224747416, + "learning_rate": 1.1593508603373129e-05, + "loss": 0.0158, + "step": 14884 + }, + { + "epoch": 6.053273688491257, + "grad_norm": 5.529715843861015, + "learning_rate": 1.1592508353500633e-05, + "loss": 0.1927, + "step": 14885 + }, + { + "epoch": 6.053680357869053, + "grad_norm": 0.19598536605367228, + "learning_rate": 1.1591508087280236e-05, + "loss": 0.0029, + "step": 14886 + }, + { + "epoch": 6.054087027246848, + "grad_norm": 0.4683239724479467, + "learning_rate": 1.159050780472221e-05, + "loss": 0.0052, + "step": 14887 + }, + { + "epoch": 6.054493696624644, + "grad_norm": 5.770451041081315, + "learning_rate": 1.1589507505836816e-05, + "loss": 0.1269, + "step": 14888 + }, + { + "epoch": 6.05490036600244, + "grad_norm": 0.03791919520663974, + "learning_rate": 1.1588507190634331e-05, + "loss": 0.0007, + "step": 14889 + }, + { + "epoch": 6.055307035380236, + "grad_norm": 13.274809998204997, + "learning_rate": 1.1587506859125019e-05, + "loss": 0.3927, + "step": 14890 + }, + { + "epoch": 6.055713704758031, + "grad_norm": 6.260307393902902, + "learning_rate": 1.1586506511319151e-05, + "loss": 0.1585, + "step": 14891 + }, + { + "epoch": 
6.056120374135827, + "grad_norm": 9.632206076307781, + "learning_rate": 1.1585506147226995e-05, + "loss": 0.157, + "step": 14892 + }, + { + "epoch": 6.056527043513624, + "grad_norm": 1.5125455991564787, + "learning_rate": 1.1584505766858818e-05, + "loss": 0.0182, + "step": 14893 + }, + { + "epoch": 6.05693371289142, + "grad_norm": 8.908840146734855, + "learning_rate": 1.1583505370224896e-05, + "loss": 0.3835, + "step": 14894 + }, + { + "epoch": 6.057340382269215, + "grad_norm": 2.3035441041318214, + "learning_rate": 1.1582504957335494e-05, + "loss": 0.0327, + "step": 14895 + }, + { + "epoch": 6.057747051647011, + "grad_norm": 0.006262754218151396, + "learning_rate": 1.1581504528200878e-05, + "loss": 0.0001, + "step": 14896 + }, + { + "epoch": 6.058153721024807, + "grad_norm": 0.008666167092292903, + "learning_rate": 1.1580504082831326e-05, + "loss": 0.0001, + "step": 14897 + }, + { + "epoch": 6.058560390402603, + "grad_norm": 4.016690832832176, + "learning_rate": 1.1579503621237102e-05, + "loss": 0.1021, + "step": 14898 + }, + { + "epoch": 6.058967059780398, + "grad_norm": 5.394829425497135, + "learning_rate": 1.157850314342848e-05, + "loss": 0.1864, + "step": 14899 + }, + { + "epoch": 6.059373729158194, + "grad_norm": 1.0033047966412239, + "learning_rate": 1.1577502649415727e-05, + "loss": 0.0096, + "step": 14900 + }, + { + "epoch": 6.05978039853599, + "grad_norm": 1.4030348797374592, + "learning_rate": 1.1576502139209117e-05, + "loss": 0.0198, + "step": 14901 + }, + { + "epoch": 6.060187067913786, + "grad_norm": 4.131772991190966, + "learning_rate": 1.1575501612818916e-05, + "loss": 0.1091, + "step": 14902 + }, + { + "epoch": 6.060593737291582, + "grad_norm": 0.6434216997991337, + "learning_rate": 1.1574501070255401e-05, + "loss": 0.0081, + "step": 14903 + }, + { + "epoch": 6.061000406669378, + "grad_norm": 2.9594523057912947, + "learning_rate": 1.1573500511528838e-05, + "loss": 0.0527, + "step": 14904 + }, + { + "epoch": 6.061407076047174, + "grad_norm": 0.03828117649624556, + "learning_rate": 1.1572499936649501e-05, + "loss": 0.0002, + "step": 14905 + }, + { + "epoch": 6.06181374542497, + "grad_norm": 2.800479110727102, + "learning_rate": 1.1571499345627664e-05, + "loss": 0.0812, + "step": 14906 + }, + { + "epoch": 6.062220414802765, + "grad_norm": 9.911113672184426, + "learning_rate": 1.1570498738473593e-05, + "loss": 0.3964, + "step": 14907 + }, + { + "epoch": 6.062627084180561, + "grad_norm": 0.024887935616139424, + "learning_rate": 1.1569498115197561e-05, + "loss": 0.0004, + "step": 14908 + }, + { + "epoch": 6.063033753558357, + "grad_norm": 3.802638940367947, + "learning_rate": 1.1568497475809841e-05, + "loss": 0.146, + "step": 14909 + }, + { + "epoch": 6.063440422936153, + "grad_norm": 5.1165371262683434, + "learning_rate": 1.1567496820320704e-05, + "loss": 0.1044, + "step": 14910 + }, + { + "epoch": 6.0638470923139485, + "grad_norm": 4.389210099685041, + "learning_rate": 1.1566496148740426e-05, + "loss": 0.0398, + "step": 14911 + }, + { + "epoch": 6.064253761691744, + "grad_norm": 9.406142701918407, + "learning_rate": 1.1565495461079276e-05, + "loss": 0.3409, + "step": 14912 + }, + { + "epoch": 6.064660431069541, + "grad_norm": 0.38226155335613027, + "learning_rate": 1.1564494757347526e-05, + "loss": 0.0036, + "step": 14913 + }, + { + "epoch": 6.065067100447337, + "grad_norm": 5.233501482605929, + "learning_rate": 1.1563494037555451e-05, + "loss": 0.1725, + "step": 14914 + }, + { + "epoch": 6.065473769825132, + "grad_norm": 7.571421489347165, + "learning_rate": 
1.1562493301713323e-05, + "loss": 0.1367, + "step": 14915 + }, + { + "epoch": 6.065880439202928, + "grad_norm": 6.160348178653, + "learning_rate": 1.1561492549831415e-05, + "loss": 0.093, + "step": 14916 + }, + { + "epoch": 6.066287108580724, + "grad_norm": 6.919411812140241, + "learning_rate": 1.1560491781919997e-05, + "loss": 0.2627, + "step": 14917 + }, + { + "epoch": 6.06669377795852, + "grad_norm": 2.088327037496266, + "learning_rate": 1.155949099798935e-05, + "loss": 0.0396, + "step": 14918 + }, + { + "epoch": 6.0671004473363155, + "grad_norm": 23.08644892453658, + "learning_rate": 1.1558490198049741e-05, + "loss": 0.5909, + "step": 14919 + }, + { + "epoch": 6.067507116714111, + "grad_norm": 6.354235721757485, + "learning_rate": 1.1557489382111444e-05, + "loss": 0.1495, + "step": 14920 + }, + { + "epoch": 6.067913786091907, + "grad_norm": 1.1743143163184604, + "learning_rate": 1.1556488550184739e-05, + "loss": 0.0166, + "step": 14921 + }, + { + "epoch": 6.068320455469703, + "grad_norm": 0.7288711655199394, + "learning_rate": 1.1555487702279891e-05, + "loss": 0.0087, + "step": 14922 + }, + { + "epoch": 6.0687271248474985, + "grad_norm": 0.7828664055190285, + "learning_rate": 1.1554486838407181e-05, + "loss": 0.012, + "step": 14923 + }, + { + "epoch": 6.069133794225295, + "grad_norm": 0.7148717033855246, + "learning_rate": 1.1553485958576882e-05, + "loss": 0.0122, + "step": 14924 + }, + { + "epoch": 6.069540463603091, + "grad_norm": 0.335509620252544, + "learning_rate": 1.1552485062799266e-05, + "loss": 0.0065, + "step": 14925 + }, + { + "epoch": 6.069947132980887, + "grad_norm": 9.857165548027702, + "learning_rate": 1.155148415108461e-05, + "loss": 0.3359, + "step": 14926 + }, + { + "epoch": 6.0703538023586825, + "grad_norm": 2.893237155581478, + "learning_rate": 1.155048322344319e-05, + "loss": 0.0643, + "step": 14927 + }, + { + "epoch": 6.070760471736478, + "grad_norm": 1.0077740443796444, + "learning_rate": 1.1549482279885276e-05, + "loss": 0.0221, + "step": 14928 + }, + { + "epoch": 6.071167141114274, + "grad_norm": 3.009546773851769, + "learning_rate": 1.154848132042115e-05, + "loss": 0.0816, + "step": 14929 + }, + { + "epoch": 6.07157381049207, + "grad_norm": 8.647652623159187, + "learning_rate": 1.1547480345061081e-05, + "loss": 0.5953, + "step": 14930 + }, + { + "epoch": 6.0719804798698656, + "grad_norm": 2.9188285225495045, + "learning_rate": 1.1546479353815352e-05, + "loss": 0.0339, + "step": 14931 + }, + { + "epoch": 6.072387149247661, + "grad_norm": 0.025896726676170545, + "learning_rate": 1.1545478346694227e-05, + "loss": 0.0006, + "step": 14932 + }, + { + "epoch": 6.072793818625457, + "grad_norm": 4.416350686403056, + "learning_rate": 1.1544477323707994e-05, + "loss": 0.163, + "step": 14933 + }, + { + "epoch": 6.073200488003254, + "grad_norm": 3.371053188000935, + "learning_rate": 1.1543476284866926e-05, + "loss": 0.0717, + "step": 14934 + }, + { + "epoch": 6.0736071573810495, + "grad_norm": 2.2711248641465023, + "learning_rate": 1.1542475230181291e-05, + "loss": 0.0323, + "step": 14935 + }, + { + "epoch": 6.074013826758845, + "grad_norm": 0.9567953270088155, + "learning_rate": 1.1541474159661377e-05, + "loss": 0.0115, + "step": 14936 + }, + { + "epoch": 6.074420496136641, + "grad_norm": 7.1642152687806915, + "learning_rate": 1.1540473073317451e-05, + "loss": 0.1023, + "step": 14937 + }, + { + "epoch": 6.074827165514437, + "grad_norm": 0.7520964166465675, + "learning_rate": 1.1539471971159794e-05, + "loss": 0.0171, + "step": 14938 + }, + { + "epoch": 6.075233834892233, + 
"grad_norm": 7.374146490645261, + "learning_rate": 1.1538470853198684e-05, + "loss": 0.2333, + "step": 14939 + }, + { + "epoch": 6.075640504270028, + "grad_norm": 6.181215630478609, + "learning_rate": 1.1537469719444396e-05, + "loss": 0.2111, + "step": 14940 + }, + { + "epoch": 6.076047173647824, + "grad_norm": 2.3467958073484785, + "learning_rate": 1.1536468569907207e-05, + "loss": 0.0482, + "step": 14941 + }, + { + "epoch": 6.07645384302562, + "grad_norm": 1.234367981904477, + "learning_rate": 1.1535467404597396e-05, + "loss": 0.0198, + "step": 14942 + }, + { + "epoch": 6.076860512403416, + "grad_norm": 0.22429475381647318, + "learning_rate": 1.1534466223525239e-05, + "loss": 0.0035, + "step": 14943 + }, + { + "epoch": 6.077267181781212, + "grad_norm": 7.918035715579335, + "learning_rate": 1.1533465026701014e-05, + "loss": 0.4094, + "step": 14944 + }, + { + "epoch": 6.077673851159008, + "grad_norm": 5.228555226926358, + "learning_rate": 1.1532463814134998e-05, + "loss": 0.2215, + "step": 14945 + }, + { + "epoch": 6.078080520536804, + "grad_norm": 0.07725011427701221, + "learning_rate": 1.1531462585837472e-05, + "loss": 0.0011, + "step": 14946 + }, + { + "epoch": 6.0784871899146, + "grad_norm": 4.2080132224791145, + "learning_rate": 1.1530461341818708e-05, + "loss": 0.0652, + "step": 14947 + }, + { + "epoch": 6.078893859292395, + "grad_norm": 2.6067454626456557, + "learning_rate": 1.152946008208899e-05, + "loss": 0.0904, + "step": 14948 + }, + { + "epoch": 6.079300528670191, + "grad_norm": 5.581682823511556, + "learning_rate": 1.1528458806658595e-05, + "loss": 0.1351, + "step": 14949 + }, + { + "epoch": 6.079707198047987, + "grad_norm": 9.521543391118184, + "learning_rate": 1.15274575155378e-05, + "loss": 0.1799, + "step": 14950 + }, + { + "epoch": 6.080113867425783, + "grad_norm": 2.665203731613835, + "learning_rate": 1.1526456208736884e-05, + "loss": 0.0615, + "step": 14951 + }, + { + "epoch": 6.080520536803578, + "grad_norm": 1.8164504224330917, + "learning_rate": 1.152545488626613e-05, + "loss": 0.0468, + "step": 14952 + }, + { + "epoch": 6.080927206181374, + "grad_norm": 8.135881710538875, + "learning_rate": 1.1524453548135812e-05, + "loss": 0.3462, + "step": 14953 + }, + { + "epoch": 6.081333875559171, + "grad_norm": 0.3526765696443798, + "learning_rate": 1.1523452194356209e-05, + "loss": 0.0055, + "step": 14954 + }, + { + "epoch": 6.081740544936967, + "grad_norm": 0.31754060814837265, + "learning_rate": 1.1522450824937605e-05, + "loss": 0.0039, + "step": 14955 + }, + { + "epoch": 6.082147214314762, + "grad_norm": 1.779928247350415, + "learning_rate": 1.1521449439890278e-05, + "loss": 0.0232, + "step": 14956 + }, + { + "epoch": 6.082553883692558, + "grad_norm": 0.13132926050310953, + "learning_rate": 1.1520448039224504e-05, + "loss": 0.0017, + "step": 14957 + }, + { + "epoch": 6.082960553070354, + "grad_norm": 9.33947093856541, + "learning_rate": 1.1519446622950565e-05, + "loss": 0.333, + "step": 14958 + }, + { + "epoch": 6.08336722244815, + "grad_norm": 11.794821799571135, + "learning_rate": 1.1518445191078747e-05, + "loss": 0.52, + "step": 14959 + }, + { + "epoch": 6.083773891825945, + "grad_norm": 5.757482393610155, + "learning_rate": 1.1517443743619318e-05, + "loss": 0.3814, + "step": 14960 + }, + { + "epoch": 6.084180561203741, + "grad_norm": 0.08018464323621906, + "learning_rate": 1.151644228058257e-05, + "loss": 0.0014, + "step": 14961 + }, + { + "epoch": 6.084587230581537, + "grad_norm": 0.38653760109840984, + "learning_rate": 1.1515440801978776e-05, + "loss": 0.006, + 
"step": 14962 + }, + { + "epoch": 6.084993899959333, + "grad_norm": 7.321813691021337, + "learning_rate": 1.1514439307818222e-05, + "loss": 0.2313, + "step": 14963 + }, + { + "epoch": 6.0854005693371285, + "grad_norm": 5.167238377769635, + "learning_rate": 1.1513437798111184e-05, + "loss": 0.0484, + "step": 14964 + }, + { + "epoch": 6.085807238714925, + "grad_norm": 10.664938056550634, + "learning_rate": 1.1512436272867949e-05, + "loss": 0.2713, + "step": 14965 + }, + { + "epoch": 6.086213908092721, + "grad_norm": 0.4884273671816783, + "learning_rate": 1.1511434732098792e-05, + "loss": 0.0098, + "step": 14966 + }, + { + "epoch": 6.086620577470517, + "grad_norm": 1.3241170845734882, + "learning_rate": 1.1510433175813995e-05, + "loss": 0.0226, + "step": 14967 + }, + { + "epoch": 6.087027246848312, + "grad_norm": 3.647602804519674, + "learning_rate": 1.1509431604023843e-05, + "loss": 0.0811, + "step": 14968 + }, + { + "epoch": 6.087433916226108, + "grad_norm": 0.7116422403364043, + "learning_rate": 1.1508430016738616e-05, + "loss": 0.0069, + "step": 14969 + }, + { + "epoch": 6.087840585603904, + "grad_norm": 0.2977525559249774, + "learning_rate": 1.1507428413968595e-05, + "loss": 0.0038, + "step": 14970 + }, + { + "epoch": 6.0882472549817, + "grad_norm": 0.08141405559314366, + "learning_rate": 1.1506426795724064e-05, + "loss": 0.002, + "step": 14971 + }, + { + "epoch": 6.0886539243594955, + "grad_norm": 6.688038351192234, + "learning_rate": 1.1505425162015304e-05, + "loss": 0.1561, + "step": 14972 + }, + { + "epoch": 6.089060593737291, + "grad_norm": 1.8392996991714936, + "learning_rate": 1.1504423512852594e-05, + "loss": 0.0252, + "step": 14973 + }, + { + "epoch": 6.089467263115087, + "grad_norm": 0.04328455743072551, + "learning_rate": 1.1503421848246223e-05, + "loss": 0.0008, + "step": 14974 + }, + { + "epoch": 6.089873932492884, + "grad_norm": 12.4597023244127, + "learning_rate": 1.1502420168206469e-05, + "loss": 0.2579, + "step": 14975 + }, + { + "epoch": 6.090280601870679, + "grad_norm": 12.554989586799804, + "learning_rate": 1.1501418472743615e-05, + "loss": 0.4218, + "step": 14976 + }, + { + "epoch": 6.090687271248475, + "grad_norm": 0.8851176486853145, + "learning_rate": 1.1500416761867945e-05, + "loss": 0.0226, + "step": 14977 + }, + { + "epoch": 6.091093940626271, + "grad_norm": 3.8182738520174024, + "learning_rate": 1.1499415035589742e-05, + "loss": 0.1203, + "step": 14978 + }, + { + "epoch": 6.091500610004067, + "grad_norm": 1.7149448891412586, + "learning_rate": 1.149841329391929e-05, + "loss": 0.0356, + "step": 14979 + }, + { + "epoch": 6.0919072793818625, + "grad_norm": 11.063702596072192, + "learning_rate": 1.149741153686687e-05, + "loss": 0.457, + "step": 14980 + }, + { + "epoch": 6.092313948759658, + "grad_norm": 0.34879424081196575, + "learning_rate": 1.1496409764442768e-05, + "loss": 0.005, + "step": 14981 + }, + { + "epoch": 6.092720618137454, + "grad_norm": 0.04049254094893534, + "learning_rate": 1.1495407976657266e-05, + "loss": 0.0006, + "step": 14982 + }, + { + "epoch": 6.09312728751525, + "grad_norm": 8.235839511968356, + "learning_rate": 1.1494406173520646e-05, + "loss": 0.1377, + "step": 14983 + }, + { + "epoch": 6.0935339568930456, + "grad_norm": 4.8086895205887075, + "learning_rate": 1.1493404355043198e-05, + "loss": 0.1153, + "step": 14984 + }, + { + "epoch": 6.093940626270842, + "grad_norm": 0.8255973135455112, + "learning_rate": 1.1492402521235205e-05, + "loss": 0.0149, + "step": 14985 + }, + { + "epoch": 6.094347295648638, + "grad_norm": 0.14007248014346937, 
+ "learning_rate": 1.1491400672106942e-05, + "loss": 0.0022, + "step": 14986 + }, + { + "epoch": 6.094753965026434, + "grad_norm": 1.8562481520957954, + "learning_rate": 1.1490398807668706e-05, + "loss": 0.0261, + "step": 14987 + }, + { + "epoch": 6.0951606344042295, + "grad_norm": 2.245760957745336, + "learning_rate": 1.1489396927930773e-05, + "loss": 0.0539, + "step": 14988 + }, + { + "epoch": 6.095567303782025, + "grad_norm": 0.25781723435953235, + "learning_rate": 1.1488395032903435e-05, + "loss": 0.0043, + "step": 14989 + }, + { + "epoch": 6.095973973159821, + "grad_norm": 0.8331623553490698, + "learning_rate": 1.1487393122596967e-05, + "loss": 0.0173, + "step": 14990 + }, + { + "epoch": 6.096380642537617, + "grad_norm": 16.348284234640396, + "learning_rate": 1.1486391197021665e-05, + "loss": 0.3343, + "step": 14991 + }, + { + "epoch": 6.096787311915413, + "grad_norm": 3.7610936658567677, + "learning_rate": 1.1485389256187809e-05, + "loss": 0.1165, + "step": 14992 + }, + { + "epoch": 6.097193981293208, + "grad_norm": 9.181198425128635, + "learning_rate": 1.1484387300105683e-05, + "loss": 0.1627, + "step": 14993 + }, + { + "epoch": 6.097600650671004, + "grad_norm": 1.8989662784576262, + "learning_rate": 1.1483385328785574e-05, + "loss": 0.0367, + "step": 14994 + }, + { + "epoch": 6.098007320048801, + "grad_norm": 0.0917445708965094, + "learning_rate": 1.148238334223777e-05, + "loss": 0.0015, + "step": 14995 + }, + { + "epoch": 6.0984139894265965, + "grad_norm": 9.013678128125635, + "learning_rate": 1.1481381340472553e-05, + "loss": 0.302, + "step": 14996 + }, + { + "epoch": 6.098820658804392, + "grad_norm": 3.567552470859809, + "learning_rate": 1.1480379323500211e-05, + "loss": 0.0548, + "step": 14997 + }, + { + "epoch": 6.099227328182188, + "grad_norm": 2.6126267605397664, + "learning_rate": 1.147937729133103e-05, + "loss": 0.049, + "step": 14998 + }, + { + "epoch": 6.099633997559984, + "grad_norm": 5.442794470266099, + "learning_rate": 1.1478375243975298e-05, + "loss": 0.1626, + "step": 14999 + }, + { + "epoch": 6.10004066693778, + "grad_norm": 5.816951461537578, + "learning_rate": 1.14773731814433e-05, + "loss": 0.1343, + "step": 15000 + }, + { + "epoch": 6.100447336315575, + "grad_norm": 0.060038208274000825, + "learning_rate": 1.147637110374532e-05, + "loss": 0.001, + "step": 15001 + }, + { + "epoch": 6.100854005693371, + "grad_norm": 6.419724586433358, + "learning_rate": 1.147536901089165e-05, + "loss": 0.1705, + "step": 15002 + }, + { + "epoch": 6.101260675071167, + "grad_norm": 8.80710719653091, + "learning_rate": 1.1474366902892575e-05, + "loss": 0.2735, + "step": 15003 + }, + { + "epoch": 6.101667344448963, + "grad_norm": 9.468162331898771, + "learning_rate": 1.147336477975838e-05, + "loss": 0.5647, + "step": 15004 + }, + { + "epoch": 6.102074013826758, + "grad_norm": 0.8263614117052112, + "learning_rate": 1.1472362641499356e-05, + "loss": 0.015, + "step": 15005 + }, + { + "epoch": 6.102480683204555, + "grad_norm": 0.12329007812631321, + "learning_rate": 1.1471360488125788e-05, + "loss": 0.0024, + "step": 15006 + }, + { + "epoch": 6.102887352582351, + "grad_norm": 5.829975338130401, + "learning_rate": 1.1470358319647964e-05, + "loss": 0.125, + "step": 15007 + }, + { + "epoch": 6.103294021960147, + "grad_norm": 6.581420094310376, + "learning_rate": 1.1469356136076169e-05, + "loss": 0.1627, + "step": 15008 + }, + { + "epoch": 6.103700691337942, + "grad_norm": 7.262554031399239, + "learning_rate": 1.1468353937420695e-05, + "loss": 0.2613, + "step": 15009 + }, + { + "epoch": 
6.104107360715738, + "grad_norm": 3.767176777824847, + "learning_rate": 1.146735172369183e-05, + "loss": 0.1469, + "step": 15010 + }, + { + "epoch": 6.104514030093534, + "grad_norm": 4.8476631539516175, + "learning_rate": 1.1466349494899862e-05, + "loss": 0.0713, + "step": 15011 + }, + { + "epoch": 6.10492069947133, + "grad_norm": 0.3090659556864198, + "learning_rate": 1.1465347251055077e-05, + "loss": 0.0049, + "step": 15012 + }, + { + "epoch": 6.105327368849125, + "grad_norm": 0.45109212163727674, + "learning_rate": 1.1464344992167764e-05, + "loss": 0.0082, + "step": 15013 + }, + { + "epoch": 6.105734038226921, + "grad_norm": 3.2601575738655986, + "learning_rate": 1.1463342718248216e-05, + "loss": 0.0681, + "step": 15014 + }, + { + "epoch": 6.106140707604717, + "grad_norm": 4.1131665295536495, + "learning_rate": 1.1462340429306713e-05, + "loss": 0.0303, + "step": 15015 + }, + { + "epoch": 6.106547376982514, + "grad_norm": 10.420598573171286, + "learning_rate": 1.1461338125353553e-05, + "loss": 0.6216, + "step": 15016 + }, + { + "epoch": 6.106954046360309, + "grad_norm": 0.09332480267160419, + "learning_rate": 1.1460335806399024e-05, + "loss": 0.0015, + "step": 15017 + }, + { + "epoch": 6.107360715738105, + "grad_norm": 8.412515668089222, + "learning_rate": 1.1459333472453408e-05, + "loss": 0.1093, + "step": 15018 + }, + { + "epoch": 6.107767385115901, + "grad_norm": 6.597688645591895, + "learning_rate": 1.1458331123527004e-05, + "loss": 0.1508, + "step": 15019 + }, + { + "epoch": 6.108174054493697, + "grad_norm": 0.13099886111562514, + "learning_rate": 1.1457328759630095e-05, + "loss": 0.002, + "step": 15020 + }, + { + "epoch": 6.108580723871492, + "grad_norm": 0.3600600591725558, + "learning_rate": 1.145632638077297e-05, + "loss": 0.0051, + "step": 15021 + }, + { + "epoch": 6.108987393249288, + "grad_norm": 10.534880674122165, + "learning_rate": 1.1455323986965927e-05, + "loss": 0.5976, + "step": 15022 + }, + { + "epoch": 6.109394062627084, + "grad_norm": 4.9468482581664075, + "learning_rate": 1.145432157821925e-05, + "loss": 0.1134, + "step": 15023 + }, + { + "epoch": 6.10980073200488, + "grad_norm": 11.204918707820172, + "learning_rate": 1.1453319154543228e-05, + "loss": 0.5597, + "step": 15024 + }, + { + "epoch": 6.1102074013826755, + "grad_norm": 8.353494549596038, + "learning_rate": 1.1452316715948154e-05, + "loss": 0.3005, + "step": 15025 + }, + { + "epoch": 6.110614070760472, + "grad_norm": 0.37734573320938236, + "learning_rate": 1.1451314262444317e-05, + "loss": 0.0052, + "step": 15026 + }, + { + "epoch": 6.111020740138268, + "grad_norm": 6.542915170008922, + "learning_rate": 1.145031179404201e-05, + "loss": 0.196, + "step": 15027 + }, + { + "epoch": 6.111427409516064, + "grad_norm": 7.26277806215795, + "learning_rate": 1.1449309310751521e-05, + "loss": 0.0839, + "step": 15028 + }, + { + "epoch": 6.111834078893859, + "grad_norm": 4.894837215695648, + "learning_rate": 1.1448306812583143e-05, + "loss": 0.1419, + "step": 15029 + }, + { + "epoch": 6.112240748271655, + "grad_norm": 0.3793726333161247, + "learning_rate": 1.1447304299547168e-05, + "loss": 0.0055, + "step": 15030 + }, + { + "epoch": 6.112647417649451, + "grad_norm": 3.4485156799687666, + "learning_rate": 1.1446301771653885e-05, + "loss": 0.0482, + "step": 15031 + }, + { + "epoch": 6.113054087027247, + "grad_norm": 0.14989945482020817, + "learning_rate": 1.1445299228913587e-05, + "loss": 0.0027, + "step": 15032 + }, + { + "epoch": 6.1134607564050425, + "grad_norm": 2.8081203961344587, + "learning_rate": 
1.1444296671336566e-05, + "loss": 0.057, + "step": 15033 + }, + { + "epoch": 6.113867425782838, + "grad_norm": 3.6980677589478783, + "learning_rate": 1.1443294098933109e-05, + "loss": 0.0391, + "step": 15034 + }, + { + "epoch": 6.114274095160634, + "grad_norm": 5.652783908591381, + "learning_rate": 1.1442291511713514e-05, + "loss": 0.1107, + "step": 15035 + }, + { + "epoch": 6.114680764538431, + "grad_norm": 0.20096352607581044, + "learning_rate": 1.1441288909688073e-05, + "loss": 0.0019, + "step": 15036 + }, + { + "epoch": 6.1150874339162264, + "grad_norm": 4.390830840279658, + "learning_rate": 1.1440286292867074e-05, + "loss": 0.0738, + "step": 15037 + }, + { + "epoch": 6.115494103294022, + "grad_norm": 0.04396367316265498, + "learning_rate": 1.143928366126081e-05, + "loss": 0.0008, + "step": 15038 + }, + { + "epoch": 6.115900772671818, + "grad_norm": 0.07807232471066737, + "learning_rate": 1.1438281014879575e-05, + "loss": 0.001, + "step": 15039 + }, + { + "epoch": 6.116307442049614, + "grad_norm": 0.7390384977869521, + "learning_rate": 1.1437278353733662e-05, + "loss": 0.0077, + "step": 15040 + }, + { + "epoch": 6.1167141114274095, + "grad_norm": 5.810088516557111, + "learning_rate": 1.1436275677833363e-05, + "loss": 0.3659, + "step": 15041 + }, + { + "epoch": 6.117120780805205, + "grad_norm": 0.409027229963107, + "learning_rate": 1.1435272987188974e-05, + "loss": 0.002, + "step": 15042 + }, + { + "epoch": 6.117527450183001, + "grad_norm": 22.466988984959425, + "learning_rate": 1.1434270281810787e-05, + "loss": 0.7091, + "step": 15043 + }, + { + "epoch": 6.117934119560797, + "grad_norm": 2.9124949340420674, + "learning_rate": 1.1433267561709087e-05, + "loss": 0.037, + "step": 15044 + }, + { + "epoch": 6.118340788938593, + "grad_norm": 0.6151705164020903, + "learning_rate": 1.1432264826894179e-05, + "loss": 0.0095, + "step": 15045 + }, + { + "epoch": 6.118747458316388, + "grad_norm": 1.43440112429448, + "learning_rate": 1.143126207737635e-05, + "loss": 0.0179, + "step": 15046 + }, + { + "epoch": 6.119154127694185, + "grad_norm": 0.4090616511934253, + "learning_rate": 1.1430259313165895e-05, + "loss": 0.0064, + "step": 15047 + }, + { + "epoch": 6.119560797071981, + "grad_norm": 5.3496000650902165, + "learning_rate": 1.142925653427311e-05, + "loss": 0.2244, + "step": 15048 + }, + { + "epoch": 6.1199674664497765, + "grad_norm": 4.2538853142138215, + "learning_rate": 1.1428253740708286e-05, + "loss": 0.0564, + "step": 15049 + }, + { + "epoch": 6.120374135827572, + "grad_norm": 5.315386864231933, + "learning_rate": 1.1427250932481717e-05, + "loss": 0.1158, + "step": 15050 + }, + { + "epoch": 6.120780805205368, + "grad_norm": 8.613503131041364, + "learning_rate": 1.1426248109603702e-05, + "loss": 0.1806, + "step": 15051 + }, + { + "epoch": 6.121187474583164, + "grad_norm": 1.8984092149324765, + "learning_rate": 1.1425245272084532e-05, + "loss": 0.0317, + "step": 15052 + }, + { + "epoch": 6.12159414396096, + "grad_norm": 3.3908319297504947, + "learning_rate": 1.1424242419934496e-05, + "loss": 0.0628, + "step": 15053 + }, + { + "epoch": 6.122000813338755, + "grad_norm": 0.7850157148949096, + "learning_rate": 1.1423239553163903e-05, + "loss": 0.0127, + "step": 15054 + }, + { + "epoch": 6.122407482716551, + "grad_norm": 3.4292732596331117, + "learning_rate": 1.1422236671783036e-05, + "loss": 0.053, + "step": 15055 + }, + { + "epoch": 6.122814152094347, + "grad_norm": 3.0010576919950163, + "learning_rate": 1.1421233775802195e-05, + "loss": 0.0242, + "step": 15056 + }, + { + "epoch": 
6.1232208214721435, + "grad_norm": 0.12590792933528785, + "learning_rate": 1.1420230865231672e-05, + "loss": 0.0016, + "step": 15057 + }, + { + "epoch": 6.123627490849939, + "grad_norm": 6.225513350418883, + "learning_rate": 1.1419227940081766e-05, + "loss": 0.1509, + "step": 15058 + }, + { + "epoch": 6.124034160227735, + "grad_norm": 0.6543861061872993, + "learning_rate": 1.1418225000362771e-05, + "loss": 0.0097, + "step": 15059 + }, + { + "epoch": 6.124440829605531, + "grad_norm": 7.5817594194497735, + "learning_rate": 1.1417222046084979e-05, + "loss": 0.0922, + "step": 15060 + }, + { + "epoch": 6.124847498983327, + "grad_norm": 10.044400631269763, + "learning_rate": 1.1416219077258695e-05, + "loss": 0.4756, + "step": 15061 + }, + { + "epoch": 6.125254168361122, + "grad_norm": 7.043366763761241, + "learning_rate": 1.1415216093894206e-05, + "loss": 0.1597, + "step": 15062 + }, + { + "epoch": 6.125660837738918, + "grad_norm": 0.6458814954014561, + "learning_rate": 1.141421309600181e-05, + "loss": 0.0077, + "step": 15063 + }, + { + "epoch": 6.126067507116714, + "grad_norm": 5.5531990973501895, + "learning_rate": 1.1413210083591808e-05, + "loss": 0.1453, + "step": 15064 + }, + { + "epoch": 6.12647417649451, + "grad_norm": 0.538455270794319, + "learning_rate": 1.1412207056674493e-05, + "loss": 0.0063, + "step": 15065 + }, + { + "epoch": 6.126880845872305, + "grad_norm": 0.2626877325189282, + "learning_rate": 1.141120401526016e-05, + "loss": 0.0029, + "step": 15066 + }, + { + "epoch": 6.127287515250102, + "grad_norm": 22.345871867284696, + "learning_rate": 1.1410200959359111e-05, + "loss": 1.2714, + "step": 15067 + }, + { + "epoch": 6.127694184627898, + "grad_norm": 5.943093248307916, + "learning_rate": 1.1409197888981635e-05, + "loss": 0.1134, + "step": 15068 + }, + { + "epoch": 6.128100854005694, + "grad_norm": 0.2566256581569314, + "learning_rate": 1.1408194804138037e-05, + "loss": 0.0052, + "step": 15069 + }, + { + "epoch": 6.128507523383489, + "grad_norm": 0.22086934420438165, + "learning_rate": 1.1407191704838608e-05, + "loss": 0.0037, + "step": 15070 + }, + { + "epoch": 6.128914192761285, + "grad_norm": 5.47762982422143, + "learning_rate": 1.140618859109365e-05, + "loss": 0.1146, + "step": 15071 + }, + { + "epoch": 6.129320862139081, + "grad_norm": 0.20792291398614157, + "learning_rate": 1.1405185462913458e-05, + "loss": 0.0032, + "step": 15072 + }, + { + "epoch": 6.129727531516877, + "grad_norm": 0.8656685962421539, + "learning_rate": 1.1404182320308329e-05, + "loss": 0.0122, + "step": 15073 + }, + { + "epoch": 6.130134200894672, + "grad_norm": 0.19435402632385293, + "learning_rate": 1.140317916328856e-05, + "loss": 0.0046, + "step": 15074 + }, + { + "epoch": 6.130540870272468, + "grad_norm": 1.449879303845854, + "learning_rate": 1.1402175991864455e-05, + "loss": 0.0337, + "step": 15075 + }, + { + "epoch": 6.130947539650264, + "grad_norm": 1.4085348087993852, + "learning_rate": 1.1401172806046304e-05, + "loss": 0.0078, + "step": 15076 + }, + { + "epoch": 6.131354209028061, + "grad_norm": 1.370562075053539, + "learning_rate": 1.1400169605844412e-05, + "loss": 0.0087, + "step": 15077 + }, + { + "epoch": 6.131760878405856, + "grad_norm": 6.485248143622741, + "learning_rate": 1.1399166391269072e-05, + "loss": 0.2321, + "step": 15078 + }, + { + "epoch": 6.132167547783652, + "grad_norm": 0.6729549285328724, + "learning_rate": 1.1398163162330584e-05, + "loss": 0.0099, + "step": 15079 + }, + { + "epoch": 6.132574217161448, + "grad_norm": 1.80771675909857, + "learning_rate": 
1.1397159919039251e-05, + "loss": 0.0286, + "step": 15080 + }, + { + "epoch": 6.132980886539244, + "grad_norm": 8.766242180119203, + "learning_rate": 1.1396156661405366e-05, + "loss": 0.3412, + "step": 15081 + }, + { + "epoch": 6.133387555917039, + "grad_norm": 0.6015131492607251, + "learning_rate": 1.1395153389439232e-05, + "loss": 0.0059, + "step": 15082 + }, + { + "epoch": 6.133794225294835, + "grad_norm": 7.099194092128932, + "learning_rate": 1.1394150103151142e-05, + "loss": 0.2676, + "step": 15083 + }, + { + "epoch": 6.134200894672631, + "grad_norm": 10.41958751508465, + "learning_rate": 1.1393146802551405e-05, + "loss": 0.3986, + "step": 15084 + }, + { + "epoch": 6.134607564050427, + "grad_norm": 1.2255991145438165, + "learning_rate": 1.1392143487650313e-05, + "loss": 0.008, + "step": 15085 + }, + { + "epoch": 6.1350142334282225, + "grad_norm": 0.08622138106967574, + "learning_rate": 1.1391140158458165e-05, + "loss": 0.0013, + "step": 15086 + }, + { + "epoch": 6.135420902806018, + "grad_norm": 2.511868856973603, + "learning_rate": 1.1390136814985266e-05, + "loss": 0.0422, + "step": 15087 + }, + { + "epoch": 6.135827572183815, + "grad_norm": 5.2035529366140905, + "learning_rate": 1.1389133457241911e-05, + "loss": 0.1671, + "step": 15088 + }, + { + "epoch": 6.136234241561611, + "grad_norm": 0.4438054561183995, + "learning_rate": 1.1388130085238402e-05, + "loss": 0.007, + "step": 15089 + }, + { + "epoch": 6.1366409109394064, + "grad_norm": 4.731825528107997, + "learning_rate": 1.138712669898504e-05, + "loss": 0.097, + "step": 15090 + }, + { + "epoch": 6.137047580317202, + "grad_norm": 7.6514219276013495, + "learning_rate": 1.1386123298492126e-05, + "loss": 0.1549, + "step": 15091 + }, + { + "epoch": 6.137454249694998, + "grad_norm": 0.16403715329974553, + "learning_rate": 1.1385119883769958e-05, + "loss": 0.0025, + "step": 15092 + }, + { + "epoch": 6.137860919072794, + "grad_norm": 5.320832773220786, + "learning_rate": 1.1384116454828838e-05, + "loss": 0.1524, + "step": 15093 + }, + { + "epoch": 6.1382675884505895, + "grad_norm": 0.02801028248436501, + "learning_rate": 1.1383113011679063e-05, + "loss": 0.0006, + "step": 15094 + }, + { + "epoch": 6.138674257828385, + "grad_norm": 8.791520065148696, + "learning_rate": 1.1382109554330936e-05, + "loss": 0.4901, + "step": 15095 + }, + { + "epoch": 6.139080927206181, + "grad_norm": 5.840136318867933, + "learning_rate": 1.1381106082794762e-05, + "loss": 0.2934, + "step": 15096 + }, + { + "epoch": 6.139487596583977, + "grad_norm": 3.953208316829767, + "learning_rate": 1.1380102597080838e-05, + "loss": 0.0752, + "step": 15097 + }, + { + "epoch": 6.1398942659617735, + "grad_norm": 0.3772456850570327, + "learning_rate": 1.1379099097199465e-05, + "loss": 0.007, + "step": 15098 + }, + { + "epoch": 6.140300935339569, + "grad_norm": 2.9713880629800826, + "learning_rate": 1.1378095583160946e-05, + "loss": 0.0719, + "step": 15099 + }, + { + "epoch": 6.140707604717365, + "grad_norm": 0.03223007018320577, + "learning_rate": 1.1377092054975586e-05, + "loss": 0.0005, + "step": 15100 + }, + { + "epoch": 6.141114274095161, + "grad_norm": 2.147116483662377, + "learning_rate": 1.137608851265368e-05, + "loss": 0.0193, + "step": 15101 + }, + { + "epoch": 6.1415209434729565, + "grad_norm": 4.50996941691059, + "learning_rate": 1.1375084956205531e-05, + "loss": 0.0664, + "step": 15102 + }, + { + "epoch": 6.141927612850752, + "grad_norm": 0.08045968591805665, + "learning_rate": 1.1374081385641444e-05, + "loss": 0.0011, + "step": 15103 + }, + { + "epoch": 
6.142334282228548, + "grad_norm": 4.564537210744847, + "learning_rate": 1.137307780097172e-05, + "loss": 0.1221, + "step": 15104 + }, + { + "epoch": 6.142740951606344, + "grad_norm": 0.8552728335158079, + "learning_rate": 1.1372074202206663e-05, + "loss": 0.0132, + "step": 15105 + }, + { + "epoch": 6.14314762098414, + "grad_norm": 5.640047978660882, + "learning_rate": 1.1371070589356571e-05, + "loss": 0.0885, + "step": 15106 + }, + { + "epoch": 6.143554290361935, + "grad_norm": 0.08370433001743391, + "learning_rate": 1.1370066962431754e-05, + "loss": 0.0014, + "step": 15107 + }, + { + "epoch": 6.143960959739732, + "grad_norm": 24.62293964597747, + "learning_rate": 1.1369063321442506e-05, + "loss": 0.8551, + "step": 15108 + }, + { + "epoch": 6.144367629117528, + "grad_norm": 0.010904934318038201, + "learning_rate": 1.1368059666399136e-05, + "loss": 0.0002, + "step": 15109 + }, + { + "epoch": 6.1447742984953235, + "grad_norm": 13.321546409039003, + "learning_rate": 1.1367055997311944e-05, + "loss": 0.2815, + "step": 15110 + }, + { + "epoch": 6.145180967873119, + "grad_norm": 0.38102976213570483, + "learning_rate": 1.1366052314191234e-05, + "loss": 0.0071, + "step": 15111 + }, + { + "epoch": 6.145587637250915, + "grad_norm": 1.5287417558170098, + "learning_rate": 1.136504861704731e-05, + "loss": 0.0258, + "step": 15112 + }, + { + "epoch": 6.145994306628711, + "grad_norm": 3.0264045150488346, + "learning_rate": 1.1364044905890475e-05, + "loss": 0.0944, + "step": 15113 + }, + { + "epoch": 6.146400976006507, + "grad_norm": 0.1545705126482043, + "learning_rate": 1.1363041180731034e-05, + "loss": 0.0028, + "step": 15114 + }, + { + "epoch": 6.146807645384302, + "grad_norm": 0.0804737290775603, + "learning_rate": 1.1362037441579285e-05, + "loss": 0.0009, + "step": 15115 + }, + { + "epoch": 6.147214314762098, + "grad_norm": 0.09049652964076879, + "learning_rate": 1.1361033688445539e-05, + "loss": 0.0017, + "step": 15116 + }, + { + "epoch": 6.147620984139894, + "grad_norm": 0.16074564896083177, + "learning_rate": 1.1360029921340097e-05, + "loss": 0.0021, + "step": 15117 + }, + { + "epoch": 6.1480276535176905, + "grad_norm": 4.318768406993206, + "learning_rate": 1.135902614027326e-05, + "loss": 0.1091, + "step": 15118 + }, + { + "epoch": 6.148434322895486, + "grad_norm": 8.282248748751378, + "learning_rate": 1.1358022345255341e-05, + "loss": 0.3785, + "step": 15119 + }, + { + "epoch": 6.148840992273282, + "grad_norm": 0.21834111646568172, + "learning_rate": 1.1357018536296638e-05, + "loss": 0.0026, + "step": 15120 + }, + { + "epoch": 6.149247661651078, + "grad_norm": 6.030280727119519, + "learning_rate": 1.1356014713407455e-05, + "loss": 0.2686, + "step": 15121 + }, + { + "epoch": 6.149654331028874, + "grad_norm": 0.3239646501257251, + "learning_rate": 1.13550108765981e-05, + "loss": 0.0051, + "step": 15122 + }, + { + "epoch": 6.150061000406669, + "grad_norm": 0.6327639422893351, + "learning_rate": 1.1354007025878877e-05, + "loss": 0.009, + "step": 15123 + }, + { + "epoch": 6.150467669784465, + "grad_norm": 0.03161835541658961, + "learning_rate": 1.1353003161260089e-05, + "loss": 0.0004, + "step": 15124 + }, + { + "epoch": 6.150874339162261, + "grad_norm": 0.1267312444656025, + "learning_rate": 1.1351999282752042e-05, + "loss": 0.0014, + "step": 15125 + }, + { + "epoch": 6.151281008540057, + "grad_norm": 2.7842420992505095, + "learning_rate": 1.1350995390365042e-05, + "loss": 0.041, + "step": 15126 + }, + { + "epoch": 6.151687677917852, + "grad_norm": 3.825238555843819, + "learning_rate": 
1.1349991484109397e-05, + "loss": 0.0876, + "step": 15127 + }, + { + "epoch": 6.152094347295648, + "grad_norm": 0.015225500327809654, + "learning_rate": 1.1348987563995406e-05, + "loss": 0.0003, + "step": 15128 + }, + { + "epoch": 6.152501016673445, + "grad_norm": 0.020031417865711186, + "learning_rate": 1.1347983630033381e-05, + "loss": 0.0004, + "step": 15129 + }, + { + "epoch": 6.152907686051241, + "grad_norm": 0.6389513676889053, + "learning_rate": 1.1346979682233625e-05, + "loss": 0.011, + "step": 15130 + }, + { + "epoch": 6.153314355429036, + "grad_norm": 0.8288536523422853, + "learning_rate": 1.1345975720606445e-05, + "loss": 0.0099, + "step": 15131 + }, + { + "epoch": 6.153721024806832, + "grad_norm": 2.8476601500543333, + "learning_rate": 1.1344971745162148e-05, + "loss": 0.0421, + "step": 15132 + }, + { + "epoch": 6.154127694184628, + "grad_norm": 2.9039971936796736, + "learning_rate": 1.1343967755911038e-05, + "loss": 0.0251, + "step": 15133 + }, + { + "epoch": 6.154534363562424, + "grad_norm": 3.797856795597223, + "learning_rate": 1.134296375286342e-05, + "loss": 0.098, + "step": 15134 + }, + { + "epoch": 6.154941032940219, + "grad_norm": 9.229025292022358, + "learning_rate": 1.1341959736029607e-05, + "loss": 0.3961, + "step": 15135 + }, + { + "epoch": 6.155347702318015, + "grad_norm": 9.41875149224389, + "learning_rate": 1.13409557054199e-05, + "loss": 0.1362, + "step": 15136 + }, + { + "epoch": 6.155754371695811, + "grad_norm": 1.770762632321324, + "learning_rate": 1.1339951661044608e-05, + "loss": 0.0326, + "step": 15137 + }, + { + "epoch": 6.156161041073607, + "grad_norm": 7.922611933539953, + "learning_rate": 1.133894760291404e-05, + "loss": 0.4314, + "step": 15138 + }, + { + "epoch": 6.156567710451403, + "grad_norm": 0.2843797137365436, + "learning_rate": 1.1337943531038498e-05, + "loss": 0.004, + "step": 15139 + }, + { + "epoch": 6.156974379829199, + "grad_norm": 10.275686649464424, + "learning_rate": 1.1336939445428294e-05, + "loss": 0.5648, + "step": 15140 + }, + { + "epoch": 6.157381049206995, + "grad_norm": 1.1540050176826535, + "learning_rate": 1.1335935346093732e-05, + "loss": 0.0182, + "step": 15141 + }, + { + "epoch": 6.157787718584791, + "grad_norm": 0.15318523377637824, + "learning_rate": 1.1334931233045124e-05, + "loss": 0.0018, + "step": 15142 + }, + { + "epoch": 6.1581943879625864, + "grad_norm": 9.398998793110769, + "learning_rate": 1.1333927106292774e-05, + "loss": 0.2503, + "step": 15143 + }, + { + "epoch": 6.158601057340382, + "grad_norm": 0.567526965846042, + "learning_rate": 1.133292296584699e-05, + "loss": 0.0085, + "step": 15144 + }, + { + "epoch": 6.159007726718178, + "grad_norm": 3.765824588685214, + "learning_rate": 1.1331918811718084e-05, + "loss": 0.0599, + "step": 15145 + }, + { + "epoch": 6.159414396095974, + "grad_norm": 2.778099097552796, + "learning_rate": 1.1330914643916358e-05, + "loss": 0.0409, + "step": 15146 + }, + { + "epoch": 6.1598210654737695, + "grad_norm": 9.775369127385208, + "learning_rate": 1.1329910462452122e-05, + "loss": 0.6606, + "step": 15147 + }, + { + "epoch": 6.160227734851565, + "grad_norm": 8.599124698985817, + "learning_rate": 1.132890626733569e-05, + "loss": 0.2141, + "step": 15148 + }, + { + "epoch": 6.160634404229362, + "grad_norm": 0.973278826448455, + "learning_rate": 1.1327902058577362e-05, + "loss": 0.012, + "step": 15149 + }, + { + "epoch": 6.161041073607158, + "grad_norm": 9.328699597036994, + "learning_rate": 1.1326897836187451e-05, + "loss": 0.1986, + "step": 15150 + }, + { + "epoch": 6.1614477429849535, 
+ "grad_norm": 0.44829063910734523, + "learning_rate": 1.132589360017627e-05, + "loss": 0.0062, + "step": 15151 + }, + { + "epoch": 6.161854412362749, + "grad_norm": 0.06666631877476814, + "learning_rate": 1.1324889350554122e-05, + "loss": 0.0013, + "step": 15152 + }, + { + "epoch": 6.162261081740545, + "grad_norm": 8.03572022071569, + "learning_rate": 1.1323885087331317e-05, + "loss": 0.1854, + "step": 15153 + }, + { + "epoch": 6.162667751118341, + "grad_norm": 2.6601813299802277, + "learning_rate": 1.1322880810518167e-05, + "loss": 0.0698, + "step": 15154 + }, + { + "epoch": 6.1630744204961365, + "grad_norm": 4.865614708307379, + "learning_rate": 1.132187652012498e-05, + "loss": 0.1138, + "step": 15155 + }, + { + "epoch": 6.163481089873932, + "grad_norm": 6.154731155046634, + "learning_rate": 1.1320872216162061e-05, + "loss": 0.1447, + "step": 15156 + }, + { + "epoch": 6.163887759251728, + "grad_norm": 1.0998536111908892, + "learning_rate": 1.1319867898639727e-05, + "loss": 0.0193, + "step": 15157 + }, + { + "epoch": 6.164294428629524, + "grad_norm": 0.08066558605205318, + "learning_rate": 1.1318863567568285e-05, + "loss": 0.0016, + "step": 15158 + }, + { + "epoch": 6.1647010980073205, + "grad_norm": 0.033317465040674066, + "learning_rate": 1.1317859222958045e-05, + "loss": 0.0005, + "step": 15159 + }, + { + "epoch": 6.165107767385116, + "grad_norm": 1.3450338820885122, + "learning_rate": 1.1316854864819315e-05, + "loss": 0.0325, + "step": 15160 + }, + { + "epoch": 6.165514436762912, + "grad_norm": 0.19051313592717256, + "learning_rate": 1.1315850493162409e-05, + "loss": 0.0031, + "step": 15161 + }, + { + "epoch": 6.165921106140708, + "grad_norm": 1.345277948087163, + "learning_rate": 1.1314846107997634e-05, + "loss": 0.0285, + "step": 15162 + }, + { + "epoch": 6.1663277755185035, + "grad_norm": 4.658111496989852, + "learning_rate": 1.1313841709335303e-05, + "loss": 0.0886, + "step": 15163 + }, + { + "epoch": 6.166734444896299, + "grad_norm": 6.093740704037228, + "learning_rate": 1.1312837297185725e-05, + "loss": 0.1312, + "step": 15164 + }, + { + "epoch": 6.167141114274095, + "grad_norm": 0.06458334203666112, + "learning_rate": 1.1311832871559211e-05, + "loss": 0.0007, + "step": 15165 + }, + { + "epoch": 6.167547783651891, + "grad_norm": 0.40066631993398544, + "learning_rate": 1.1310828432466071e-05, + "loss": 0.0054, + "step": 15166 + }, + { + "epoch": 6.167954453029687, + "grad_norm": 4.7751480076011825, + "learning_rate": 1.130982397991662e-05, + "loss": 0.108, + "step": 15167 + }, + { + "epoch": 6.168361122407482, + "grad_norm": 0.20202173125926076, + "learning_rate": 1.1308819513921167e-05, + "loss": 0.0034, + "step": 15168 + }, + { + "epoch": 6.168767791785278, + "grad_norm": 2.324197354826805, + "learning_rate": 1.1307815034490021e-05, + "loss": 0.0446, + "step": 15169 + }, + { + "epoch": 6.169174461163075, + "grad_norm": 0.16609340063433206, + "learning_rate": 1.1306810541633498e-05, + "loss": 0.0024, + "step": 15170 + }, + { + "epoch": 6.1695811305408705, + "grad_norm": 4.144465485499571, + "learning_rate": 1.1305806035361905e-05, + "loss": 0.0719, + "step": 15171 + }, + { + "epoch": 6.169987799918666, + "grad_norm": 0.061950415642948994, + "learning_rate": 1.1304801515685557e-05, + "loss": 0.0006, + "step": 15172 + }, + { + "epoch": 6.170394469296462, + "grad_norm": 3.731833062046297, + "learning_rate": 1.1303796982614762e-05, + "loss": 0.0858, + "step": 15173 + }, + { + "epoch": 6.170801138674258, + "grad_norm": 4.2829644701597, + "learning_rate": 1.1302792436159839e-05, + 
"loss": 0.213, + "step": 15174 + }, + { + "epoch": 6.171207808052054, + "grad_norm": 0.2035393045935386, + "learning_rate": 1.1301787876331094e-05, + "loss": 0.0035, + "step": 15175 + }, + { + "epoch": 6.171614477429849, + "grad_norm": 6.075338570598599, + "learning_rate": 1.130078330313884e-05, + "loss": 0.1162, + "step": 15176 + }, + { + "epoch": 6.172021146807645, + "grad_norm": 2.188811472315321, + "learning_rate": 1.1299778716593393e-05, + "loss": 0.0127, + "step": 15177 + }, + { + "epoch": 6.172427816185441, + "grad_norm": 3.140781186232963, + "learning_rate": 1.1298774116705062e-05, + "loss": 0.0458, + "step": 15178 + }, + { + "epoch": 6.172834485563237, + "grad_norm": 1.1719082618503036, + "learning_rate": 1.129776950348416e-05, + "loss": 0.0276, + "step": 15179 + }, + { + "epoch": 6.173241154941033, + "grad_norm": 0.006113329967108794, + "learning_rate": 1.1296764876941003e-05, + "loss": 0.0001, + "step": 15180 + }, + { + "epoch": 6.173647824318829, + "grad_norm": 5.10354330812312, + "learning_rate": 1.12957602370859e-05, + "loss": 0.0944, + "step": 15181 + }, + { + "epoch": 6.174054493696625, + "grad_norm": 2.8564311091567127, + "learning_rate": 1.1294755583929167e-05, + "loss": 0.0373, + "step": 15182 + }, + { + "epoch": 6.174461163074421, + "grad_norm": 6.143898007634328, + "learning_rate": 1.1293750917481116e-05, + "loss": 0.0872, + "step": 15183 + }, + { + "epoch": 6.174867832452216, + "grad_norm": 6.249381836630059, + "learning_rate": 1.1292746237752062e-05, + "loss": 0.317, + "step": 15184 + }, + { + "epoch": 6.175274501830012, + "grad_norm": 0.21939137366529757, + "learning_rate": 1.1291741544752317e-05, + "loss": 0.0027, + "step": 15185 + }, + { + "epoch": 6.175681171207808, + "grad_norm": 3.8795535166189734, + "learning_rate": 1.129073683849219e-05, + "loss": 0.1219, + "step": 15186 + }, + { + "epoch": 6.176087840585604, + "grad_norm": 1.524007746375413, + "learning_rate": 1.1289732118982007e-05, + "loss": 0.011, + "step": 15187 + }, + { + "epoch": 6.176494509963399, + "grad_norm": 8.857884855272628, + "learning_rate": 1.1288727386232072e-05, + "loss": 0.2564, + "step": 15188 + }, + { + "epoch": 6.176901179341195, + "grad_norm": 0.41874708900147656, + "learning_rate": 1.12877226402527e-05, + "loss": 0.0046, + "step": 15189 + }, + { + "epoch": 6.177307848718992, + "grad_norm": 0.07659205613780672, + "learning_rate": 1.1286717881054208e-05, + "loss": 0.0011, + "step": 15190 + }, + { + "epoch": 6.177714518096788, + "grad_norm": 0.24162758521812214, + "learning_rate": 1.1285713108646912e-05, + "loss": 0.0033, + "step": 15191 + }, + { + "epoch": 6.178121187474583, + "grad_norm": 10.019591845873848, + "learning_rate": 1.128470832304112e-05, + "loss": 0.2671, + "step": 15192 + }, + { + "epoch": 6.178527856852379, + "grad_norm": 0.02105578055229575, + "learning_rate": 1.1283703524247153e-05, + "loss": 0.0006, + "step": 15193 + }, + { + "epoch": 6.178934526230175, + "grad_norm": 0.055950186178952754, + "learning_rate": 1.1282698712275324e-05, + "loss": 0.001, + "step": 15194 + }, + { + "epoch": 6.179341195607971, + "grad_norm": 0.0499479605181853, + "learning_rate": 1.1281693887135946e-05, + "loss": 0.0009, + "step": 15195 + }, + { + "epoch": 6.1797478649857664, + "grad_norm": 0.08911207429178176, + "learning_rate": 1.1280689048839332e-05, + "loss": 0.0016, + "step": 15196 + }, + { + "epoch": 6.180154534363562, + "grad_norm": 9.299501270911358, + "learning_rate": 1.1279684197395807e-05, + "loss": 0.3533, + "step": 15197 + }, + { + "epoch": 6.180561203741358, + "grad_norm": 
1.6114627303178557, + "learning_rate": 1.1278679332815673e-05, + "loss": 0.0296, + "step": 15198 + }, + { + "epoch": 6.180967873119154, + "grad_norm": 4.338630307023902, + "learning_rate": 1.1277674455109255e-05, + "loss": 0.1115, + "step": 15199 + }, + { + "epoch": 6.18137454249695, + "grad_norm": 8.587490235556336, + "learning_rate": 1.1276669564286866e-05, + "loss": 0.3155, + "step": 15200 + }, + { + "epoch": 6.181781211874746, + "grad_norm": 8.651012955044887, + "learning_rate": 1.1275664660358818e-05, + "loss": 0.3526, + "step": 15201 + }, + { + "epoch": 6.182187881252542, + "grad_norm": 1.4220100485007816, + "learning_rate": 1.1274659743335434e-05, + "loss": 0.0331, + "step": 15202 + }, + { + "epoch": 6.182594550630338, + "grad_norm": 0.044086579134278284, + "learning_rate": 1.1273654813227026e-05, + "loss": 0.0007, + "step": 15203 + }, + { + "epoch": 6.1830012200081335, + "grad_norm": 0.06268271069464895, + "learning_rate": 1.127264987004391e-05, + "loss": 0.001, + "step": 15204 + }, + { + "epoch": 6.183407889385929, + "grad_norm": 1.0414560727734368, + "learning_rate": 1.12716449137964e-05, + "loss": 0.0171, + "step": 15205 + }, + { + "epoch": 6.183814558763725, + "grad_norm": 3.4715349647077747, + "learning_rate": 1.1270639944494818e-05, + "loss": 0.0452, + "step": 15206 + }, + { + "epoch": 6.184221228141521, + "grad_norm": 2.2186536348262655, + "learning_rate": 1.1269634962149478e-05, + "loss": 0.0249, + "step": 15207 + }, + { + "epoch": 6.1846278975193165, + "grad_norm": 14.035004217087227, + "learning_rate": 1.1268629966770692e-05, + "loss": 0.2028, + "step": 15208 + }, + { + "epoch": 6.185034566897112, + "grad_norm": 6.296538142660966, + "learning_rate": 1.1267624958368785e-05, + "loss": 0.2676, + "step": 15209 + }, + { + "epoch": 6.185441236274908, + "grad_norm": 0.18329851142609405, + "learning_rate": 1.1266619936954068e-05, + "loss": 0.0033, + "step": 15210 + }, + { + "epoch": 6.185847905652705, + "grad_norm": 10.31508027851138, + "learning_rate": 1.126561490253686e-05, + "loss": 0.2744, + "step": 15211 + }, + { + "epoch": 6.1862545750305005, + "grad_norm": 3.0055207801491486, + "learning_rate": 1.1264609855127479e-05, + "loss": 0.0587, + "step": 15212 + }, + { + "epoch": 6.186661244408296, + "grad_norm": 2.0855656820807416, + "learning_rate": 1.126360479473624e-05, + "loss": 0.031, + "step": 15213 + }, + { + "epoch": 6.187067913786092, + "grad_norm": 3.679702989423906, + "learning_rate": 1.1262599721373462e-05, + "loss": 0.123, + "step": 15214 + }, + { + "epoch": 6.187474583163888, + "grad_norm": 7.443186832521177, + "learning_rate": 1.1261594635049465e-05, + "loss": 0.2745, + "step": 15215 + }, + { + "epoch": 6.1878812525416835, + "grad_norm": 8.05836273803899, + "learning_rate": 1.1260589535774562e-05, + "loss": 0.1794, + "step": 15216 + }, + { + "epoch": 6.188287921919479, + "grad_norm": 0.22064331720961838, + "learning_rate": 1.1259584423559074e-05, + "loss": 0.0035, + "step": 15217 + }, + { + "epoch": 6.188694591297275, + "grad_norm": 1.2885749747479667, + "learning_rate": 1.1258579298413314e-05, + "loss": 0.0192, + "step": 15218 + }, + { + "epoch": 6.189101260675071, + "grad_norm": 0.06675924854973732, + "learning_rate": 1.1257574160347609e-05, + "loss": 0.0006, + "step": 15219 + }, + { + "epoch": 6.189507930052867, + "grad_norm": 2.445792511409868, + "learning_rate": 1.125656900937227e-05, + "loss": 0.0356, + "step": 15220 + }, + { + "epoch": 6.189914599430663, + "grad_norm": 0.011081624850998293, + "learning_rate": 1.1255563845497617e-05, + "loss": 0.0002, + 
"step": 15221 + }, + { + "epoch": 6.190321268808459, + "grad_norm": 0.4547391943060818, + "learning_rate": 1.125455866873397e-05, + "loss": 0.0084, + "step": 15222 + }, + { + "epoch": 6.190727938186255, + "grad_norm": 1.3926580394893042, + "learning_rate": 1.125355347909165e-05, + "loss": 0.03, + "step": 15223 + }, + { + "epoch": 6.1911346075640505, + "grad_norm": 6.785280167508284, + "learning_rate": 1.1252548276580968e-05, + "loss": 0.0798, + "step": 15224 + }, + { + "epoch": 6.191541276941846, + "grad_norm": 0.08165186785738394, + "learning_rate": 1.1251543061212249e-05, + "loss": 0.0012, + "step": 15225 + }, + { + "epoch": 6.191947946319642, + "grad_norm": 0.7140795089691558, + "learning_rate": 1.1250537832995812e-05, + "loss": 0.0084, + "step": 15226 + }, + { + "epoch": 6.192354615697438, + "grad_norm": 9.954937575857862, + "learning_rate": 1.1249532591941972e-05, + "loss": 0.3133, + "step": 15227 + }, + { + "epoch": 6.192761285075234, + "grad_norm": 2.681497896251809, + "learning_rate": 1.1248527338061052e-05, + "loss": 0.0353, + "step": 15228 + }, + { + "epoch": 6.193167954453029, + "grad_norm": 6.096563571237839, + "learning_rate": 1.1247522071363373e-05, + "loss": 0.1327, + "step": 15229 + }, + { + "epoch": 6.193574623830825, + "grad_norm": 5.550724724039504, + "learning_rate": 1.1246516791859253e-05, + "loss": 0.2908, + "step": 15230 + }, + { + "epoch": 6.193981293208622, + "grad_norm": 24.05398383478838, + "learning_rate": 1.1245511499559005e-05, + "loss": 0.8672, + "step": 15231 + }, + { + "epoch": 6.1943879625864176, + "grad_norm": 0.9503839759181517, + "learning_rate": 1.1244506194472958e-05, + "loss": 0.0149, + "step": 15232 + }, + { + "epoch": 6.194794631964213, + "grad_norm": 3.681356475246354, + "learning_rate": 1.124350087661143e-05, + "loss": 0.0536, + "step": 15233 + }, + { + "epoch": 6.195201301342009, + "grad_norm": 0.02895990920130079, + "learning_rate": 1.1242495545984735e-05, + "loss": 0.0004, + "step": 15234 + }, + { + "epoch": 6.195607970719805, + "grad_norm": 0.21072987862593948, + "learning_rate": 1.1241490202603203e-05, + "loss": 0.0034, + "step": 15235 + }, + { + "epoch": 6.196014640097601, + "grad_norm": 0.049514715790213236, + "learning_rate": 1.1240484846477146e-05, + "loss": 0.0008, + "step": 15236 + }, + { + "epoch": 6.196421309475396, + "grad_norm": 0.3775262963684036, + "learning_rate": 1.1239479477616889e-05, + "loss": 0.0056, + "step": 15237 + }, + { + "epoch": 6.196827978853192, + "grad_norm": 6.366030468316009, + "learning_rate": 1.123847409603275e-05, + "loss": 0.2249, + "step": 15238 + }, + { + "epoch": 6.197234648230988, + "grad_norm": 0.28901887455916203, + "learning_rate": 1.1237468701735055e-05, + "loss": 0.0032, + "step": 15239 + }, + { + "epoch": 6.197641317608784, + "grad_norm": 0.22339736166216398, + "learning_rate": 1.1236463294734115e-05, + "loss": 0.0029, + "step": 15240 + }, + { + "epoch": 6.19804798698658, + "grad_norm": 2.702436361062441, + "learning_rate": 1.1235457875040261e-05, + "loss": 0.0389, + "step": 15241 + }, + { + "epoch": 6.198454656364376, + "grad_norm": 6.3604903669423605, + "learning_rate": 1.123445244266381e-05, + "loss": 0.15, + "step": 15242 + }, + { + "epoch": 6.198861325742172, + "grad_norm": 5.644233628159928, + "learning_rate": 1.1233446997615082e-05, + "loss": 0.1089, + "step": 15243 + }, + { + "epoch": 6.199267995119968, + "grad_norm": 0.33499129776135594, + "learning_rate": 1.1232441539904401e-05, + "loss": 0.0036, + "step": 15244 + }, + { + "epoch": 6.199674664497763, + "grad_norm": 3.763106857571304, + 
"learning_rate": 1.123143606954209e-05, + "loss": 0.0597, + "step": 15245 + }, + { + "epoch": 6.200081333875559, + "grad_norm": 3.2991861605704367, + "learning_rate": 1.1230430586538463e-05, + "loss": 0.081, + "step": 15246 + }, + { + "epoch": 6.200488003253355, + "grad_norm": 0.7845532706451313, + "learning_rate": 1.122942509090385e-05, + "loss": 0.0088, + "step": 15247 + }, + { + "epoch": 6.200894672631151, + "grad_norm": 0.06365659165224356, + "learning_rate": 1.1228419582648566e-05, + "loss": 0.0016, + "step": 15248 + }, + { + "epoch": 6.2013013420089464, + "grad_norm": 5.862972087130235, + "learning_rate": 1.1227414061782943e-05, + "loss": 0.103, + "step": 15249 + }, + { + "epoch": 6.201708011386742, + "grad_norm": 0.5100702246104184, + "learning_rate": 1.122640852831729e-05, + "loss": 0.007, + "step": 15250 + }, + { + "epoch": 6.202114680764538, + "grad_norm": 7.30285300282239, + "learning_rate": 1.122540298226194e-05, + "loss": 0.2532, + "step": 15251 + }, + { + "epoch": 6.202521350142335, + "grad_norm": 2.6431922103051404, + "learning_rate": 1.1224397423627212e-05, + "loss": 0.0366, + "step": 15252 + }, + { + "epoch": 6.20292801952013, + "grad_norm": 7.984265751520518, + "learning_rate": 1.1223391852423425e-05, + "loss": 0.2278, + "step": 15253 + }, + { + "epoch": 6.203334688897926, + "grad_norm": 0.09521878684753779, + "learning_rate": 1.1222386268660907e-05, + "loss": 0.0014, + "step": 15254 + }, + { + "epoch": 6.203741358275722, + "grad_norm": 6.137231399537984, + "learning_rate": 1.1221380672349979e-05, + "loss": 0.1863, + "step": 15255 + }, + { + "epoch": 6.204148027653518, + "grad_norm": 1.3004498863367462, + "learning_rate": 1.122037506350096e-05, + "loss": 0.0087, + "step": 15256 + }, + { + "epoch": 6.2045546970313135, + "grad_norm": 3.0302286047942877, + "learning_rate": 1.1219369442124181e-05, + "loss": 0.0687, + "step": 15257 + }, + { + "epoch": 6.204961366409109, + "grad_norm": 0.1406690910081208, + "learning_rate": 1.1218363808229961e-05, + "loss": 0.0031, + "step": 15258 + }, + { + "epoch": 6.205368035786905, + "grad_norm": 0.6511729836400857, + "learning_rate": 1.1217358161828622e-05, + "loss": 0.0068, + "step": 15259 + }, + { + "epoch": 6.205774705164701, + "grad_norm": 21.34373180477992, + "learning_rate": 1.121635250293049e-05, + "loss": 0.62, + "step": 15260 + }, + { + "epoch": 6.2061813745424965, + "grad_norm": 3.047783226278691, + "learning_rate": 1.1215346831545885e-05, + "loss": 0.0416, + "step": 15261 + }, + { + "epoch": 6.206588043920293, + "grad_norm": 0.4362313975607051, + "learning_rate": 1.1214341147685137e-05, + "loss": 0.005, + "step": 15262 + }, + { + "epoch": 6.206994713298089, + "grad_norm": 0.7861786154748541, + "learning_rate": 1.1213335451358562e-05, + "loss": 0.0165, + "step": 15263 + }, + { + "epoch": 6.207401382675885, + "grad_norm": 0.26260198202423607, + "learning_rate": 1.121232974257649e-05, + "loss": 0.0019, + "step": 15264 + }, + { + "epoch": 6.2078080520536805, + "grad_norm": 4.529588444019456, + "learning_rate": 1.1211324021349243e-05, + "loss": 0.1824, + "step": 15265 + }, + { + "epoch": 6.208214721431476, + "grad_norm": 1.2736730843535018, + "learning_rate": 1.1210318287687144e-05, + "loss": 0.0163, + "step": 15266 + }, + { + "epoch": 6.208621390809272, + "grad_norm": 0.37389362695874356, + "learning_rate": 1.1209312541600521e-05, + "loss": 0.0044, + "step": 15267 + }, + { + "epoch": 6.209028060187068, + "grad_norm": 14.37420630622158, + "learning_rate": 1.1208306783099694e-05, + "loss": 0.4623, + "step": 15268 + }, + { + "epoch": 
6.2094347295648635, + "grad_norm": 2.416531003533017, + "learning_rate": 1.1207301012194989e-05, + "loss": 0.0213, + "step": 15269 + }, + { + "epoch": 6.209841398942659, + "grad_norm": 0.01495910276401229, + "learning_rate": 1.1206295228896733e-05, + "loss": 0.0003, + "step": 15270 + }, + { + "epoch": 6.210248068320455, + "grad_norm": 14.179541871213662, + "learning_rate": 1.1205289433215249e-05, + "loss": 0.4456, + "step": 15271 + }, + { + "epoch": 6.210654737698252, + "grad_norm": 1.571148572831864, + "learning_rate": 1.1204283625160863e-05, + "loss": 0.0287, + "step": 15272 + }, + { + "epoch": 6.2110614070760475, + "grad_norm": 1.6023700455611578, + "learning_rate": 1.12032778047439e-05, + "loss": 0.022, + "step": 15273 + }, + { + "epoch": 6.211468076453843, + "grad_norm": 0.2710921278872686, + "learning_rate": 1.1202271971974683e-05, + "loss": 0.0031, + "step": 15274 + }, + { + "epoch": 6.211874745831639, + "grad_norm": 0.01921080973023661, + "learning_rate": 1.120126612686354e-05, + "loss": 0.0003, + "step": 15275 + }, + { + "epoch": 6.212281415209435, + "grad_norm": 0.05526865156949533, + "learning_rate": 1.1200260269420795e-05, + "loss": 0.0003, + "step": 15276 + }, + { + "epoch": 6.2126880845872305, + "grad_norm": 3.313400301721843, + "learning_rate": 1.1199254399656776e-05, + "loss": 0.1128, + "step": 15277 + }, + { + "epoch": 6.213094753965026, + "grad_norm": 25.507202366634058, + "learning_rate": 1.1198248517581807e-05, + "loss": 0.4379, + "step": 15278 + }, + { + "epoch": 6.213501423342822, + "grad_norm": 7.836880256484847, + "learning_rate": 1.119724262320621e-05, + "loss": 0.2263, + "step": 15279 + }, + { + "epoch": 6.213908092720618, + "grad_norm": 0.06384345226031377, + "learning_rate": 1.1196236716540318e-05, + "loss": 0.0011, + "step": 15280 + }, + { + "epoch": 6.214314762098414, + "grad_norm": 6.962219729477556, + "learning_rate": 1.1195230797594454e-05, + "loss": 0.1008, + "step": 15281 + }, + { + "epoch": 6.21472143147621, + "grad_norm": 8.79828671971199, + "learning_rate": 1.1194224866378942e-05, + "loss": 0.386, + "step": 15282 + }, + { + "epoch": 6.215128100854006, + "grad_norm": 6.135253387706172, + "learning_rate": 1.1193218922904115e-05, + "loss": 0.1, + "step": 15283 + }, + { + "epoch": 6.215534770231802, + "grad_norm": 1.518539661640929, + "learning_rate": 1.1192212967180294e-05, + "loss": 0.0202, + "step": 15284 + }, + { + "epoch": 6.2159414396095976, + "grad_norm": 3.8272366311147175, + "learning_rate": 1.1191206999217804e-05, + "loss": 0.1121, + "step": 15285 + }, + { + "epoch": 6.216348108987393, + "grad_norm": 4.13740299919592, + "learning_rate": 1.1190201019026977e-05, + "loss": 0.0645, + "step": 15286 + }, + { + "epoch": 6.216754778365189, + "grad_norm": 0.03872147894610304, + "learning_rate": 1.1189195026618137e-05, + "loss": 0.0006, + "step": 15287 + }, + { + "epoch": 6.217161447742985, + "grad_norm": 0.08281532443443025, + "learning_rate": 1.1188189022001612e-05, + "loss": 0.0007, + "step": 15288 + }, + { + "epoch": 6.217568117120781, + "grad_norm": 1.592875270534434, + "learning_rate": 1.1187183005187727e-05, + "loss": 0.0194, + "step": 15289 + }, + { + "epoch": 6.217974786498576, + "grad_norm": 0.196429431188048, + "learning_rate": 1.1186176976186814e-05, + "loss": 0.0029, + "step": 15290 + }, + { + "epoch": 6.218381455876372, + "grad_norm": 2.85428638838735, + "learning_rate": 1.1185170935009195e-05, + "loss": 0.0465, + "step": 15291 + }, + { + "epoch": 6.218788125254168, + "grad_norm": 12.479032059481122, + "learning_rate": 1.1184164881665197e-05, 
+ "loss": 0.3248, + "step": 15292 + }, + { + "epoch": 6.219194794631965, + "grad_norm": 5.946292129480726, + "learning_rate": 1.1183158816165157e-05, + "loss": 0.1049, + "step": 15293 + }, + { + "epoch": 6.21960146400976, + "grad_norm": 0.10560301959113252, + "learning_rate": 1.1182152738519392e-05, + "loss": 0.0016, + "step": 15294 + }, + { + "epoch": 6.220008133387556, + "grad_norm": 2.8770803676527006, + "learning_rate": 1.1181146648738238e-05, + "loss": 0.0679, + "step": 15295 + }, + { + "epoch": 6.220414802765352, + "grad_norm": 7.289899877610753, + "learning_rate": 1.1180140546832017e-05, + "loss": 0.3082, + "step": 15296 + }, + { + "epoch": 6.220821472143148, + "grad_norm": 0.03692078849123642, + "learning_rate": 1.1179134432811058e-05, + "loss": 0.0004, + "step": 15297 + }, + { + "epoch": 6.221228141520943, + "grad_norm": 0.16142117669047581, + "learning_rate": 1.1178128306685689e-05, + "loss": 0.0035, + "step": 15298 + }, + { + "epoch": 6.221634810898739, + "grad_norm": 0.0994191471111005, + "learning_rate": 1.1177122168466244e-05, + "loss": 0.0009, + "step": 15299 + }, + { + "epoch": 6.222041480276535, + "grad_norm": 0.22107513393537806, + "learning_rate": 1.1176116018163046e-05, + "loss": 0.0029, + "step": 15300 + }, + { + "epoch": 6.222448149654331, + "grad_norm": 3.089739139289472, + "learning_rate": 1.1175109855786423e-05, + "loss": 0.0324, + "step": 15301 + }, + { + "epoch": 6.222854819032127, + "grad_norm": 4.383059742867542, + "learning_rate": 1.1174103681346711e-05, + "loss": 0.0717, + "step": 15302 + }, + { + "epoch": 6.223261488409923, + "grad_norm": 14.938132503617444, + "learning_rate": 1.117309749485423e-05, + "loss": 0.4048, + "step": 15303 + }, + { + "epoch": 6.223668157787719, + "grad_norm": 8.577046744573225, + "learning_rate": 1.1172091296319311e-05, + "loss": 0.2387, + "step": 15304 + }, + { + "epoch": 6.224074827165515, + "grad_norm": 0.05086459019306862, + "learning_rate": 1.1171085085752289e-05, + "loss": 0.0009, + "step": 15305 + }, + { + "epoch": 6.22448149654331, + "grad_norm": 50.46451549990207, + "learning_rate": 1.117007886316349e-05, + "loss": 1.1632, + "step": 15306 + }, + { + "epoch": 6.224888165921106, + "grad_norm": 0.05065719095637842, + "learning_rate": 1.116907262856324e-05, + "loss": 0.0008, + "step": 15307 + }, + { + "epoch": 6.225294835298902, + "grad_norm": 0.4589848172749699, + "learning_rate": 1.116806638196187e-05, + "loss": 0.0091, + "step": 15308 + }, + { + "epoch": 6.225701504676698, + "grad_norm": 0.1164647533485083, + "learning_rate": 1.1167060123369713e-05, + "loss": 0.0019, + "step": 15309 + }, + { + "epoch": 6.2261081740544935, + "grad_norm": 6.587184320153599, + "learning_rate": 1.1166053852797096e-05, + "loss": 0.1521, + "step": 15310 + }, + { + "epoch": 6.226514843432289, + "grad_norm": 0.5540719441291642, + "learning_rate": 1.1165047570254346e-05, + "loss": 0.0056, + "step": 15311 + }, + { + "epoch": 6.226921512810085, + "grad_norm": 6.726538740480531, + "learning_rate": 1.11640412757518e-05, + "loss": 0.4382, + "step": 15312 + }, + { + "epoch": 6.227328182187882, + "grad_norm": 11.929212327773277, + "learning_rate": 1.1163034969299784e-05, + "loss": 0.6217, + "step": 15313 + }, + { + "epoch": 6.227734851565677, + "grad_norm": 1.5225105831158359, + "learning_rate": 1.1162028650908627e-05, + "loss": 0.0161, + "step": 15314 + }, + { + "epoch": 6.228141520943473, + "grad_norm": 0.07195395147880028, + "learning_rate": 1.1161022320588664e-05, + "loss": 0.0009, + "step": 15315 + }, + { + "epoch": 6.228548190321269, + "grad_norm": 
5.5649277058359345, + "learning_rate": 1.1160015978350223e-05, + "loss": 0.1254, + "step": 15316 + }, + { + "epoch": 6.228954859699065, + "grad_norm": 10.922644871481454, + "learning_rate": 1.115900962420363e-05, + "loss": 0.6619, + "step": 15317 + }, + { + "epoch": 6.2293615290768605, + "grad_norm": 0.7041107678465764, + "learning_rate": 1.1158003258159222e-05, + "loss": 0.0093, + "step": 15318 + }, + { + "epoch": 6.229768198454656, + "grad_norm": 4.318546384014807, + "learning_rate": 1.1156996880227329e-05, + "loss": 0.2016, + "step": 15319 + }, + { + "epoch": 6.230174867832452, + "grad_norm": 4.647705917380365, + "learning_rate": 1.115599049041828e-05, + "loss": 0.1088, + "step": 15320 + }, + { + "epoch": 6.230581537210248, + "grad_norm": 12.331808442388866, + "learning_rate": 1.1154984088742404e-05, + "loss": 0.6019, + "step": 15321 + }, + { + "epoch": 6.2309882065880435, + "grad_norm": 3.6248057003503886, + "learning_rate": 1.115397767521004e-05, + "loss": 0.1436, + "step": 15322 + }, + { + "epoch": 6.23139487596584, + "grad_norm": 0.45138406353153804, + "learning_rate": 1.115297124983151e-05, + "loss": 0.0054, + "step": 15323 + }, + { + "epoch": 6.231801545343636, + "grad_norm": 14.955846753171016, + "learning_rate": 1.115196481261715e-05, + "loss": 0.3551, + "step": 15324 + }, + { + "epoch": 6.232208214721432, + "grad_norm": 0.858492127633808, + "learning_rate": 1.1150958363577296e-05, + "loss": 0.0128, + "step": 15325 + }, + { + "epoch": 6.2326148840992275, + "grad_norm": 6.326115820199272, + "learning_rate": 1.1149951902722271e-05, + "loss": 0.2122, + "step": 15326 + }, + { + "epoch": 6.233021553477023, + "grad_norm": 0.08551715944878698, + "learning_rate": 1.114894543006241e-05, + "loss": 0.0014, + "step": 15327 + }, + { + "epoch": 6.233428222854819, + "grad_norm": 3.847682718486079, + "learning_rate": 1.114793894560805e-05, + "loss": 0.0943, + "step": 15328 + }, + { + "epoch": 6.233834892232615, + "grad_norm": 2.9919668372387527, + "learning_rate": 1.1146932449369516e-05, + "loss": 0.0607, + "step": 15329 + }, + { + "epoch": 6.2342415616104105, + "grad_norm": 3.053063774672452, + "learning_rate": 1.1145925941357143e-05, + "loss": 0.0482, + "step": 15330 + }, + { + "epoch": 6.234648230988206, + "grad_norm": 2.0654673924623084, + "learning_rate": 1.1144919421581267e-05, + "loss": 0.0266, + "step": 15331 + }, + { + "epoch": 6.235054900366002, + "grad_norm": 4.062842669887426, + "learning_rate": 1.1143912890052215e-05, + "loss": 0.0996, + "step": 15332 + }, + { + "epoch": 6.235461569743798, + "grad_norm": 15.995860322359963, + "learning_rate": 1.114290634678032e-05, + "loss": 0.3887, + "step": 15333 + }, + { + "epoch": 6.2358682391215945, + "grad_norm": 6.640085069640779, + "learning_rate": 1.1141899791775916e-05, + "loss": 0.1585, + "step": 15334 + }, + { + "epoch": 6.23627490849939, + "grad_norm": 11.840433973793576, + "learning_rate": 1.1140893225049339e-05, + "loss": 0.1693, + "step": 15335 + }, + { + "epoch": 6.236681577877186, + "grad_norm": 0.7900865656790513, + "learning_rate": 1.1139886646610918e-05, + "loss": 0.0113, + "step": 15336 + }, + { + "epoch": 6.237088247254982, + "grad_norm": 4.677573232224505, + "learning_rate": 1.1138880056470985e-05, + "loss": 0.0659, + "step": 15337 + }, + { + "epoch": 6.2374949166327776, + "grad_norm": 2.642240420223187, + "learning_rate": 1.1137873454639876e-05, + "loss": 0.0372, + "step": 15338 + }, + { + "epoch": 6.237901586010573, + "grad_norm": 0.15694094649628257, + "learning_rate": 1.1136866841127922e-05, + "loss": 0.0016, + "step": 
15339 + }, + { + "epoch": 6.238308255388369, + "grad_norm": 3.9957722910273175, + "learning_rate": 1.1135860215945458e-05, + "loss": 0.0482, + "step": 15340 + }, + { + "epoch": 6.238714924766165, + "grad_norm": 3.767077709507266, + "learning_rate": 1.1134853579102818e-05, + "loss": 0.0823, + "step": 15341 + }, + { + "epoch": 6.239121594143961, + "grad_norm": 3.9554804359495477, + "learning_rate": 1.113384693061034e-05, + "loss": 0.0948, + "step": 15342 + }, + { + "epoch": 6.239528263521757, + "grad_norm": 1.6367844579563595, + "learning_rate": 1.1132840270478344e-05, + "loss": 0.0273, + "step": 15343 + }, + { + "epoch": 6.239934932899553, + "grad_norm": 2.2996768861869614, + "learning_rate": 1.1131833598717176e-05, + "loss": 0.0288, + "step": 15344 + }, + { + "epoch": 6.240341602277349, + "grad_norm": 3.051599328274621, + "learning_rate": 1.1130826915337167e-05, + "loss": 0.0687, + "step": 15345 + }, + { + "epoch": 6.240748271655145, + "grad_norm": 0.6999011397448079, + "learning_rate": 1.1129820220348651e-05, + "loss": 0.0078, + "step": 15346 + }, + { + "epoch": 6.24115494103294, + "grad_norm": 3.9208005423812873, + "learning_rate": 1.112881351376196e-05, + "loss": 0.088, + "step": 15347 + }, + { + "epoch": 6.241561610410736, + "grad_norm": 7.046847849833762, + "learning_rate": 1.1127806795587432e-05, + "loss": 0.2119, + "step": 15348 + }, + { + "epoch": 6.241968279788532, + "grad_norm": 0.6024170335423726, + "learning_rate": 1.1126800065835397e-05, + "loss": 0.0053, + "step": 15349 + }, + { + "epoch": 6.242374949166328, + "grad_norm": 3.6177040886231646, + "learning_rate": 1.1125793324516195e-05, + "loss": 0.0541, + "step": 15350 + }, + { + "epoch": 6.242781618544123, + "grad_norm": 5.160761911667225, + "learning_rate": 1.1124786571640156e-05, + "loss": 0.2534, + "step": 15351 + }, + { + "epoch": 6.243188287921919, + "grad_norm": 2.0807607122984604, + "learning_rate": 1.1123779807217616e-05, + "loss": 0.0475, + "step": 15352 + }, + { + "epoch": 6.243594957299715, + "grad_norm": 0.07575492118933316, + "learning_rate": 1.1122773031258913e-05, + "loss": 0.0015, + "step": 15353 + }, + { + "epoch": 6.244001626677512, + "grad_norm": 2.107339645004273, + "learning_rate": 1.112176624377438e-05, + "loss": 0.0381, + "step": 15354 + }, + { + "epoch": 6.244408296055307, + "grad_norm": 0.1016931570459547, + "learning_rate": 1.1120759444774351e-05, + "loss": 0.0015, + "step": 15355 + }, + { + "epoch": 6.244814965433103, + "grad_norm": 0.46924867997543535, + "learning_rate": 1.111975263426916e-05, + "loss": 0.0066, + "step": 15356 + }, + { + "epoch": 6.245221634810899, + "grad_norm": 5.274288595536946, + "learning_rate": 1.1118745812269149e-05, + "loss": 0.0678, + "step": 15357 + }, + { + "epoch": 6.245628304188695, + "grad_norm": 0.8659876185251223, + "learning_rate": 1.1117738978784649e-05, + "loss": 0.0116, + "step": 15358 + }, + { + "epoch": 6.24603497356649, + "grad_norm": 10.954503017342445, + "learning_rate": 1.1116732133825992e-05, + "loss": 0.3705, + "step": 15359 + }, + { + "epoch": 6.246441642944286, + "grad_norm": 0.05640190658078123, + "learning_rate": 1.111572527740352e-05, + "loss": 0.0012, + "step": 15360 + }, + { + "epoch": 6.246848312322082, + "grad_norm": 0.28670010217613656, + "learning_rate": 1.1114718409527565e-05, + "loss": 0.0034, + "step": 15361 + }, + { + "epoch": 6.247254981699878, + "grad_norm": 5.045179038742834, + "learning_rate": 1.1113711530208464e-05, + "loss": 0.1475, + "step": 15362 + }, + { + "epoch": 6.2476616510776735, + "grad_norm": 5.463305000209639, + 
"learning_rate": 1.1112704639456557e-05, + "loss": 0.1221, + "step": 15363 + }, + { + "epoch": 6.24806832045547, + "grad_norm": 6.587405923191077, + "learning_rate": 1.1111697737282174e-05, + "loss": 0.1491, + "step": 15364 + }, + { + "epoch": 6.248474989833266, + "grad_norm": 0.3562333537479697, + "learning_rate": 1.1110690823695656e-05, + "loss": 0.0058, + "step": 15365 + }, + { + "epoch": 6.248881659211062, + "grad_norm": 0.49971581281793276, + "learning_rate": 1.1109683898707336e-05, + "loss": 0.0092, + "step": 15366 + }, + { + "epoch": 6.249288328588857, + "grad_norm": 0.2538231851221749, + "learning_rate": 1.1108676962327554e-05, + "loss": 0.0034, + "step": 15367 + }, + { + "epoch": 6.249694997966653, + "grad_norm": 0.27021784033589785, + "learning_rate": 1.1107670014566644e-05, + "loss": 0.0036, + "step": 15368 + }, + { + "epoch": 6.250101667344449, + "grad_norm": 0.4620489218710631, + "learning_rate": 1.1106663055434942e-05, + "loss": 0.0065, + "step": 15369 + }, + { + "epoch": 6.250508336722245, + "grad_norm": 5.6166765211953, + "learning_rate": 1.110565608494279e-05, + "loss": 0.3027, + "step": 15370 + }, + { + "epoch": 6.2509150061000405, + "grad_norm": 2.895355386404805, + "learning_rate": 1.1104649103100523e-05, + "loss": 0.0736, + "step": 15371 + }, + { + "epoch": 6.251321675477836, + "grad_norm": 6.956927401334109, + "learning_rate": 1.1103642109918471e-05, + "loss": 0.0909, + "step": 15372 + }, + { + "epoch": 6.251728344855632, + "grad_norm": 0.6090353681995422, + "learning_rate": 1.1102635105406982e-05, + "loss": 0.0077, + "step": 15373 + }, + { + "epoch": 6.252135014233428, + "grad_norm": 3.348074442234303, + "learning_rate": 1.1101628089576388e-05, + "loss": 0.0492, + "step": 15374 + }, + { + "epoch": 6.252541683611224, + "grad_norm": 4.777832756178873, + "learning_rate": 1.1100621062437025e-05, + "loss": 0.1027, + "step": 15375 + }, + { + "epoch": 6.25294835298902, + "grad_norm": 2.4678704344519233, + "learning_rate": 1.1099614023999236e-05, + "loss": 0.0267, + "step": 15376 + }, + { + "epoch": 6.253355022366816, + "grad_norm": 2.625127078195464, + "learning_rate": 1.1098606974273354e-05, + "loss": 0.0374, + "step": 15377 + }, + { + "epoch": 6.253761691744612, + "grad_norm": 1.9439769409980028, + "learning_rate": 1.109759991326972e-05, + "loss": 0.0202, + "step": 15378 + }, + { + "epoch": 6.2541683611224075, + "grad_norm": 7.875343365175278, + "learning_rate": 1.1096592840998667e-05, + "loss": 0.3064, + "step": 15379 + }, + { + "epoch": 6.254575030500203, + "grad_norm": 6.82250157852487, + "learning_rate": 1.109558575747054e-05, + "loss": 0.1381, + "step": 15380 + }, + { + "epoch": 6.254981699877999, + "grad_norm": 9.771887216131097, + "learning_rate": 1.1094578662695674e-05, + "loss": 0.4715, + "step": 15381 + }, + { + "epoch": 6.255388369255795, + "grad_norm": 4.985342104956074, + "learning_rate": 1.1093571556684404e-05, + "loss": 0.0815, + "step": 15382 + }, + { + "epoch": 6.2557950386335905, + "grad_norm": 0.02434591207821381, + "learning_rate": 1.1092564439447074e-05, + "loss": 0.0002, + "step": 15383 + }, + { + "epoch": 6.256201708011387, + "grad_norm": 0.04241166680722078, + "learning_rate": 1.109155731099402e-05, + "loss": 0.0007, + "step": 15384 + }, + { + "epoch": 6.256608377389183, + "grad_norm": 0.595936927853232, + "learning_rate": 1.1090550171335578e-05, + "loss": 0.0072, + "step": 15385 + }, + { + "epoch": 6.257015046766979, + "grad_norm": 10.371885416200069, + "learning_rate": 1.1089543020482092e-05, + "loss": 0.3292, + "step": 15386 + }, + { + "epoch": 
6.2574217161447745, + "grad_norm": 0.35987020316201307, + "learning_rate": 1.10885358584439e-05, + "loss": 0.004, + "step": 15387 + }, + { + "epoch": 6.25782838552257, + "grad_norm": 0.023837852662546642, + "learning_rate": 1.1087528685231336e-05, + "loss": 0.0002, + "step": 15388 + }, + { + "epoch": 6.258235054900366, + "grad_norm": 8.255126852773856, + "learning_rate": 1.1086521500854746e-05, + "loss": 0.2359, + "step": 15389 + }, + { + "epoch": 6.258641724278162, + "grad_norm": 5.608655994973606, + "learning_rate": 1.1085514305324465e-05, + "loss": 0.1583, + "step": 15390 + }, + { + "epoch": 6.2590483936559576, + "grad_norm": 4.205566486913152, + "learning_rate": 1.1084507098650833e-05, + "loss": 0.1089, + "step": 15391 + }, + { + "epoch": 6.259455063033753, + "grad_norm": 0.03773403327066515, + "learning_rate": 1.108349988084419e-05, + "loss": 0.0004, + "step": 15392 + }, + { + "epoch": 6.259861732411549, + "grad_norm": 1.6498924637786843, + "learning_rate": 1.1082492651914878e-05, + "loss": 0.0213, + "step": 15393 + }, + { + "epoch": 6.260268401789345, + "grad_norm": 1.3115536782313924, + "learning_rate": 1.108148541187323e-05, + "loss": 0.0218, + "step": 15394 + }, + { + "epoch": 6.2606750711671415, + "grad_norm": 0.07666606514092132, + "learning_rate": 1.108047816072959e-05, + "loss": 0.0013, + "step": 15395 + }, + { + "epoch": 6.261081740544937, + "grad_norm": 3.2505557209541087, + "learning_rate": 1.10794708984943e-05, + "loss": 0.0904, + "step": 15396 + }, + { + "epoch": 6.261488409922733, + "grad_norm": 2.306991557077192, + "learning_rate": 1.1078463625177696e-05, + "loss": 0.0416, + "step": 15397 + }, + { + "epoch": 6.261895079300529, + "grad_norm": 9.802616654514054, + "learning_rate": 1.1077456340790123e-05, + "loss": 0.2133, + "step": 15398 + }, + { + "epoch": 6.262301748678325, + "grad_norm": 0.2832682774638968, + "learning_rate": 1.1076449045341918e-05, + "loss": 0.006, + "step": 15399 + }, + { + "epoch": 6.26270841805612, + "grad_norm": 5.078027069541346, + "learning_rate": 1.107544173884342e-05, + "loss": 0.0582, + "step": 15400 + }, + { + "epoch": 6.263115087433916, + "grad_norm": 4.31355944327801, + "learning_rate": 1.1074434421304969e-05, + "loss": 0.0367, + "step": 15401 + }, + { + "epoch": 6.263521756811712, + "grad_norm": 5.281645266283499, + "learning_rate": 1.1073427092736913e-05, + "loss": 0.3367, + "step": 15402 + }, + { + "epoch": 6.263928426189508, + "grad_norm": 11.118169759582212, + "learning_rate": 1.1072419753149585e-05, + "loss": 0.2688, + "step": 15403 + }, + { + "epoch": 6.264335095567303, + "grad_norm": 2.996204898089303, + "learning_rate": 1.1071412402553328e-05, + "loss": 0.1062, + "step": 15404 + }, + { + "epoch": 6.2647417649451, + "grad_norm": 5.425778085631827, + "learning_rate": 1.1070405040958483e-05, + "loss": 0.26, + "step": 15405 + }, + { + "epoch": 6.265148434322896, + "grad_norm": 0.1886450633109641, + "learning_rate": 1.1069397668375394e-05, + "loss": 0.003, + "step": 15406 + }, + { + "epoch": 6.265555103700692, + "grad_norm": 1.8056766113939415, + "learning_rate": 1.1068390284814396e-05, + "loss": 0.029, + "step": 15407 + }, + { + "epoch": 6.265961773078487, + "grad_norm": 1.210453802841019, + "learning_rate": 1.1067382890285839e-05, + "loss": 0.0092, + "step": 15408 + }, + { + "epoch": 6.266368442456283, + "grad_norm": 1.6491845691462463, + "learning_rate": 1.1066375484800056e-05, + "loss": 0.0329, + "step": 15409 + }, + { + "epoch": 6.266775111834079, + "grad_norm": 0.8827664539019981, + "learning_rate": 1.1065368068367395e-05, + 
"loss": 0.0239, + "step": 15410 + }, + { + "epoch": 6.267181781211875, + "grad_norm": 0.45156413845833143, + "learning_rate": 1.1064360640998191e-05, + "loss": 0.0056, + "step": 15411 + }, + { + "epoch": 6.26758845058967, + "grad_norm": 3.146353454207396, + "learning_rate": 1.1063353202702792e-05, + "loss": 0.0398, + "step": 15412 + }, + { + "epoch": 6.267995119967466, + "grad_norm": 4.386303347738337, + "learning_rate": 1.1062345753491535e-05, + "loss": 0.0728, + "step": 15413 + }, + { + "epoch": 6.268401789345262, + "grad_norm": 2.132319497270731, + "learning_rate": 1.1061338293374763e-05, + "loss": 0.0383, + "step": 15414 + }, + { + "epoch": 6.268808458723058, + "grad_norm": 8.09745623308872, + "learning_rate": 1.1060330822362823e-05, + "loss": 0.1464, + "step": 15415 + }, + { + "epoch": 6.269215128100854, + "grad_norm": 4.845373555507298, + "learning_rate": 1.1059323340466052e-05, + "loss": 0.2318, + "step": 15416 + }, + { + "epoch": 6.26962179747865, + "grad_norm": 0.36129982135080535, + "learning_rate": 1.1058315847694793e-05, + "loss": 0.0041, + "step": 15417 + }, + { + "epoch": 6.270028466856446, + "grad_norm": 24.116401895437843, + "learning_rate": 1.105730834405939e-05, + "loss": 0.4201, + "step": 15418 + }, + { + "epoch": 6.270435136234242, + "grad_norm": 14.60861846729564, + "learning_rate": 1.1056300829570185e-05, + "loss": 0.8604, + "step": 15419 + }, + { + "epoch": 6.270841805612037, + "grad_norm": 11.182997843568385, + "learning_rate": 1.105529330423752e-05, + "loss": 0.352, + "step": 15420 + }, + { + "epoch": 6.271248474989833, + "grad_norm": 9.088853210786045, + "learning_rate": 1.1054285768071737e-05, + "loss": 0.278, + "step": 15421 + }, + { + "epoch": 6.271655144367629, + "grad_norm": 0.3793411068160508, + "learning_rate": 1.1053278221083181e-05, + "loss": 0.0044, + "step": 15422 + }, + { + "epoch": 6.272061813745425, + "grad_norm": 3.8648308874793935, + "learning_rate": 1.1052270663282196e-05, + "loss": 0.1154, + "step": 15423 + }, + { + "epoch": 6.2724684831232205, + "grad_norm": 3.1619476447235697, + "learning_rate": 1.105126309467912e-05, + "loss": 0.024, + "step": 15424 + }, + { + "epoch": 6.272875152501017, + "grad_norm": 17.373621945568978, + "learning_rate": 1.1050255515284301e-05, + "loss": 0.9389, + "step": 15425 + }, + { + "epoch": 6.273281821878813, + "grad_norm": 5.729053811208949, + "learning_rate": 1.104924792510808e-05, + "loss": 0.0557, + "step": 15426 + }, + { + "epoch": 6.273688491256609, + "grad_norm": 0.42451148492750945, + "learning_rate": 1.10482403241608e-05, + "loss": 0.0047, + "step": 15427 + }, + { + "epoch": 6.274095160634404, + "grad_norm": 3.414484588001086, + "learning_rate": 1.1047232712452806e-05, + "loss": 0.0626, + "step": 15428 + }, + { + "epoch": 6.2745018300122, + "grad_norm": 1.9983900545496533, + "learning_rate": 1.1046225089994443e-05, + "loss": 0.0193, + "step": 15429 + }, + { + "epoch": 6.274908499389996, + "grad_norm": 8.073138095936345, + "learning_rate": 1.104521745679605e-05, + "loss": 0.2224, + "step": 15430 + }, + { + "epoch": 6.275315168767792, + "grad_norm": 2.703166301665775, + "learning_rate": 1.1044209812867975e-05, + "loss": 0.0534, + "step": 15431 + }, + { + "epoch": 6.2757218381455875, + "grad_norm": 1.6131307913353128, + "learning_rate": 1.104320215822056e-05, + "loss": 0.0176, + "step": 15432 + }, + { + "epoch": 6.276128507523383, + "grad_norm": 0.38436793906553357, + "learning_rate": 1.104219449286415e-05, + "loss": 0.0061, + "step": 15433 + }, + { + "epoch": 6.276535176901179, + "grad_norm": 
4.046984209999807, + "learning_rate": 1.104118681680909e-05, + "loss": 0.0373, + "step": 15434 + }, + { + "epoch": 6.276941846278975, + "grad_norm": 1.7353288581137847, + "learning_rate": 1.1040179130065725e-05, + "loss": 0.0186, + "step": 15435 + }, + { + "epoch": 6.277348515656771, + "grad_norm": 1.098363978780214, + "learning_rate": 1.1039171432644395e-05, + "loss": 0.0135, + "step": 15436 + }, + { + "epoch": 6.277755185034567, + "grad_norm": 10.362055287565294, + "learning_rate": 1.1038163724555447e-05, + "loss": 0.1902, + "step": 15437 + }, + { + "epoch": 6.278161854412363, + "grad_norm": 2.7170933442394354, + "learning_rate": 1.1037156005809226e-05, + "loss": 0.0493, + "step": 15438 + }, + { + "epoch": 6.278568523790159, + "grad_norm": 1.8177483090942446, + "learning_rate": 1.1036148276416079e-05, + "loss": 0.0233, + "step": 15439 + }, + { + "epoch": 6.2789751931679545, + "grad_norm": 4.657850975245136, + "learning_rate": 1.1035140536386346e-05, + "loss": 0.1426, + "step": 15440 + }, + { + "epoch": 6.27938186254575, + "grad_norm": 1.9142996787046922, + "learning_rate": 1.1034132785730375e-05, + "loss": 0.0211, + "step": 15441 + }, + { + "epoch": 6.279788531923546, + "grad_norm": 4.686844365044867, + "learning_rate": 1.1033125024458511e-05, + "loss": 0.1046, + "step": 15442 + }, + { + "epoch": 6.280195201301342, + "grad_norm": 2.4402783612092653, + "learning_rate": 1.10321172525811e-05, + "loss": 0.1036, + "step": 15443 + }, + { + "epoch": 6.2806018706791376, + "grad_norm": 2.701630344412538, + "learning_rate": 1.1031109470108485e-05, + "loss": 0.0612, + "step": 15444 + }, + { + "epoch": 6.281008540056933, + "grad_norm": 0.07341628937171998, + "learning_rate": 1.1030101677051013e-05, + "loss": 0.0009, + "step": 15445 + }, + { + "epoch": 6.28141520943473, + "grad_norm": 3.049154095227845, + "learning_rate": 1.1029093873419025e-05, + "loss": 0.0795, + "step": 15446 + }, + { + "epoch": 6.281821878812526, + "grad_norm": 4.330699481504434, + "learning_rate": 1.1028086059222874e-05, + "loss": 0.103, + "step": 15447 + }, + { + "epoch": 6.2822285481903215, + "grad_norm": 5.714610774446973, + "learning_rate": 1.10270782344729e-05, + "loss": 0.1552, + "step": 15448 + }, + { + "epoch": 6.282635217568117, + "grad_norm": 0.31717009195207435, + "learning_rate": 1.102607039917945e-05, + "loss": 0.0051, + "step": 15449 + }, + { + "epoch": 6.283041886945913, + "grad_norm": 0.514006938573262, + "learning_rate": 1.1025062553352877e-05, + "loss": 0.0058, + "step": 15450 + }, + { + "epoch": 6.283448556323709, + "grad_norm": 2.7647065246005527, + "learning_rate": 1.1024054697003516e-05, + "loss": 0.071, + "step": 15451 + }, + { + "epoch": 6.283855225701505, + "grad_norm": 6.499319645288447, + "learning_rate": 1.1023046830141716e-05, + "loss": 0.4114, + "step": 15452 + }, + { + "epoch": 6.2842618950793, + "grad_norm": 9.406227454169555, + "learning_rate": 1.102203895277783e-05, + "loss": 0.2395, + "step": 15453 + }, + { + "epoch": 6.284668564457096, + "grad_norm": 1.7329613201368879, + "learning_rate": 1.10210310649222e-05, + "loss": 0.0221, + "step": 15454 + }, + { + "epoch": 6.285075233834892, + "grad_norm": 3.902946826034521, + "learning_rate": 1.102002316658517e-05, + "loss": 0.0696, + "step": 15455 + }, + { + "epoch": 6.285481903212688, + "grad_norm": 1.9038373056137035, + "learning_rate": 1.101901525777709e-05, + "loss": 0.0291, + "step": 15456 + }, + { + "epoch": 6.285888572590484, + "grad_norm": 1.0740571129904486, + "learning_rate": 1.1018007338508306e-05, + "loss": 0.0111, + "step": 15457 + }, + { 
+ "epoch": 6.28629524196828, + "grad_norm": 0.6113979270934757, + "learning_rate": 1.1016999408789165e-05, + "loss": 0.0073, + "step": 15458 + }, + { + "epoch": 6.286701911346076, + "grad_norm": 6.735208047159448, + "learning_rate": 1.1015991468630007e-05, + "loss": 0.1774, + "step": 15459 + }, + { + "epoch": 6.287108580723872, + "grad_norm": 11.851194592261772, + "learning_rate": 1.101498351804119e-05, + "loss": 0.4905, + "step": 15460 + }, + { + "epoch": 6.287515250101667, + "grad_norm": 1.2525270317391328, + "learning_rate": 1.1013975557033059e-05, + "loss": 0.0247, + "step": 15461 + }, + { + "epoch": 6.287921919479463, + "grad_norm": 8.342288404491441, + "learning_rate": 1.1012967585615955e-05, + "loss": 0.2952, + "step": 15462 + }, + { + "epoch": 6.288328588857259, + "grad_norm": 1.001638406647997, + "learning_rate": 1.1011959603800228e-05, + "loss": 0.0166, + "step": 15463 + }, + { + "epoch": 6.288735258235055, + "grad_norm": 0.49002103766334243, + "learning_rate": 1.101095161159623e-05, + "loss": 0.0038, + "step": 15464 + }, + { + "epoch": 6.28914192761285, + "grad_norm": 0.6020023390885194, + "learning_rate": 1.1009943609014303e-05, + "loss": 0.0094, + "step": 15465 + }, + { + "epoch": 6.289548596990647, + "grad_norm": 1.782630925357036, + "learning_rate": 1.1008935596064796e-05, + "loss": 0.0271, + "step": 15466 + }, + { + "epoch": 6.289955266368443, + "grad_norm": 2.2775396329052873, + "learning_rate": 1.1007927572758057e-05, + "loss": 0.031, + "step": 15467 + }, + { + "epoch": 6.290361935746239, + "grad_norm": 6.213948489356942, + "learning_rate": 1.1006919539104438e-05, + "loss": 0.2304, + "step": 15468 + }, + { + "epoch": 6.290768605124034, + "grad_norm": 5.57404619777212, + "learning_rate": 1.1005911495114277e-05, + "loss": 0.0954, + "step": 15469 + }, + { + "epoch": 6.29117527450183, + "grad_norm": 0.30626977331636485, + "learning_rate": 1.1004903440797931e-05, + "loss": 0.0057, + "step": 15470 + }, + { + "epoch": 6.291581943879626, + "grad_norm": 0.14942909857108652, + "learning_rate": 1.1003895376165747e-05, + "loss": 0.0025, + "step": 15471 + }, + { + "epoch": 6.291988613257422, + "grad_norm": 7.1006657866499046, + "learning_rate": 1.100288730122807e-05, + "loss": 0.2106, + "step": 15472 + }, + { + "epoch": 6.292395282635217, + "grad_norm": 1.4038682778980818, + "learning_rate": 1.1001879215995251e-05, + "loss": 0.0167, + "step": 15473 + }, + { + "epoch": 6.292801952013013, + "grad_norm": 0.9998566484998347, + "learning_rate": 1.1000871120477639e-05, + "loss": 0.0181, + "step": 15474 + }, + { + "epoch": 6.293208621390809, + "grad_norm": 6.514423033083293, + "learning_rate": 1.0999863014685576e-05, + "loss": 0.2428, + "step": 15475 + }, + { + "epoch": 6.293615290768605, + "grad_norm": 1.7975731164396165, + "learning_rate": 1.099885489862942e-05, + "loss": 0.0231, + "step": 15476 + }, + { + "epoch": 6.294021960146401, + "grad_norm": 8.074547173967886, + "learning_rate": 1.0997846772319516e-05, + "loss": 0.1993, + "step": 15477 + }, + { + "epoch": 6.294428629524197, + "grad_norm": 13.317827291582699, + "learning_rate": 1.0996838635766212e-05, + "loss": 0.2319, + "step": 15478 + }, + { + "epoch": 6.294835298901993, + "grad_norm": 8.472637123949863, + "learning_rate": 1.0995830488979856e-05, + "loss": 0.2096, + "step": 15479 + }, + { + "epoch": 6.295241968279789, + "grad_norm": 10.088076968101456, + "learning_rate": 1.0994822331970803e-05, + "loss": 0.2028, + "step": 15480 + }, + { + "epoch": 6.295648637657584, + "grad_norm": 1.4596059532239043, + "learning_rate": 
1.0993814164749395e-05, + "loss": 0.0236, + "step": 15481 + }, + { + "epoch": 6.29605530703538, + "grad_norm": 5.868714245368889, + "learning_rate": 1.0992805987325982e-05, + "loss": 0.1425, + "step": 15482 + }, + { + "epoch": 6.296461976413176, + "grad_norm": 4.258648596268541, + "learning_rate": 1.0991797799710921e-05, + "loss": 0.0753, + "step": 15483 + }, + { + "epoch": 6.296868645790972, + "grad_norm": 4.77761051929555, + "learning_rate": 1.0990789601914553e-05, + "loss": 0.1271, + "step": 15484 + }, + { + "epoch": 6.2972753151687675, + "grad_norm": 3.7986437640649355, + "learning_rate": 1.0989781393947232e-05, + "loss": 0.0722, + "step": 15485 + }, + { + "epoch": 6.297681984546563, + "grad_norm": 0.05698401751115884, + "learning_rate": 1.0988773175819307e-05, + "loss": 0.0009, + "step": 15486 + }, + { + "epoch": 6.29808865392436, + "grad_norm": 3.7693367912543634, + "learning_rate": 1.0987764947541128e-05, + "loss": 0.0776, + "step": 15487 + }, + { + "epoch": 6.298495323302156, + "grad_norm": 0.2139507135162688, + "learning_rate": 1.0986756709123043e-05, + "loss": 0.004, + "step": 15488 + }, + { + "epoch": 6.298901992679951, + "grad_norm": 0.1769988047488325, + "learning_rate": 1.0985748460575407e-05, + "loss": 0.0022, + "step": 15489 + }, + { + "epoch": 6.299308662057747, + "grad_norm": 0.06175779496798553, + "learning_rate": 1.0984740201908566e-05, + "loss": 0.0009, + "step": 15490 + }, + { + "epoch": 6.299715331435543, + "grad_norm": 0.40109882892956505, + "learning_rate": 1.0983731933132868e-05, + "loss": 0.0051, + "step": 15491 + }, + { + "epoch": 6.300122000813339, + "grad_norm": 2.4268768693657625, + "learning_rate": 1.098272365425867e-05, + "loss": 0.0411, + "step": 15492 + }, + { + "epoch": 6.3005286701911345, + "grad_norm": 50.2032011846587, + "learning_rate": 1.0981715365296321e-05, + "loss": 0.4356, + "step": 15493 + }, + { + "epoch": 6.30093533956893, + "grad_norm": 0.051759341590743135, + "learning_rate": 1.0980707066256165e-05, + "loss": 0.0007, + "step": 15494 + }, + { + "epoch": 6.301342008946726, + "grad_norm": 0.21299012714947738, + "learning_rate": 1.097969875714856e-05, + "loss": 0.0028, + "step": 15495 + }, + { + "epoch": 6.301748678324522, + "grad_norm": 2.9639080674810256, + "learning_rate": 1.0978690437983854e-05, + "loss": 0.0483, + "step": 15496 + }, + { + "epoch": 6.302155347702318, + "grad_norm": 14.460308572374037, + "learning_rate": 1.0977682108772396e-05, + "loss": 0.5241, + "step": 15497 + }, + { + "epoch": 6.302562017080114, + "grad_norm": 1.2269108384041079, + "learning_rate": 1.0976673769524542e-05, + "loss": 0.0199, + "step": 15498 + }, + { + "epoch": 6.30296868645791, + "grad_norm": 2.1758264187763436, + "learning_rate": 1.0975665420250638e-05, + "loss": 0.0375, + "step": 15499 + }, + { + "epoch": 6.303375355835706, + "grad_norm": 6.059847309442583, + "learning_rate": 1.0974657060961038e-05, + "loss": 0.0583, + "step": 15500 + }, + { + "epoch": 6.3037820252135015, + "grad_norm": 2.5320102446254773, + "learning_rate": 1.0973648691666092e-05, + "loss": 0.0094, + "step": 15501 + }, + { + "epoch": 6.304188694591297, + "grad_norm": 9.667682920141734, + "learning_rate": 1.0972640312376154e-05, + "loss": 0.2539, + "step": 15502 + }, + { + "epoch": 6.304595363969093, + "grad_norm": 12.276065794798722, + "learning_rate": 1.0971631923101573e-05, + "loss": 0.4427, + "step": 15503 + }, + { + "epoch": 6.305002033346889, + "grad_norm": 1.2918787179801352, + "learning_rate": 1.0970623523852699e-05, + "loss": 0.0131, + "step": 15504 + }, + { + "epoch": 
6.305408702724685, + "grad_norm": 11.263356224553247, + "learning_rate": 1.0969615114639889e-05, + "loss": 0.4436, + "step": 15505 + }, + { + "epoch": 6.30581537210248, + "grad_norm": 0.1796409782618427, + "learning_rate": 1.0968606695473491e-05, + "loss": 0.0027, + "step": 15506 + }, + { + "epoch": 6.306222041480277, + "grad_norm": 4.927406382332775, + "learning_rate": 1.0967598266363857e-05, + "loss": 0.0805, + "step": 15507 + }, + { + "epoch": 6.306628710858073, + "grad_norm": 1.1105529406533603, + "learning_rate": 1.0966589827321341e-05, + "loss": 0.0078, + "step": 15508 + }, + { + "epoch": 6.3070353802358685, + "grad_norm": 5.207327418264378, + "learning_rate": 1.0965581378356296e-05, + "loss": 0.0793, + "step": 15509 + }, + { + "epoch": 6.307442049613664, + "grad_norm": 0.3053104714055031, + "learning_rate": 1.0964572919479065e-05, + "loss": 0.0045, + "step": 15510 + }, + { + "epoch": 6.30784871899146, + "grad_norm": 3.0307192274752994, + "learning_rate": 1.0963564450700015e-05, + "loss": 0.0467, + "step": 15511 + }, + { + "epoch": 6.308255388369256, + "grad_norm": 8.775701474710797, + "learning_rate": 1.0962555972029487e-05, + "loss": 0.1907, + "step": 15512 + }, + { + "epoch": 6.308662057747052, + "grad_norm": 6.016848455139385, + "learning_rate": 1.096154748347784e-05, + "loss": 0.1276, + "step": 15513 + }, + { + "epoch": 6.309068727124847, + "grad_norm": 8.35429152644027, + "learning_rate": 1.0960538985055423e-05, + "loss": 0.3096, + "step": 15514 + }, + { + "epoch": 6.309475396502643, + "grad_norm": 0.04048577453064665, + "learning_rate": 1.0959530476772589e-05, + "loss": 0.0005, + "step": 15515 + }, + { + "epoch": 6.309882065880439, + "grad_norm": 0.25374476094219534, + "learning_rate": 1.0958521958639693e-05, + "loss": 0.0028, + "step": 15516 + }, + { + "epoch": 6.310288735258235, + "grad_norm": 12.443507950763518, + "learning_rate": 1.0957513430667084e-05, + "loss": 0.5096, + "step": 15517 + }, + { + "epoch": 6.310695404636031, + "grad_norm": 0.8861484346437863, + "learning_rate": 1.0956504892865121e-05, + "loss": 0.0111, + "step": 15518 + }, + { + "epoch": 6.311102074013827, + "grad_norm": 5.831830999787574, + "learning_rate": 1.0955496345244154e-05, + "loss": 0.2259, + "step": 15519 + }, + { + "epoch": 6.311508743391623, + "grad_norm": 0.46411316902224725, + "learning_rate": 1.0954487787814532e-05, + "loss": 0.0072, + "step": 15520 + }, + { + "epoch": 6.311915412769419, + "grad_norm": 6.822020733530921, + "learning_rate": 1.0953479220586616e-05, + "loss": 0.1872, + "step": 15521 + }, + { + "epoch": 6.312322082147214, + "grad_norm": 3.2035824484664857, + "learning_rate": 1.0952470643570756e-05, + "loss": 0.0442, + "step": 15522 + }, + { + "epoch": 6.31272875152501, + "grad_norm": 0.7061353156145971, + "learning_rate": 1.0951462056777302e-05, + "loss": 0.0123, + "step": 15523 + }, + { + "epoch": 6.313135420902806, + "grad_norm": 2.1920777038463743, + "learning_rate": 1.0950453460216615e-05, + "loss": 0.0152, + "step": 15524 + }, + { + "epoch": 6.313542090280602, + "grad_norm": 6.460654817166448, + "learning_rate": 1.0949444853899045e-05, + "loss": 0.1555, + "step": 15525 + }, + { + "epoch": 6.313948759658397, + "grad_norm": 3.659105914054076, + "learning_rate": 1.0948436237834946e-05, + "loss": 0.1055, + "step": 15526 + }, + { + "epoch": 6.314355429036193, + "grad_norm": 4.268589147054691, + "learning_rate": 1.0947427612034668e-05, + "loss": 0.1887, + "step": 15527 + }, + { + "epoch": 6.31476209841399, + "grad_norm": 4.813535531097434, + "learning_rate": 
1.094641897650857e-05, + "loss": 0.0916, + "step": 15528 + }, + { + "epoch": 6.315168767791786, + "grad_norm": 24.548493818182422, + "learning_rate": 1.0945410331267007e-05, + "loss": 1.1527, + "step": 15529 + }, + { + "epoch": 6.315575437169581, + "grad_norm": 27.633485482001547, + "learning_rate": 1.0944401676320329e-05, + "loss": 1.0011, + "step": 15530 + }, + { + "epoch": 6.315982106547377, + "grad_norm": 7.531045504986995, + "learning_rate": 1.0943393011678894e-05, + "loss": 0.1117, + "step": 15531 + }, + { + "epoch": 6.316388775925173, + "grad_norm": 6.7534353837817545, + "learning_rate": 1.0942384337353053e-05, + "loss": 0.1632, + "step": 15532 + }, + { + "epoch": 6.316795445302969, + "grad_norm": 9.513317634097547, + "learning_rate": 1.0941375653353163e-05, + "loss": 0.1223, + "step": 15533 + }, + { + "epoch": 6.317202114680764, + "grad_norm": 8.9205340581933, + "learning_rate": 1.0940366959689581e-05, + "loss": 0.1377, + "step": 15534 + }, + { + "epoch": 6.31760878405856, + "grad_norm": 8.378171389706392, + "learning_rate": 1.0939358256372656e-05, + "loss": 0.1183, + "step": 15535 + }, + { + "epoch": 6.318015453436356, + "grad_norm": 3.217795017492784, + "learning_rate": 1.0938349543412745e-05, + "loss": 0.054, + "step": 15536 + }, + { + "epoch": 6.318422122814152, + "grad_norm": 9.550955236250468, + "learning_rate": 1.0937340820820205e-05, + "loss": 0.2226, + "step": 15537 + }, + { + "epoch": 6.3188287921919475, + "grad_norm": 2.8767606115803463, + "learning_rate": 1.0936332088605392e-05, + "loss": 0.0445, + "step": 15538 + }, + { + "epoch": 6.319235461569744, + "grad_norm": 2.9829490221669617, + "learning_rate": 1.0935323346778656e-05, + "loss": 0.0513, + "step": 15539 + }, + { + "epoch": 6.31964213094754, + "grad_norm": 11.503735662953929, + "learning_rate": 1.0934314595350354e-05, + "loss": 0.3416, + "step": 15540 + }, + { + "epoch": 6.320048800325336, + "grad_norm": 12.025990254190523, + "learning_rate": 1.0933305834330847e-05, + "loss": 0.3785, + "step": 15541 + }, + { + "epoch": 6.320455469703131, + "grad_norm": 6.721201891878805, + "learning_rate": 1.0932297063730483e-05, + "loss": 0.2095, + "step": 15542 + }, + { + "epoch": 6.320862139080927, + "grad_norm": 4.21390302087272, + "learning_rate": 1.0931288283559623e-05, + "loss": 0.2309, + "step": 15543 + }, + { + "epoch": 6.321268808458723, + "grad_norm": 37.8716070253511, + "learning_rate": 1.0930279493828618e-05, + "loss": 1.4852, + "step": 15544 + }, + { + "epoch": 6.321675477836519, + "grad_norm": 12.383072429329609, + "learning_rate": 1.0929270694547821e-05, + "loss": 0.3463, + "step": 15545 + }, + { + "epoch": 6.3220821472143145, + "grad_norm": 1.4066023584353455, + "learning_rate": 1.09282618857276e-05, + "loss": 0.0284, + "step": 15546 + }, + { + "epoch": 6.32248881659211, + "grad_norm": 4.593521820961102, + "learning_rate": 1.09272530673783e-05, + "loss": 0.0768, + "step": 15547 + }, + { + "epoch": 6.322895485969907, + "grad_norm": 4.080837739311267, + "learning_rate": 1.0926244239510281e-05, + "loss": 0.1115, + "step": 15548 + }, + { + "epoch": 6.323302155347703, + "grad_norm": 4.747686835466368, + "learning_rate": 1.0925235402133898e-05, + "loss": 0.2094, + "step": 15549 + }, + { + "epoch": 6.3237088247254984, + "grad_norm": 1.5612128564519634, + "learning_rate": 1.092422655525951e-05, + "loss": 0.0254, + "step": 15550 + }, + { + "epoch": 6.324115494103294, + "grad_norm": 12.335691590093564, + "learning_rate": 1.092321769889747e-05, + "loss": 0.1007, + "step": 15551 + }, + { + "epoch": 6.32452216348109, + 
"grad_norm": 7.054094142095581, + "learning_rate": 1.0922208833058137e-05, + "loss": 0.2841, + "step": 15552 + }, + { + "epoch": 6.324928832858886, + "grad_norm": 2.684893232431464, + "learning_rate": 1.0921199957751865e-05, + "loss": 0.047, + "step": 15553 + }, + { + "epoch": 6.3253355022366815, + "grad_norm": 0.5687423799282862, + "learning_rate": 1.0920191072989013e-05, + "loss": 0.0093, + "step": 15554 + }, + { + "epoch": 6.325742171614477, + "grad_norm": 0.14437292698546425, + "learning_rate": 1.0919182178779932e-05, + "loss": 0.0012, + "step": 15555 + }, + { + "epoch": 6.326148840992273, + "grad_norm": 7.138923372490772, + "learning_rate": 1.0918173275134986e-05, + "loss": 0.3825, + "step": 15556 + }, + { + "epoch": 6.326555510370069, + "grad_norm": 0.91836535951628, + "learning_rate": 1.0917164362064532e-05, + "loss": 0.012, + "step": 15557 + }, + { + "epoch": 6.326962179747865, + "grad_norm": 10.411330623928402, + "learning_rate": 1.0916155439578921e-05, + "loss": 0.3255, + "step": 15558 + }, + { + "epoch": 6.327368849125661, + "grad_norm": 4.161697713049712, + "learning_rate": 1.0915146507688515e-05, + "loss": 0.0833, + "step": 15559 + }, + { + "epoch": 6.327775518503457, + "grad_norm": 1.6432535345878903, + "learning_rate": 1.0914137566403668e-05, + "loss": 0.0201, + "step": 15560 + }, + { + "epoch": 6.328182187881253, + "grad_norm": 3.1108143548994507, + "learning_rate": 1.0913128615734743e-05, + "loss": 0.0653, + "step": 15561 + }, + { + "epoch": 6.3285888572590485, + "grad_norm": 7.486731238973257, + "learning_rate": 1.0912119655692086e-05, + "loss": 0.2779, + "step": 15562 + }, + { + "epoch": 6.328995526636844, + "grad_norm": 5.412874483327964, + "learning_rate": 1.0911110686286069e-05, + "loss": 0.1514, + "step": 15563 + }, + { + "epoch": 6.32940219601464, + "grad_norm": 0.16189017346336665, + "learning_rate": 1.091010170752704e-05, + "loss": 0.0027, + "step": 15564 + }, + { + "epoch": 6.329808865392436, + "grad_norm": 0.42008126633009124, + "learning_rate": 1.0909092719425354e-05, + "loss": 0.0085, + "step": 15565 + }, + { + "epoch": 6.330215534770232, + "grad_norm": 2.8577968525831507, + "learning_rate": 1.0908083721991379e-05, + "loss": 0.0653, + "step": 15566 + }, + { + "epoch": 6.330622204148027, + "grad_norm": 6.971403486344765, + "learning_rate": 1.0907074715235466e-05, + "loss": 0.1591, + "step": 15567 + }, + { + "epoch": 6.331028873525823, + "grad_norm": 3.3644834474603162, + "learning_rate": 1.0906065699167974e-05, + "loss": 0.1506, + "step": 15568 + }, + { + "epoch": 6.33143554290362, + "grad_norm": 18.669960285717526, + "learning_rate": 1.0905056673799261e-05, + "loss": 0.7605, + "step": 15569 + }, + { + "epoch": 6.3318422122814155, + "grad_norm": 2.546237969694217, + "learning_rate": 1.0904047639139687e-05, + "loss": 0.0516, + "step": 15570 + }, + { + "epoch": 6.332248881659211, + "grad_norm": 0.9411373685494488, + "learning_rate": 1.090303859519961e-05, + "loss": 0.0165, + "step": 15571 + }, + { + "epoch": 6.332655551037007, + "grad_norm": 5.767209476656545, + "learning_rate": 1.0902029541989386e-05, + "loss": 0.0821, + "step": 15572 + }, + { + "epoch": 6.333062220414803, + "grad_norm": 7.400422624492149, + "learning_rate": 1.0901020479519376e-05, + "loss": 0.1755, + "step": 15573 + }, + { + "epoch": 6.333468889792599, + "grad_norm": 2.3470583925570856, + "learning_rate": 1.0900011407799936e-05, + "loss": 0.0325, + "step": 15574 + }, + { + "epoch": 6.333875559170394, + "grad_norm": 0.6432547387347259, + "learning_rate": 1.0899002326841424e-05, + "loss": 0.0088, 
+ "step": 15575 + }, + { + "epoch": 6.33428222854819, + "grad_norm": 2.992285209870642, + "learning_rate": 1.0897993236654203e-05, + "loss": 0.0875, + "step": 15576 + }, + { + "epoch": 6.334688897925986, + "grad_norm": 1.1774764323893838, + "learning_rate": 1.0896984137248632e-05, + "loss": 0.011, + "step": 15577 + }, + { + "epoch": 6.335095567303782, + "grad_norm": 5.27926972661322, + "learning_rate": 1.0895975028635062e-05, + "loss": 0.1168, + "step": 15578 + }, + { + "epoch": 6.335502236681577, + "grad_norm": 3.145528719393472, + "learning_rate": 1.089496591082386e-05, + "loss": 0.0749, + "step": 15579 + }, + { + "epoch": 6.335908906059374, + "grad_norm": 2.338927896724386, + "learning_rate": 1.0893956783825383e-05, + "loss": 0.0615, + "step": 15580 + }, + { + "epoch": 6.33631557543717, + "grad_norm": 5.764315387133384, + "learning_rate": 1.089294764764999e-05, + "loss": 0.12, + "step": 15581 + }, + { + "epoch": 6.336722244814966, + "grad_norm": 4.008822920292695, + "learning_rate": 1.089193850230804e-05, + "loss": 0.0719, + "step": 15582 + }, + { + "epoch": 6.337128914192761, + "grad_norm": 0.7552281198079646, + "learning_rate": 1.089092934780989e-05, + "loss": 0.0114, + "step": 15583 + }, + { + "epoch": 6.337535583570557, + "grad_norm": 8.124522923942637, + "learning_rate": 1.0889920184165905e-05, + "loss": 0.1846, + "step": 15584 + }, + { + "epoch": 6.337942252948353, + "grad_norm": 1.7970556729546447, + "learning_rate": 1.0888911011386439e-05, + "loss": 0.0314, + "step": 15585 + }, + { + "epoch": 6.338348922326149, + "grad_norm": 21.28866980304231, + "learning_rate": 1.0887901829481854e-05, + "loss": 0.4263, + "step": 15586 + }, + { + "epoch": 6.338755591703944, + "grad_norm": 0.462223392681922, + "learning_rate": 1.0886892638462511e-05, + "loss": 0.0093, + "step": 15587 + }, + { + "epoch": 6.33916226108174, + "grad_norm": 2.9075957239047066, + "learning_rate": 1.088588343833877e-05, + "loss": 0.0388, + "step": 15588 + }, + { + "epoch": 6.339568930459537, + "grad_norm": 0.6167118669217683, + "learning_rate": 1.0884874229120988e-05, + "loss": 0.0124, + "step": 15589 + }, + { + "epoch": 6.339975599837333, + "grad_norm": 0.4580082392133373, + "learning_rate": 1.0883865010819528e-05, + "loss": 0.0077, + "step": 15590 + }, + { + "epoch": 6.340382269215128, + "grad_norm": 0.6004180643239729, + "learning_rate": 1.0882855783444746e-05, + "loss": 0.0089, + "step": 15591 + }, + { + "epoch": 6.340788938592924, + "grad_norm": 2.4304559367882126, + "learning_rate": 1.0881846547007008e-05, + "loss": 0.0359, + "step": 15592 + }, + { + "epoch": 6.34119560797072, + "grad_norm": 0.32974202764169497, + "learning_rate": 1.0880837301516672e-05, + "loss": 0.0061, + "step": 15593 + }, + { + "epoch": 6.341602277348516, + "grad_norm": 0.2816498688794923, + "learning_rate": 1.0879828046984096e-05, + "loss": 0.0042, + "step": 15594 + }, + { + "epoch": 6.342008946726311, + "grad_norm": 9.813199224414706, + "learning_rate": 1.0878818783419642e-05, + "loss": 0.4239, + "step": 15595 + }, + { + "epoch": 6.342415616104107, + "grad_norm": 5.055038230557987, + "learning_rate": 1.0877809510833672e-05, + "loss": 0.1269, + "step": 15596 + }, + { + "epoch": 6.342822285481903, + "grad_norm": 2.062132895059673, + "learning_rate": 1.0876800229236542e-05, + "loss": 0.0287, + "step": 15597 + }, + { + "epoch": 6.343228954859699, + "grad_norm": 6.5073325263807815, + "learning_rate": 1.087579093863862e-05, + "loss": 0.1534, + "step": 15598 + }, + { + "epoch": 6.3436356242374945, + "grad_norm": 4.488090062655185, + 
"learning_rate": 1.0874781639050263e-05, + "loss": 0.1881, + "step": 15599 + }, + { + "epoch": 6.344042293615291, + "grad_norm": 2.556171359872266, + "learning_rate": 1.0873772330481831e-05, + "loss": 0.057, + "step": 15600 + }, + { + "epoch": 6.344448962993087, + "grad_norm": 7.111516125615984, + "learning_rate": 1.0872763012943686e-05, + "loss": 0.0884, + "step": 15601 + }, + { + "epoch": 6.344855632370883, + "grad_norm": 7.836258742542668, + "learning_rate": 1.0871753686446191e-05, + "loss": 0.231, + "step": 15602 + }, + { + "epoch": 6.3452623017486784, + "grad_norm": 1.008318546221797, + "learning_rate": 1.0870744350999702e-05, + "loss": 0.02, + "step": 15603 + }, + { + "epoch": 6.345668971126474, + "grad_norm": 2.8623544623346255, + "learning_rate": 1.086973500661459e-05, + "loss": 0.0391, + "step": 15604 + }, + { + "epoch": 6.34607564050427, + "grad_norm": 0.16037748726591156, + "learning_rate": 1.0868725653301206e-05, + "loss": 0.0026, + "step": 15605 + }, + { + "epoch": 6.346482309882066, + "grad_norm": 2.887966826890695, + "learning_rate": 1.086771629106992e-05, + "loss": 0.0528, + "step": 15606 + }, + { + "epoch": 6.3468889792598615, + "grad_norm": 8.767554723088473, + "learning_rate": 1.0866706919931085e-05, + "loss": 0.323, + "step": 15607 + }, + { + "epoch": 6.347295648637657, + "grad_norm": 1.03936561337239, + "learning_rate": 1.0865697539895069e-05, + "loss": 0.016, + "step": 15608 + }, + { + "epoch": 6.347702318015453, + "grad_norm": 3.0440717311351864, + "learning_rate": 1.0864688150972234e-05, + "loss": 0.0545, + "step": 15609 + }, + { + "epoch": 6.34810898739325, + "grad_norm": 0.2621348970229048, + "learning_rate": 1.0863678753172936e-05, + "loss": 0.004, + "step": 15610 + }, + { + "epoch": 6.3485156567710455, + "grad_norm": 5.160662318837962, + "learning_rate": 1.0862669346507544e-05, + "loss": 0.1013, + "step": 15611 + }, + { + "epoch": 6.348922326148841, + "grad_norm": 2.4440158827060383, + "learning_rate": 1.0861659930986414e-05, + "loss": 0.0468, + "step": 15612 + }, + { + "epoch": 6.349328995526637, + "grad_norm": 0.2744150459881337, + "learning_rate": 1.0860650506619912e-05, + "loss": 0.0022, + "step": 15613 + }, + { + "epoch": 6.349735664904433, + "grad_norm": 4.9926098886352746, + "learning_rate": 1.08596410734184e-05, + "loss": 0.1182, + "step": 15614 + }, + { + "epoch": 6.3501423342822285, + "grad_norm": 4.463552602737762, + "learning_rate": 1.0858631631392243e-05, + "loss": 0.0851, + "step": 15615 + }, + { + "epoch": 6.350549003660024, + "grad_norm": 4.746872682485304, + "learning_rate": 1.0857622180551798e-05, + "loss": 0.0842, + "step": 15616 + }, + { + "epoch": 6.35095567303782, + "grad_norm": 17.528034763006993, + "learning_rate": 1.0856612720907427e-05, + "loss": 1.2839, + "step": 15617 + }, + { + "epoch": 6.351362342415616, + "grad_norm": 4.82376661701252, + "learning_rate": 1.0855603252469498e-05, + "loss": 0.1945, + "step": 15618 + }, + { + "epoch": 6.351769011793412, + "grad_norm": 0.05897864411832308, + "learning_rate": 1.0854593775248372e-05, + "loss": 0.0012, + "step": 15619 + }, + { + "epoch": 6.352175681171207, + "grad_norm": 5.812397322401554, + "learning_rate": 1.0853584289254406e-05, + "loss": 0.1596, + "step": 15620 + }, + { + "epoch": 6.352582350549004, + "grad_norm": 6.586648043498574, + "learning_rate": 1.0852574794497972e-05, + "loss": 0.2165, + "step": 15621 + }, + { + "epoch": 6.3529890199268, + "grad_norm": 1.131096383668118, + "learning_rate": 1.085156529098943e-05, + "loss": 0.0173, + "step": 15622 + }, + { + "epoch": 
6.3533956893045955, + "grad_norm": 6.187006776666371, + "learning_rate": 1.0850555778739136e-05, + "loss": 0.2464, + "step": 15623 + }, + { + "epoch": 6.353802358682391, + "grad_norm": 1.8279087234702756, + "learning_rate": 1.0849546257757465e-05, + "loss": 0.0302, + "step": 15624 + }, + { + "epoch": 6.354209028060187, + "grad_norm": 0.957000998809496, + "learning_rate": 1.084853672805477e-05, + "loss": 0.014, + "step": 15625 + }, + { + "epoch": 6.354615697437983, + "grad_norm": 4.4735921020171014, + "learning_rate": 1.0847527189641418e-05, + "loss": 0.076, + "step": 15626 + }, + { + "epoch": 6.355022366815779, + "grad_norm": 3.971517982478601, + "learning_rate": 1.0846517642527774e-05, + "loss": 0.0391, + "step": 15627 + }, + { + "epoch": 6.355429036193574, + "grad_norm": 2.838608485895077, + "learning_rate": 1.0845508086724204e-05, + "loss": 0.0415, + "step": 15628 + }, + { + "epoch": 6.35583570557137, + "grad_norm": 6.082474790615124, + "learning_rate": 1.0844498522241063e-05, + "loss": 0.2755, + "step": 15629 + }, + { + "epoch": 6.356242374949167, + "grad_norm": 1.6479496314436783, + "learning_rate": 1.084348894908872e-05, + "loss": 0.0337, + "step": 15630 + }, + { + "epoch": 6.3566490443269625, + "grad_norm": 3.336805451510595, + "learning_rate": 1.084247936727754e-05, + "loss": 0.0848, + "step": 15631 + }, + { + "epoch": 6.357055713704758, + "grad_norm": 5.418958632902574, + "learning_rate": 1.0841469776817885e-05, + "loss": 0.1533, + "step": 15632 + }, + { + "epoch": 6.357462383082554, + "grad_norm": 6.970159267731751, + "learning_rate": 1.084046017772012e-05, + "loss": 0.1817, + "step": 15633 + }, + { + "epoch": 6.35786905246035, + "grad_norm": 1.6566007674693919, + "learning_rate": 1.0839450569994608e-05, + "loss": 0.0259, + "step": 15634 + }, + { + "epoch": 6.358275721838146, + "grad_norm": 5.606121137416774, + "learning_rate": 1.0838440953651714e-05, + "loss": 0.1645, + "step": 15635 + }, + { + "epoch": 6.358682391215941, + "grad_norm": 8.784119630410032, + "learning_rate": 1.08374313287018e-05, + "loss": 0.5107, + "step": 15636 + }, + { + "epoch": 6.359089060593737, + "grad_norm": 0.03276379848020007, + "learning_rate": 1.0836421695155233e-05, + "loss": 0.0002, + "step": 15637 + }, + { + "epoch": 6.359495729971533, + "grad_norm": 6.657215235911612, + "learning_rate": 1.0835412053022376e-05, + "loss": 0.055, + "step": 15638 + }, + { + "epoch": 6.359902399349329, + "grad_norm": 0.12931736393720816, + "learning_rate": 1.083440240231359e-05, + "loss": 0.0016, + "step": 15639 + }, + { + "epoch": 6.360309068727124, + "grad_norm": 7.306327918161752, + "learning_rate": 1.0833392743039248e-05, + "loss": 0.049, + "step": 15640 + }, + { + "epoch": 6.360715738104921, + "grad_norm": 0.35115756502492984, + "learning_rate": 1.0832383075209714e-05, + "loss": 0.0071, + "step": 15641 + }, + { + "epoch": 6.361122407482717, + "grad_norm": 0.11923375782814458, + "learning_rate": 1.083137339883534e-05, + "loss": 0.0013, + "step": 15642 + }, + { + "epoch": 6.361529076860513, + "grad_norm": 0.5606477210892341, + "learning_rate": 1.0830363713926506e-05, + "loss": 0.0052, + "step": 15643 + }, + { + "epoch": 6.361935746238308, + "grad_norm": 6.55451010366233, + "learning_rate": 1.082935402049357e-05, + "loss": 0.1074, + "step": 15644 + }, + { + "epoch": 6.362342415616104, + "grad_norm": 1.2314455819262, + "learning_rate": 1.0828344318546892e-05, + "loss": 0.0141, + "step": 15645 + }, + { + "epoch": 6.3627490849939, + "grad_norm": 6.707697557118306, + "learning_rate": 1.0827334608096849e-05, + "loss": 
0.3503, + "step": 15646 + }, + { + "epoch": 6.363155754371696, + "grad_norm": 0.2304724126336981, + "learning_rate": 1.0826324889153796e-05, + "loss": 0.0029, + "step": 15647 + }, + { + "epoch": 6.363562423749491, + "grad_norm": 1.0863916663642847, + "learning_rate": 1.0825315161728103e-05, + "loss": 0.0143, + "step": 15648 + }, + { + "epoch": 6.363969093127287, + "grad_norm": 8.874756963980126, + "learning_rate": 1.0824305425830135e-05, + "loss": 0.1646, + "step": 15649 + }, + { + "epoch": 6.364375762505083, + "grad_norm": 2.96329711572396, + "learning_rate": 1.0823295681470258e-05, + "loss": 0.043, + "step": 15650 + }, + { + "epoch": 6.36478243188288, + "grad_norm": 2.701485580702701, + "learning_rate": 1.0822285928658836e-05, + "loss": 0.0729, + "step": 15651 + }, + { + "epoch": 6.365189101260675, + "grad_norm": 3.923991204547538, + "learning_rate": 1.0821276167406233e-05, + "loss": 0.0713, + "step": 15652 + }, + { + "epoch": 6.365595770638471, + "grad_norm": 1.1932225266685412, + "learning_rate": 1.0820266397722818e-05, + "loss": 0.0171, + "step": 15653 + }, + { + "epoch": 6.366002440016267, + "grad_norm": 4.676632756126243, + "learning_rate": 1.0819256619618956e-05, + "loss": 0.0804, + "step": 15654 + }, + { + "epoch": 6.366409109394063, + "grad_norm": 2.3853248890180585, + "learning_rate": 1.0818246833105013e-05, + "loss": 0.032, + "step": 15655 + }, + { + "epoch": 6.3668157787718584, + "grad_norm": 0.3546757797499388, + "learning_rate": 1.0817237038191353e-05, + "loss": 0.0066, + "step": 15656 + }, + { + "epoch": 6.367222448149654, + "grad_norm": 4.298535084442226, + "learning_rate": 1.0816227234888345e-05, + "loss": 0.0753, + "step": 15657 + }, + { + "epoch": 6.36762911752745, + "grad_norm": 6.1516518255215535, + "learning_rate": 1.0815217423206352e-05, + "loss": 0.0891, + "step": 15658 + }, + { + "epoch": 6.368035786905246, + "grad_norm": 0.4732835100927212, + "learning_rate": 1.0814207603155743e-05, + "loss": 0.0065, + "step": 15659 + }, + { + "epoch": 6.3684424562830415, + "grad_norm": 1.0773495238019664, + "learning_rate": 1.0813197774746885e-05, + "loss": 0.0173, + "step": 15660 + }, + { + "epoch": 6.368849125660837, + "grad_norm": 4.014090426402805, + "learning_rate": 1.0812187937990138e-05, + "loss": 0.0751, + "step": 15661 + }, + { + "epoch": 6.369255795038634, + "grad_norm": 9.435035235091723, + "learning_rate": 1.0811178092895876e-05, + "loss": 0.3671, + "step": 15662 + }, + { + "epoch": 6.36966246441643, + "grad_norm": 0.683592975415915, + "learning_rate": 1.0810168239474464e-05, + "loss": 0.0091, + "step": 15663 + }, + { + "epoch": 6.3700691337942255, + "grad_norm": 9.256525293730341, + "learning_rate": 1.0809158377736264e-05, + "loss": 0.2556, + "step": 15664 + }, + { + "epoch": 6.370475803172021, + "grad_norm": 0.08232457576317288, + "learning_rate": 1.0808148507691647e-05, + "loss": 0.0007, + "step": 15665 + }, + { + "epoch": 6.370882472549817, + "grad_norm": 7.660153159676357, + "learning_rate": 1.080713862935098e-05, + "loss": 0.2307, + "step": 15666 + }, + { + "epoch": 6.371289141927613, + "grad_norm": 1.6938953791559732, + "learning_rate": 1.0806128742724629e-05, + "loss": 0.0165, + "step": 15667 + }, + { + "epoch": 6.3716958113054085, + "grad_norm": 0.36610243167341255, + "learning_rate": 1.0805118847822958e-05, + "loss": 0.0045, + "step": 15668 + }, + { + "epoch": 6.372102480683204, + "grad_norm": 0.8402824585858002, + "learning_rate": 1.0804108944656338e-05, + "loss": 0.0196, + "step": 15669 + }, + { + "epoch": 6.372509150061, + "grad_norm": 8.773570276276427, 
+ "learning_rate": 1.0803099033235136e-05, + "loss": 0.1451, + "step": 15670 + }, + { + "epoch": 6.372915819438797, + "grad_norm": 5.682471654427691, + "learning_rate": 1.0802089113569716e-05, + "loss": 0.122, + "step": 15671 + }, + { + "epoch": 6.3733224888165925, + "grad_norm": 7.578033557566274, + "learning_rate": 1.080107918567045e-05, + "loss": 0.225, + "step": 15672 + }, + { + "epoch": 6.373729158194388, + "grad_norm": 7.259845920495343, + "learning_rate": 1.0800069249547704e-05, + "loss": 0.2332, + "step": 15673 + }, + { + "epoch": 6.374135827572184, + "grad_norm": 14.302264112977108, + "learning_rate": 1.079905930521184e-05, + "loss": 0.1138, + "step": 15674 + }, + { + "epoch": 6.37454249694998, + "grad_norm": 0.9748808651304095, + "learning_rate": 1.0798049352673235e-05, + "loss": 0.0191, + "step": 15675 + }, + { + "epoch": 6.3749491663277755, + "grad_norm": 0.5699910396576399, + "learning_rate": 1.079703939194225e-05, + "loss": 0.0101, + "step": 15676 + }, + { + "epoch": 6.375355835705571, + "grad_norm": 2.8092284042599456, + "learning_rate": 1.0796029423029256e-05, + "loss": 0.0519, + "step": 15677 + }, + { + "epoch": 6.375762505083367, + "grad_norm": 0.11480512535867476, + "learning_rate": 1.0795019445944615e-05, + "loss": 0.0015, + "step": 15678 + }, + { + "epoch": 6.376169174461163, + "grad_norm": 0.16218296322011058, + "learning_rate": 1.0794009460698703e-05, + "loss": 0.0023, + "step": 15679 + }, + { + "epoch": 6.376575843838959, + "grad_norm": 2.1792491707428505, + "learning_rate": 1.0792999467301884e-05, + "loss": 0.0345, + "step": 15680 + }, + { + "epoch": 6.376982513216754, + "grad_norm": 0.0482256778008119, + "learning_rate": 1.0791989465764523e-05, + "loss": 0.0006, + "step": 15681 + }, + { + "epoch": 6.377389182594551, + "grad_norm": 11.842307503657583, + "learning_rate": 1.0790979456096994e-05, + "loss": 0.3499, + "step": 15682 + }, + { + "epoch": 6.377795851972347, + "grad_norm": 4.774180903040508, + "learning_rate": 1.0789969438309664e-05, + "loss": 0.0679, + "step": 15683 + }, + { + "epoch": 6.3782025213501425, + "grad_norm": 0.07190204847834403, + "learning_rate": 1.0788959412412896e-05, + "loss": 0.0009, + "step": 15684 + }, + { + "epoch": 6.378609190727938, + "grad_norm": 8.322609419616189, + "learning_rate": 1.0787949378417066e-05, + "loss": 0.1533, + "step": 15685 + }, + { + "epoch": 6.379015860105734, + "grad_norm": 0.5429135181273371, + "learning_rate": 1.078693933633254e-05, + "loss": 0.0061, + "step": 15686 + }, + { + "epoch": 6.37942252948353, + "grad_norm": 0.11619854866457911, + "learning_rate": 1.0785929286169685e-05, + "loss": 0.0011, + "step": 15687 + }, + { + "epoch": 6.379829198861326, + "grad_norm": 0.04398207832010943, + "learning_rate": 1.0784919227938868e-05, + "loss": 0.0006, + "step": 15688 + }, + { + "epoch": 6.380235868239121, + "grad_norm": 0.31788570265736604, + "learning_rate": 1.0783909161650461e-05, + "loss": 0.0048, + "step": 15689 + }, + { + "epoch": 6.380642537616917, + "grad_norm": 2.092265366704933, + "learning_rate": 1.0782899087314833e-05, + "loss": 0.0336, + "step": 15690 + }, + { + "epoch": 6.381049206994713, + "grad_norm": 5.355352504464721, + "learning_rate": 1.0781889004942352e-05, + "loss": 0.0926, + "step": 15691 + }, + { + "epoch": 6.3814558763725096, + "grad_norm": 2.1384036085325033, + "learning_rate": 1.0780878914543386e-05, + "loss": 0.0362, + "step": 15692 + }, + { + "epoch": 6.381862545750305, + "grad_norm": 1.7482052805106978, + "learning_rate": 1.0779868816128305e-05, + "loss": 0.0229, + "step": 15693 + }, + { 
+ "epoch": 6.382269215128101, + "grad_norm": 8.218467271042966, + "learning_rate": 1.0778858709707478e-05, + "loss": 0.1506, + "step": 15694 + }, + { + "epoch": 6.382675884505897, + "grad_norm": 6.377938217848945, + "learning_rate": 1.0777848595291276e-05, + "loss": 0.1406, + "step": 15695 + }, + { + "epoch": 6.383082553883693, + "grad_norm": 3.367058712470929, + "learning_rate": 1.0776838472890065e-05, + "loss": 0.0705, + "step": 15696 + }, + { + "epoch": 6.383489223261488, + "grad_norm": 9.379688918032421, + "learning_rate": 1.0775828342514214e-05, + "loss": 0.1501, + "step": 15697 + }, + { + "epoch": 6.383895892639284, + "grad_norm": 0.021220282135233883, + "learning_rate": 1.0774818204174098e-05, + "loss": 0.0003, + "step": 15698 + }, + { + "epoch": 6.38430256201708, + "grad_norm": 3.475754234834283, + "learning_rate": 1.0773808057880084e-05, + "loss": 0.2599, + "step": 15699 + }, + { + "epoch": 6.384709231394876, + "grad_norm": 1.2025141723928787, + "learning_rate": 1.0772797903642536e-05, + "loss": 0.0189, + "step": 15700 + }, + { + "epoch": 6.385115900772671, + "grad_norm": 5.113440025998506, + "learning_rate": 1.0771787741471832e-05, + "loss": 0.0941, + "step": 15701 + }, + { + "epoch": 6.385522570150467, + "grad_norm": 5.841823341913252, + "learning_rate": 1.0770777571378339e-05, + "loss": 0.2215, + "step": 15702 + }, + { + "epoch": 6.385929239528264, + "grad_norm": 7.733109367709774, + "learning_rate": 1.0769767393372424e-05, + "loss": 0.2394, + "step": 15703 + }, + { + "epoch": 6.38633590890606, + "grad_norm": 12.110153214901576, + "learning_rate": 1.0768757207464462e-05, + "loss": 0.7688, + "step": 15704 + }, + { + "epoch": 6.386742578283855, + "grad_norm": 6.185332102192504, + "learning_rate": 1.0767747013664817e-05, + "loss": 0.1221, + "step": 15705 + }, + { + "epoch": 6.387149247661651, + "grad_norm": 12.067110180042143, + "learning_rate": 1.0766736811983864e-05, + "loss": 0.3898, + "step": 15706 + }, + { + "epoch": 6.387555917039447, + "grad_norm": 0.25665347868058863, + "learning_rate": 1.0765726602431974e-05, + "loss": 0.0049, + "step": 15707 + }, + { + "epoch": 6.387962586417243, + "grad_norm": 1.8289643947217784, + "learning_rate": 1.0764716385019513e-05, + "loss": 0.035, + "step": 15708 + }, + { + "epoch": 6.3883692557950384, + "grad_norm": 4.2009337007863525, + "learning_rate": 1.0763706159756854e-05, + "loss": 0.0861, + "step": 15709 + }, + { + "epoch": 6.388775925172834, + "grad_norm": 5.639073166344317, + "learning_rate": 1.0762695926654365e-05, + "loss": 0.1011, + "step": 15710 + }, + { + "epoch": 6.38918259455063, + "grad_norm": 6.812230086160785, + "learning_rate": 1.076168568572242e-05, + "loss": 0.2343, + "step": 15711 + }, + { + "epoch": 6.389589263928427, + "grad_norm": 3.3334009003331873, + "learning_rate": 1.076067543697139e-05, + "loss": 0.0506, + "step": 15712 + }, + { + "epoch": 6.389995933306222, + "grad_norm": 2.108153593339897, + "learning_rate": 1.075966518041164e-05, + "loss": 0.0306, + "step": 15713 + }, + { + "epoch": 6.390402602684018, + "grad_norm": 0.3264916040984237, + "learning_rate": 1.0758654916053546e-05, + "loss": 0.0053, + "step": 15714 + }, + { + "epoch": 6.390809272061814, + "grad_norm": 17.985072124899716, + "learning_rate": 1.075764464390748e-05, + "loss": 0.3496, + "step": 15715 + }, + { + "epoch": 6.39121594143961, + "grad_norm": 1.1748568982868979, + "learning_rate": 1.0756634363983807e-05, + "loss": 0.0165, + "step": 15716 + }, + { + "epoch": 6.3916226108174055, + "grad_norm": 1.8596252718881117, + "learning_rate": 
1.0755624076292903e-05, + "loss": 0.0243, + "step": 15717 + }, + { + "epoch": 6.392029280195201, + "grad_norm": 0.16878668211066403, + "learning_rate": 1.0754613780845138e-05, + "loss": 0.004, + "step": 15718 + }, + { + "epoch": 6.392435949572997, + "grad_norm": 0.13415060707602086, + "learning_rate": 1.0753603477650881e-05, + "loss": 0.0017, + "step": 15719 + }, + { + "epoch": 6.392842618950793, + "grad_norm": 2.5096990608128777, + "learning_rate": 1.0752593166720506e-05, + "loss": 0.0366, + "step": 15720 + }, + { + "epoch": 6.3932492883285885, + "grad_norm": 0.51573559325049, + "learning_rate": 1.0751582848064384e-05, + "loss": 0.0089, + "step": 15721 + }, + { + "epoch": 6.393655957706384, + "grad_norm": 0.05431154033047648, + "learning_rate": 1.0750572521692886e-05, + "loss": 0.0009, + "step": 15722 + }, + { + "epoch": 6.394062627084181, + "grad_norm": 4.98660364194825, + "learning_rate": 1.074956218761638e-05, + "loss": 0.1089, + "step": 15723 + }, + { + "epoch": 6.394469296461977, + "grad_norm": 7.292306039791082, + "learning_rate": 1.0748551845845246e-05, + "loss": 0.2054, + "step": 15724 + }, + { + "epoch": 6.3948759658397725, + "grad_norm": 0.07225773291597476, + "learning_rate": 1.0747541496389848e-05, + "loss": 0.001, + "step": 15725 + }, + { + "epoch": 6.395282635217568, + "grad_norm": 0.10412473493505679, + "learning_rate": 1.0746531139260559e-05, + "loss": 0.0012, + "step": 15726 + }, + { + "epoch": 6.395689304595364, + "grad_norm": 8.733564832621418, + "learning_rate": 1.0745520774467752e-05, + "loss": 0.1962, + "step": 15727 + }, + { + "epoch": 6.39609597397316, + "grad_norm": 1.5361411310530813, + "learning_rate": 1.0744510402021802e-05, + "loss": 0.0199, + "step": 15728 + }, + { + "epoch": 6.3965026433509555, + "grad_norm": 4.216781783927251, + "learning_rate": 1.0743500021933076e-05, + "loss": 0.096, + "step": 15729 + }, + { + "epoch": 6.396909312728751, + "grad_norm": 6.254292530412173, + "learning_rate": 1.0742489634211947e-05, + "loss": 0.1969, + "step": 15730 + }, + { + "epoch": 6.397315982106547, + "grad_norm": 1.6322968029149867, + "learning_rate": 1.0741479238868793e-05, + "loss": 0.0209, + "step": 15731 + }, + { + "epoch": 6.397722651484343, + "grad_norm": 6.78659727565956, + "learning_rate": 1.0740468835913975e-05, + "loss": 0.0987, + "step": 15732 + }, + { + "epoch": 6.3981293208621395, + "grad_norm": 2.276028723584685, + "learning_rate": 1.0739458425357875e-05, + "loss": 0.0386, + "step": 15733 + }, + { + "epoch": 6.398535990239935, + "grad_norm": 11.030802440074234, + "learning_rate": 1.0738448007210864e-05, + "loss": 0.6377, + "step": 15734 + }, + { + "epoch": 6.398942659617731, + "grad_norm": 0.1032590060871227, + "learning_rate": 1.0737437581483312e-05, + "loss": 0.0017, + "step": 15735 + }, + { + "epoch": 6.399349328995527, + "grad_norm": 0.23706072152237642, + "learning_rate": 1.0736427148185588e-05, + "loss": 0.003, + "step": 15736 + }, + { + "epoch": 6.3997559983733225, + "grad_norm": 6.626053059740049, + "learning_rate": 1.0735416707328072e-05, + "loss": 0.1375, + "step": 15737 + }, + { + "epoch": 6.400162667751118, + "grad_norm": 3.1687406289214075, + "learning_rate": 1.0734406258921133e-05, + "loss": 0.0659, + "step": 15738 + }, + { + "epoch": 6.400569337128914, + "grad_norm": 0.034442933861954454, + "learning_rate": 1.0733395802975144e-05, + "loss": 0.0008, + "step": 15739 + }, + { + "epoch": 6.40097600650671, + "grad_norm": 0.20662443876632314, + "learning_rate": 1.0732385339500478e-05, + "loss": 0.0028, + "step": 15740 + }, + { + "epoch": 
6.401382675884506, + "grad_norm": 0.5518898442230163, + "learning_rate": 1.0731374868507508e-05, + "loss": 0.0103, + "step": 15741 + }, + { + "epoch": 6.401789345262301, + "grad_norm": 4.800538852790082, + "learning_rate": 1.0730364390006606e-05, + "loss": 0.0493, + "step": 15742 + }, + { + "epoch": 6.402196014640097, + "grad_norm": 0.7257640124315788, + "learning_rate": 1.0729353904008148e-05, + "loss": 0.0148, + "step": 15743 + }, + { + "epoch": 6.402602684017894, + "grad_norm": 0.9265117913426224, + "learning_rate": 1.0728343410522507e-05, + "loss": 0.0115, + "step": 15744 + }, + { + "epoch": 6.4030093533956896, + "grad_norm": 6.39942344293441, + "learning_rate": 1.0727332909560046e-05, + "loss": 0.2683, + "step": 15745 + }, + { + "epoch": 6.403416022773485, + "grad_norm": 1.208401494345437, + "learning_rate": 1.0726322401131152e-05, + "loss": 0.0307, + "step": 15746 + }, + { + "epoch": 6.403822692151281, + "grad_norm": 8.62213673092381, + "learning_rate": 1.0725311885246194e-05, + "loss": 0.3325, + "step": 15747 + }, + { + "epoch": 6.404229361529077, + "grad_norm": 3.066209598479977, + "learning_rate": 1.0724301361915543e-05, + "loss": 0.0326, + "step": 15748 + }, + { + "epoch": 6.404636030906873, + "grad_norm": 5.814853501782587, + "learning_rate": 1.0723290831149576e-05, + "loss": 0.1695, + "step": 15749 + }, + { + "epoch": 6.405042700284668, + "grad_norm": 1.5686621495802882, + "learning_rate": 1.0722280292958662e-05, + "loss": 0.0135, + "step": 15750 + }, + { + "epoch": 6.405449369662464, + "grad_norm": 3.2872495689435737, + "learning_rate": 1.0721269747353175e-05, + "loss": 0.0498, + "step": 15751 + }, + { + "epoch": 6.40585603904026, + "grad_norm": 0.05092256962781737, + "learning_rate": 1.0720259194343496e-05, + "loss": 0.001, + "step": 15752 + }, + { + "epoch": 6.406262708418057, + "grad_norm": 3.9754756689884796, + "learning_rate": 1.0719248633939992e-05, + "loss": 0.0587, + "step": 15753 + }, + { + "epoch": 6.406669377795852, + "grad_norm": 4.4918855049893835, + "learning_rate": 1.071823806615304e-05, + "loss": 0.1121, + "step": 15754 + }, + { + "epoch": 6.407076047173648, + "grad_norm": 0.5680852819824687, + "learning_rate": 1.0717227490993009e-05, + "loss": 0.0047, + "step": 15755 + }, + { + "epoch": 6.407482716551444, + "grad_norm": 0.11337048688175191, + "learning_rate": 1.0716216908470281e-05, + "loss": 0.001, + "step": 15756 + }, + { + "epoch": 6.40788938592924, + "grad_norm": 0.39246886190838415, + "learning_rate": 1.0715206318595224e-05, + "loss": 0.005, + "step": 15757 + }, + { + "epoch": 6.408296055307035, + "grad_norm": 0.4194521504112615, + "learning_rate": 1.0714195721378213e-05, + "loss": 0.0058, + "step": 15758 + }, + { + "epoch": 6.408702724684831, + "grad_norm": 1.6753494389863817, + "learning_rate": 1.0713185116829625e-05, + "loss": 0.022, + "step": 15759 + }, + { + "epoch": 6.409109394062627, + "grad_norm": 3.4786129371863916, + "learning_rate": 1.0712174504959834e-05, + "loss": 0.1695, + "step": 15760 + }, + { + "epoch": 6.409516063440423, + "grad_norm": 4.165471504604261, + "learning_rate": 1.071116388577921e-05, + "loss": 0.134, + "step": 15761 + }, + { + "epoch": 6.4099227328182184, + "grad_norm": 0.32234418759873784, + "learning_rate": 1.0710153259298133e-05, + "loss": 0.003, + "step": 15762 + }, + { + "epoch": 6.410329402196014, + "grad_norm": 0.04859801390330634, + "learning_rate": 1.0709142625526976e-05, + "loss": 0.0005, + "step": 15763 + }, + { + "epoch": 6.410736071573811, + "grad_norm": 7.988005520042209, + "learning_rate": 
1.0708131984476111e-05, + "loss": 0.1141, + "step": 15764 + }, + { + "epoch": 6.411142740951607, + "grad_norm": 1.0492923164609038, + "learning_rate": 1.0707121336155917e-05, + "loss": 0.0179, + "step": 15765 + }, + { + "epoch": 6.411549410329402, + "grad_norm": 4.2816130543355255, + "learning_rate": 1.0706110680576766e-05, + "loss": 0.0521, + "step": 15766 + }, + { + "epoch": 6.411956079707198, + "grad_norm": 0.12264774184431097, + "learning_rate": 1.0705100017749034e-05, + "loss": 0.0022, + "step": 15767 + }, + { + "epoch": 6.412362749084994, + "grad_norm": 1.0472281078662102, + "learning_rate": 1.0704089347683092e-05, + "loss": 0.0193, + "step": 15768 + }, + { + "epoch": 6.41276941846279, + "grad_norm": 10.059677215295515, + "learning_rate": 1.0703078670389323e-05, + "loss": 0.3554, + "step": 15769 + }, + { + "epoch": 6.4131760878405855, + "grad_norm": 5.696253166023497, + "learning_rate": 1.0702067985878095e-05, + "loss": 0.0738, + "step": 15770 + }, + { + "epoch": 6.413582757218381, + "grad_norm": 12.34852019485279, + "learning_rate": 1.0701057294159785e-05, + "loss": 0.6251, + "step": 15771 + }, + { + "epoch": 6.413989426596177, + "grad_norm": 6.509376282541316, + "learning_rate": 1.070004659524477e-05, + "loss": 0.1273, + "step": 15772 + }, + { + "epoch": 6.414396095973973, + "grad_norm": 2.3536031597922205, + "learning_rate": 1.0699035889143427e-05, + "loss": 0.0381, + "step": 15773 + }, + { + "epoch": 6.414802765351769, + "grad_norm": 5.086939849573491, + "learning_rate": 1.0698025175866125e-05, + "loss": 0.1019, + "step": 15774 + }, + { + "epoch": 6.415209434729565, + "grad_norm": 0.018915209852319502, + "learning_rate": 1.0697014455423245e-05, + "loss": 0.0003, + "step": 15775 + }, + { + "epoch": 6.415616104107361, + "grad_norm": 0.010812644520188864, + "learning_rate": 1.0696003727825161e-05, + "loss": 0.0001, + "step": 15776 + }, + { + "epoch": 6.416022773485157, + "grad_norm": 1.4436045220792666, + "learning_rate": 1.0694992993082247e-05, + "loss": 0.0177, + "step": 15777 + }, + { + "epoch": 6.4164294428629525, + "grad_norm": 0.07313369229074106, + "learning_rate": 1.0693982251204882e-05, + "loss": 0.0012, + "step": 15778 + }, + { + "epoch": 6.416836112240748, + "grad_norm": 1.769249675913228, + "learning_rate": 1.069297150220344e-05, + "loss": 0.0071, + "step": 15779 + }, + { + "epoch": 6.417242781618544, + "grad_norm": 2.7797132945458727, + "learning_rate": 1.0691960746088296e-05, + "loss": 0.0386, + "step": 15780 + }, + { + "epoch": 6.41764945099634, + "grad_norm": 1.27134755921981, + "learning_rate": 1.0690949982869826e-05, + "loss": 0.0131, + "step": 15781 + }, + { + "epoch": 6.4180561203741355, + "grad_norm": 3.9084428759446648, + "learning_rate": 1.0689939212558406e-05, + "loss": 0.0328, + "step": 15782 + }, + { + "epoch": 6.418462789751931, + "grad_norm": 8.17456866638967, + "learning_rate": 1.0688928435164417e-05, + "loss": 0.3405, + "step": 15783 + }, + { + "epoch": 6.418869459129727, + "grad_norm": 0.08604887159922861, + "learning_rate": 1.0687917650698225e-05, + "loss": 0.0005, + "step": 15784 + }, + { + "epoch": 6.419276128507524, + "grad_norm": 3.2268129208929284, + "learning_rate": 1.0686906859170217e-05, + "loss": 0.04, + "step": 15785 + }, + { + "epoch": 6.4196827978853195, + "grad_norm": 7.580840543130035, + "learning_rate": 1.0685896060590762e-05, + "loss": 0.2859, + "step": 15786 + }, + { + "epoch": 6.420089467263115, + "grad_norm": 4.032060743703601, + "learning_rate": 1.0684885254970238e-05, + "loss": 0.0631, + "step": 15787 + }, + { + "epoch": 
6.420496136640911, + "grad_norm": 0.06053462099852286, + "learning_rate": 1.0683874442319025e-05, + "loss": 0.0007, + "step": 15788 + }, + { + "epoch": 6.420902806018707, + "grad_norm": 4.128639212556069, + "learning_rate": 1.0682863622647495e-05, + "loss": 0.069, + "step": 15789 + }, + { + "epoch": 6.4213094753965025, + "grad_norm": 0.24280730007835663, + "learning_rate": 1.0681852795966023e-05, + "loss": 0.0024, + "step": 15790 + }, + { + "epoch": 6.421716144774298, + "grad_norm": 0.024167797268436153, + "learning_rate": 1.0680841962284995e-05, + "loss": 0.0003, + "step": 15791 + }, + { + "epoch": 6.422122814152094, + "grad_norm": 1.521949012527261, + "learning_rate": 1.067983112161478e-05, + "loss": 0.0195, + "step": 15792 + }, + { + "epoch": 6.42252948352989, + "grad_norm": 3.214757103921263, + "learning_rate": 1.0678820273965753e-05, + "loss": 0.0933, + "step": 15793 + }, + { + "epoch": 6.4229361529076865, + "grad_norm": 0.029060977801450852, + "learning_rate": 1.0677809419348297e-05, + "loss": 0.0003, + "step": 15794 + }, + { + "epoch": 6.423342822285482, + "grad_norm": 0.6990890847635002, + "learning_rate": 1.0676798557772787e-05, + "loss": 0.0092, + "step": 15795 + }, + { + "epoch": 6.423749491663278, + "grad_norm": 0.1829510241080996, + "learning_rate": 1.0675787689249594e-05, + "loss": 0.0026, + "step": 15796 + }, + { + "epoch": 6.424156161041074, + "grad_norm": 4.565687157515744, + "learning_rate": 1.0674776813789106e-05, + "loss": 0.0398, + "step": 15797 + }, + { + "epoch": 6.4245628304188696, + "grad_norm": 0.5157319244722782, + "learning_rate": 1.0673765931401695e-05, + "loss": 0.0102, + "step": 15798 + }, + { + "epoch": 6.424969499796665, + "grad_norm": 1.1257031800330097, + "learning_rate": 1.0672755042097733e-05, + "loss": 0.0126, + "step": 15799 + }, + { + "epoch": 6.425376169174461, + "grad_norm": 0.5172528957176342, + "learning_rate": 1.0671744145887603e-05, + "loss": 0.0053, + "step": 15800 + }, + { + "epoch": 6.425782838552257, + "grad_norm": 2.9902759604549227, + "learning_rate": 1.0670733242781684e-05, + "loss": 0.1237, + "step": 15801 + }, + { + "epoch": 6.426189507930053, + "grad_norm": 4.486435291885277, + "learning_rate": 1.0669722332790348e-05, + "loss": 0.0569, + "step": 15802 + }, + { + "epoch": 6.426596177307848, + "grad_norm": 7.851234335233114, + "learning_rate": 1.0668711415923974e-05, + "loss": 0.1099, + "step": 15803 + }, + { + "epoch": 6.427002846685644, + "grad_norm": 12.460838600890305, + "learning_rate": 1.0667700492192942e-05, + "loss": 0.2403, + "step": 15804 + }, + { + "epoch": 6.427409516063441, + "grad_norm": 0.399439770902897, + "learning_rate": 1.066668956160763e-05, + "loss": 0.0053, + "step": 15805 + }, + { + "epoch": 6.427816185441237, + "grad_norm": 1.6947391025537144, + "learning_rate": 1.0665678624178413e-05, + "loss": 0.0197, + "step": 15806 + }, + { + "epoch": 6.428222854819032, + "grad_norm": 9.69590817541989, + "learning_rate": 1.066466767991567e-05, + "loss": 0.1098, + "step": 15807 + }, + { + "epoch": 6.428629524196828, + "grad_norm": 6.314528648813888, + "learning_rate": 1.0663656728829781e-05, + "loss": 0.2816, + "step": 15808 + }, + { + "epoch": 6.429036193574624, + "grad_norm": 0.7123624963381883, + "learning_rate": 1.0662645770931118e-05, + "loss": 0.0097, + "step": 15809 + }, + { + "epoch": 6.42944286295242, + "grad_norm": 0.13723137147187145, + "learning_rate": 1.0661634806230063e-05, + "loss": 0.002, + "step": 15810 + }, + { + "epoch": 6.429849532330215, + "grad_norm": 1.9067453279255129, + "learning_rate": 
1.0660623834736996e-05, + "loss": 0.0251, + "step": 15811 + }, + { + "epoch": 6.430256201708011, + "grad_norm": 1.5486444790981695, + "learning_rate": 1.0659612856462294e-05, + "loss": 0.0207, + "step": 15812 + }, + { + "epoch": 6.430662871085807, + "grad_norm": 1.543052061237086, + "learning_rate": 1.065860187141633e-05, + "loss": 0.0236, + "step": 15813 + }, + { + "epoch": 6.431069540463603, + "grad_norm": 8.418905595040362, + "learning_rate": 1.0657590879609488e-05, + "loss": 0.3499, + "step": 15814 + }, + { + "epoch": 6.431476209841399, + "grad_norm": 0.31116425730572966, + "learning_rate": 1.0656579881052145e-05, + "loss": 0.0034, + "step": 15815 + }, + { + "epoch": 6.431882879219195, + "grad_norm": 0.15840027035794602, + "learning_rate": 1.0655568875754678e-05, + "loss": 0.0012, + "step": 15816 + }, + { + "epoch": 6.432289548596991, + "grad_norm": 2.806856195465225, + "learning_rate": 1.0654557863727469e-05, + "loss": 0.0592, + "step": 15817 + }, + { + "epoch": 6.432696217974787, + "grad_norm": 0.22289358755761873, + "learning_rate": 1.0653546844980891e-05, + "loss": 0.0024, + "step": 15818 + }, + { + "epoch": 6.433102887352582, + "grad_norm": 3.2891483977546714, + "learning_rate": 1.0652535819525328e-05, + "loss": 0.078, + "step": 15819 + }, + { + "epoch": 6.433509556730378, + "grad_norm": 12.772861546763444, + "learning_rate": 1.0651524787371155e-05, + "loss": 0.3317, + "step": 15820 + }, + { + "epoch": 6.433916226108174, + "grad_norm": 0.024073551575326393, + "learning_rate": 1.0650513748528755e-05, + "loss": 0.0004, + "step": 15821 + }, + { + "epoch": 6.43432289548597, + "grad_norm": 7.394876238800152, + "learning_rate": 1.0649502703008499e-05, + "loss": 0.1219, + "step": 15822 + }, + { + "epoch": 6.4347295648637655, + "grad_norm": 1.3119174502738469, + "learning_rate": 1.0648491650820774e-05, + "loss": 0.015, + "step": 15823 + }, + { + "epoch": 6.435136234241561, + "grad_norm": 0.7971986274607838, + "learning_rate": 1.0647480591975956e-05, + "loss": 0.0113, + "step": 15824 + }, + { + "epoch": 6.435542903619357, + "grad_norm": 9.63444028761721, + "learning_rate": 1.0646469526484423e-05, + "loss": 0.1734, + "step": 15825 + }, + { + "epoch": 6.435949572997154, + "grad_norm": 0.1483637404304447, + "learning_rate": 1.0645458454356554e-05, + "loss": 0.0024, + "step": 15826 + }, + { + "epoch": 6.436356242374949, + "grad_norm": 4.674490662003217, + "learning_rate": 1.0644447375602731e-05, + "loss": 0.0801, + "step": 15827 + }, + { + "epoch": 6.436762911752745, + "grad_norm": 1.3196776910649812, + "learning_rate": 1.0643436290233331e-05, + "loss": 0.0077, + "step": 15828 + }, + { + "epoch": 6.437169581130541, + "grad_norm": 8.780346464189236, + "learning_rate": 1.0642425198258732e-05, + "loss": 0.1178, + "step": 15829 + }, + { + "epoch": 6.437576250508337, + "grad_norm": 0.3591428528367536, + "learning_rate": 1.0641414099689318e-05, + "loss": 0.0065, + "step": 15830 + }, + { + "epoch": 6.4379829198861325, + "grad_norm": 0.24370770713941803, + "learning_rate": 1.0640402994535464e-05, + "loss": 0.0046, + "step": 15831 + }, + { + "epoch": 6.438389589263928, + "grad_norm": 5.524588684638567, + "learning_rate": 1.063939188280755e-05, + "loss": 0.2089, + "step": 15832 + }, + { + "epoch": 6.438796258641724, + "grad_norm": 0.4321356571472994, + "learning_rate": 1.0638380764515956e-05, + "loss": 0.0059, + "step": 15833 + }, + { + "epoch": 6.43920292801952, + "grad_norm": 6.0556952975986595, + "learning_rate": 1.0637369639671065e-05, + "loss": 0.1083, + "step": 15834 + }, + { + "epoch": 
6.439609597397316, + "grad_norm": 8.285225793293943, + "learning_rate": 1.063635850828325e-05, + "loss": 0.328, + "step": 15835 + }, + { + "epoch": 6.440016266775112, + "grad_norm": 3.404452189466092, + "learning_rate": 1.0635347370362898e-05, + "loss": 0.09, + "step": 15836 + }, + { + "epoch": 6.440422936152908, + "grad_norm": 3.578570476059452, + "learning_rate": 1.0634336225920385e-05, + "loss": 0.0873, + "step": 15837 + }, + { + "epoch": 6.440829605530704, + "grad_norm": 0.41046421053626, + "learning_rate": 1.0633325074966092e-05, + "loss": 0.0037, + "step": 15838 + }, + { + "epoch": 6.4412362749084995, + "grad_norm": 4.000095214842074, + "learning_rate": 1.0632313917510395e-05, + "loss": 0.0846, + "step": 15839 + }, + { + "epoch": 6.441642944286295, + "grad_norm": 3.4331772576613155, + "learning_rate": 1.0631302753563681e-05, + "loss": 0.0646, + "step": 15840 + }, + { + "epoch": 6.442049613664091, + "grad_norm": 9.390896803560898, + "learning_rate": 1.0630291583136325e-05, + "loss": 0.2901, + "step": 15841 + }, + { + "epoch": 6.442456283041887, + "grad_norm": 0.7226776100760229, + "learning_rate": 1.0629280406238711e-05, + "loss": 0.009, + "step": 15842 + }, + { + "epoch": 6.4428629524196825, + "grad_norm": 12.073162262184624, + "learning_rate": 1.0628269222881216e-05, + "loss": 0.1648, + "step": 15843 + }, + { + "epoch": 6.443269621797478, + "grad_norm": 1.8961564688060422, + "learning_rate": 1.062725803307422e-05, + "loss": 0.0202, + "step": 15844 + }, + { + "epoch": 6.443676291175274, + "grad_norm": 0.023790045526020896, + "learning_rate": 1.0626246836828102e-05, + "loss": 0.0003, + "step": 15845 + }, + { + "epoch": 6.444082960553071, + "grad_norm": 3.7794915845280177, + "learning_rate": 1.0625235634153248e-05, + "loss": 0.0433, + "step": 15846 + }, + { + "epoch": 6.4444896299308665, + "grad_norm": 0.27791023129380427, + "learning_rate": 1.0624224425060037e-05, + "loss": 0.0045, + "step": 15847 + }, + { + "epoch": 6.444896299308662, + "grad_norm": 0.14134885337041173, + "learning_rate": 1.0623213209558845e-05, + "loss": 0.0025, + "step": 15848 + }, + { + "epoch": 6.445302968686458, + "grad_norm": 0.41528332013641206, + "learning_rate": 1.0622201987660057e-05, + "loss": 0.0046, + "step": 15849 + }, + { + "epoch": 6.445709638064254, + "grad_norm": 0.7896177809970737, + "learning_rate": 1.0621190759374052e-05, + "loss": 0.0096, + "step": 15850 + }, + { + "epoch": 6.4461163074420496, + "grad_norm": 0.011729373438425315, + "learning_rate": 1.0620179524711211e-05, + "loss": 0.0002, + "step": 15851 + }, + { + "epoch": 6.446522976819845, + "grad_norm": 0.9876608232931482, + "learning_rate": 1.0619168283681916e-05, + "loss": 0.014, + "step": 15852 + }, + { + "epoch": 6.446929646197641, + "grad_norm": 3.9791838548719287, + "learning_rate": 1.0618157036296548e-05, + "loss": 0.1007, + "step": 15853 + }, + { + "epoch": 6.447336315575437, + "grad_norm": 0.2856062773758719, + "learning_rate": 1.0617145782565483e-05, + "loss": 0.0043, + "step": 15854 + }, + { + "epoch": 6.447742984953233, + "grad_norm": 13.262925221364881, + "learning_rate": 1.0616134522499109e-05, + "loss": 0.6625, + "step": 15855 + }, + { + "epoch": 6.448149654331029, + "grad_norm": 0.1410838507916841, + "learning_rate": 1.0615123256107805e-05, + "loss": 0.0021, + "step": 15856 + }, + { + "epoch": 6.448556323708825, + "grad_norm": 5.349790830962602, + "learning_rate": 1.0614111983401949e-05, + "loss": 0.1255, + "step": 15857 + }, + { + "epoch": 6.448962993086621, + "grad_norm": 1.6784580768618784, + "learning_rate": 
1.0613100704391923e-05, + "loss": 0.0285, + "step": 15858 + }, + { + "epoch": 6.449369662464417, + "grad_norm": 0.12424001838895897, + "learning_rate": 1.0612089419088114e-05, + "loss": 0.0021, + "step": 15859 + }, + { + "epoch": 6.449776331842212, + "grad_norm": 2.592926552843799, + "learning_rate": 1.0611078127500897e-05, + "loss": 0.0241, + "step": 15860 + }, + { + "epoch": 6.450183001220008, + "grad_norm": 0.5164404390889017, + "learning_rate": 1.0610066829640653e-05, + "loss": 0.0069, + "step": 15861 + }, + { + "epoch": 6.450589670597804, + "grad_norm": 2.097522434747201, + "learning_rate": 1.060905552551777e-05, + "loss": 0.0243, + "step": 15862 + }, + { + "epoch": 6.4509963399756, + "grad_norm": 3.2303028588789076, + "learning_rate": 1.0608044215142622e-05, + "loss": 0.0342, + "step": 15863 + }, + { + "epoch": 6.451403009353395, + "grad_norm": 0.23979602435995453, + "learning_rate": 1.0607032898525596e-05, + "loss": 0.0038, + "step": 15864 + }, + { + "epoch": 6.451809678731191, + "grad_norm": 0.3028751940686876, + "learning_rate": 1.0606021575677073e-05, + "loss": 0.0035, + "step": 15865 + }, + { + "epoch": 6.452216348108988, + "grad_norm": 5.117025735169192, + "learning_rate": 1.0605010246607432e-05, + "loss": 0.1226, + "step": 15866 + }, + { + "epoch": 6.452623017486784, + "grad_norm": 0.3259108610813429, + "learning_rate": 1.0603998911327056e-05, + "loss": 0.0043, + "step": 15867 + }, + { + "epoch": 6.453029686864579, + "grad_norm": 14.900318062383466, + "learning_rate": 1.0602987569846327e-05, + "loss": 0.6121, + "step": 15868 + }, + { + "epoch": 6.453436356242375, + "grad_norm": 4.05957760281398, + "learning_rate": 1.060197622217563e-05, + "loss": 0.1577, + "step": 15869 + }, + { + "epoch": 6.453843025620171, + "grad_norm": 2.7098118709255856, + "learning_rate": 1.0600964868325344e-05, + "loss": 0.0627, + "step": 15870 + }, + { + "epoch": 6.454249694997967, + "grad_norm": 4.159292091090483, + "learning_rate": 1.0599953508305845e-05, + "loss": 0.1198, + "step": 15871 + }, + { + "epoch": 6.454656364375762, + "grad_norm": 1.976923505117282, + "learning_rate": 1.0598942142127527e-05, + "loss": 0.0659, + "step": 15872 + }, + { + "epoch": 6.455063033753558, + "grad_norm": 7.597234227097311, + "learning_rate": 1.0597930769800765e-05, + "loss": 0.1431, + "step": 15873 + }, + { + "epoch": 6.455469703131354, + "grad_norm": 24.947506399388224, + "learning_rate": 1.059691939133594e-05, + "loss": 0.5226, + "step": 15874 + }, + { + "epoch": 6.45587637250915, + "grad_norm": 12.719135081401085, + "learning_rate": 1.0595908006743441e-05, + "loss": 0.5856, + "step": 15875 + }, + { + "epoch": 6.456283041886946, + "grad_norm": 1.2521942843897973, + "learning_rate": 1.0594896616033645e-05, + "loss": 0.0222, + "step": 15876 + }, + { + "epoch": 6.456689711264742, + "grad_norm": 0.03321765856310193, + "learning_rate": 1.0593885219216935e-05, + "loss": 0.0004, + "step": 15877 + }, + { + "epoch": 6.457096380642538, + "grad_norm": 11.598629089178862, + "learning_rate": 1.0592873816303697e-05, + "loss": 0.3705, + "step": 15878 + }, + { + "epoch": 6.457503050020334, + "grad_norm": 8.197531462347346, + "learning_rate": 1.059186240730431e-05, + "loss": 0.4334, + "step": 15879 + }, + { + "epoch": 6.457909719398129, + "grad_norm": 0.8231484196080509, + "learning_rate": 1.0590850992229154e-05, + "loss": 0.0133, + "step": 15880 + }, + { + "epoch": 6.458316388775925, + "grad_norm": 1.4571320470762408, + "learning_rate": 1.0589839571088619e-05, + "loss": 0.0157, + "step": 15881 + }, + { + "epoch": 
6.458723058153721, + "grad_norm": 0.3349929701143239, + "learning_rate": 1.0588828143893082e-05, + "loss": 0.004, + "step": 15882 + }, + { + "epoch": 6.459129727531517, + "grad_norm": 1.2238935446247132, + "learning_rate": 1.0587816710652929e-05, + "loss": 0.0129, + "step": 15883 + }, + { + "epoch": 6.4595363969093125, + "grad_norm": 2.6065449926068283, + "learning_rate": 1.058680527137854e-05, + "loss": 0.0432, + "step": 15884 + }, + { + "epoch": 6.459943066287108, + "grad_norm": 6.418011187641216, + "learning_rate": 1.0585793826080302e-05, + "loss": 0.2046, + "step": 15885 + }, + { + "epoch": 6.460349735664904, + "grad_norm": 8.176168012971218, + "learning_rate": 1.0584782374768594e-05, + "loss": 0.1395, + "step": 15886 + }, + { + "epoch": 6.460756405042701, + "grad_norm": 5.372458319155965, + "learning_rate": 1.05837709174538e-05, + "loss": 0.146, + "step": 15887 + }, + { + "epoch": 6.461163074420496, + "grad_norm": 3.213291671834225, + "learning_rate": 1.0582759454146306e-05, + "loss": 0.1067, + "step": 15888 + }, + { + "epoch": 6.461569743798292, + "grad_norm": 4.951418062819736, + "learning_rate": 1.058174798485649e-05, + "loss": 0.061, + "step": 15889 + }, + { + "epoch": 6.461976413176088, + "grad_norm": 7.056724204437663, + "learning_rate": 1.0580736509594739e-05, + "loss": 0.394, + "step": 15890 + }, + { + "epoch": 6.462383082553884, + "grad_norm": 3.8883060476519495, + "learning_rate": 1.057972502837144e-05, + "loss": 0.158, + "step": 15891 + }, + { + "epoch": 6.4627897519316795, + "grad_norm": 3.5594184887261724, + "learning_rate": 1.0578713541196968e-05, + "loss": 0.0632, + "step": 15892 + }, + { + "epoch": 6.463196421309475, + "grad_norm": 5.037327080016037, + "learning_rate": 1.0577702048081708e-05, + "loss": 0.1292, + "step": 15893 + }, + { + "epoch": 6.463603090687271, + "grad_norm": 1.9728182153141818, + "learning_rate": 1.0576690549036049e-05, + "loss": 0.0397, + "step": 15894 + }, + { + "epoch": 6.464009760065067, + "grad_norm": 2.715571568718469, + "learning_rate": 1.057567904407037e-05, + "loss": 0.0997, + "step": 15895 + }, + { + "epoch": 6.4644164294428625, + "grad_norm": 2.0996340680333714, + "learning_rate": 1.0574667533195053e-05, + "loss": 0.0351, + "step": 15896 + }, + { + "epoch": 6.464823098820659, + "grad_norm": 1.8754319107648685, + "learning_rate": 1.0573656016420489e-05, + "loss": 0.0372, + "step": 15897 + }, + { + "epoch": 6.465229768198455, + "grad_norm": 0.9893818391078125, + "learning_rate": 1.0572644493757057e-05, + "loss": 0.0133, + "step": 15898 + }, + { + "epoch": 6.465636437576251, + "grad_norm": 8.076168380422885, + "learning_rate": 1.0571632965215138e-05, + "loss": 0.3604, + "step": 15899 + }, + { + "epoch": 6.4660431069540465, + "grad_norm": 9.229882401167021, + "learning_rate": 1.0570621430805122e-05, + "loss": 0.2237, + "step": 15900 + }, + { + "epoch": 6.466449776331842, + "grad_norm": 0.9733305192066152, + "learning_rate": 1.0569609890537389e-05, + "loss": 0.0151, + "step": 15901 + }, + { + "epoch": 6.466856445709638, + "grad_norm": 9.108068519720288, + "learning_rate": 1.0568598344422323e-05, + "loss": 0.188, + "step": 15902 + }, + { + "epoch": 6.467263115087434, + "grad_norm": 3.5424163903922135, + "learning_rate": 1.0567586792470307e-05, + "loss": 0.0449, + "step": 15903 + }, + { + "epoch": 6.4676697844652296, + "grad_norm": 3.0038464596873764, + "learning_rate": 1.0566575234691731e-05, + "loss": 0.0429, + "step": 15904 + }, + { + "epoch": 6.468076453843025, + "grad_norm": 0.494609569975242, + "learning_rate": 1.0565563671096975e-05, + 
"loss": 0.0089, + "step": 15905 + }, + { + "epoch": 6.468483123220821, + "grad_norm": 8.036792055450007, + "learning_rate": 1.056455210169642e-05, + "loss": 0.2832, + "step": 15906 + }, + { + "epoch": 6.468889792598618, + "grad_norm": 0.15342228418332254, + "learning_rate": 1.0563540526500453e-05, + "loss": 0.0016, + "step": 15907 + }, + { + "epoch": 6.4692964619764135, + "grad_norm": 7.472642574195591, + "learning_rate": 1.0562528945519463e-05, + "loss": 0.3068, + "step": 15908 + }, + { + "epoch": 6.469703131354209, + "grad_norm": 1.4779209074852524, + "learning_rate": 1.0561517358763826e-05, + "loss": 0.0232, + "step": 15909 + }, + { + "epoch": 6.470109800732005, + "grad_norm": 4.426791826858464, + "learning_rate": 1.0560505766243934e-05, + "loss": 0.1219, + "step": 15910 + }, + { + "epoch": 6.470516470109801, + "grad_norm": 0.05103193116108658, + "learning_rate": 1.0559494167970169e-05, + "loss": 0.0008, + "step": 15911 + }, + { + "epoch": 6.470923139487597, + "grad_norm": 1.1745207888943228, + "learning_rate": 1.055848256395291e-05, + "loss": 0.0203, + "step": 15912 + }, + { + "epoch": 6.471329808865392, + "grad_norm": 1.500501012422257, + "learning_rate": 1.0557470954202549e-05, + "loss": 0.0213, + "step": 15913 + }, + { + "epoch": 6.471736478243188, + "grad_norm": 0.3718115321718891, + "learning_rate": 1.0556459338729471e-05, + "loss": 0.0055, + "step": 15914 + }, + { + "epoch": 6.472143147620984, + "grad_norm": 5.5804291335366845, + "learning_rate": 1.0555447717544053e-05, + "loss": 0.1524, + "step": 15915 + }, + { + "epoch": 6.47254981699878, + "grad_norm": 2.2693199894314993, + "learning_rate": 1.0554436090656686e-05, + "loss": 0.0388, + "step": 15916 + }, + { + "epoch": 6.472956486376576, + "grad_norm": 0.08134909496076997, + "learning_rate": 1.0553424458077754e-05, + "loss": 0.0019, + "step": 15917 + }, + { + "epoch": 6.473363155754372, + "grad_norm": 6.782661969988898, + "learning_rate": 1.0552412819817643e-05, + "loss": 0.2256, + "step": 15918 + }, + { + "epoch": 6.473769825132168, + "grad_norm": 4.775908852758182, + "learning_rate": 1.0551401175886735e-05, + "loss": 0.1192, + "step": 15919 + }, + { + "epoch": 6.474176494509964, + "grad_norm": 1.5561988339303785, + "learning_rate": 1.0550389526295417e-05, + "loss": 0.0208, + "step": 15920 + }, + { + "epoch": 6.474583163887759, + "grad_norm": 7.9789702683975765, + "learning_rate": 1.0549377871054075e-05, + "loss": 0.2662, + "step": 15921 + }, + { + "epoch": 6.474989833265555, + "grad_norm": 7.320798441191866, + "learning_rate": 1.0548366210173087e-05, + "loss": 0.3888, + "step": 15922 + }, + { + "epoch": 6.475396502643351, + "grad_norm": 0.400661331094688, + "learning_rate": 1.0547354543662848e-05, + "loss": 0.0068, + "step": 15923 + }, + { + "epoch": 6.475803172021147, + "grad_norm": 1.1708785229167364, + "learning_rate": 1.0546342871533739e-05, + "loss": 0.0115, + "step": 15924 + }, + { + "epoch": 6.476209841398942, + "grad_norm": 4.477143782355842, + "learning_rate": 1.0545331193796145e-05, + "loss": 0.0652, + "step": 15925 + }, + { + "epoch": 6.476616510776738, + "grad_norm": 2.499403722232045, + "learning_rate": 1.0544319510460452e-05, + "loss": 0.0323, + "step": 15926 + }, + { + "epoch": 6.477023180154534, + "grad_norm": 0.24473246699278553, + "learning_rate": 1.0543307821537046e-05, + "loss": 0.003, + "step": 15927 + }, + { + "epoch": 6.477429849532331, + "grad_norm": 0.814094069448449, + "learning_rate": 1.054229612703631e-05, + "loss": 0.0101, + "step": 15928 + }, + { + "epoch": 6.477836518910126, + "grad_norm": 
0.21200613769027857, + "learning_rate": 1.0541284426968632e-05, + "loss": 0.0032, + "step": 15929 + }, + { + "epoch": 6.478243188287922, + "grad_norm": 2.184194244018638, + "learning_rate": 1.0540272721344398e-05, + "loss": 0.0359, + "step": 15930 + }, + { + "epoch": 6.478649857665718, + "grad_norm": 7.23971116163744, + "learning_rate": 1.0539261010173991e-05, + "loss": 0.5964, + "step": 15931 + }, + { + "epoch": 6.479056527043514, + "grad_norm": 10.096686314146107, + "learning_rate": 1.05382492934678e-05, + "loss": 0.4344, + "step": 15932 + }, + { + "epoch": 6.479463196421309, + "grad_norm": 0.4360215639765039, + "learning_rate": 1.0537237571236207e-05, + "loss": 0.0063, + "step": 15933 + }, + { + "epoch": 6.479869865799105, + "grad_norm": 0.29874373350523953, + "learning_rate": 1.0536225843489603e-05, + "loss": 0.0039, + "step": 15934 + }, + { + "epoch": 6.480276535176901, + "grad_norm": 7.923345454129328, + "learning_rate": 1.0535214110238367e-05, + "loss": 0.1767, + "step": 15935 + }, + { + "epoch": 6.480683204554697, + "grad_norm": 9.114910552781453, + "learning_rate": 1.0534202371492893e-05, + "loss": 0.3565, + "step": 15936 + }, + { + "epoch": 6.4810898739324925, + "grad_norm": 14.12204957948248, + "learning_rate": 1.0533190627263558e-05, + "loss": 0.4659, + "step": 15937 + }, + { + "epoch": 6.481496543310289, + "grad_norm": 0.36868900346289496, + "learning_rate": 1.0532178877560755e-05, + "loss": 0.0027, + "step": 15938 + }, + { + "epoch": 6.481903212688085, + "grad_norm": 0.458372200706378, + "learning_rate": 1.053116712239487e-05, + "loss": 0.0079, + "step": 15939 + }, + { + "epoch": 6.482309882065881, + "grad_norm": 0.0191392369879095, + "learning_rate": 1.0530155361776286e-05, + "loss": 0.0003, + "step": 15940 + }, + { + "epoch": 6.482716551443676, + "grad_norm": 0.23683596412281666, + "learning_rate": 1.052914359571539e-05, + "loss": 0.003, + "step": 15941 + }, + { + "epoch": 6.483123220821472, + "grad_norm": 7.340395624868211, + "learning_rate": 1.052813182422257e-05, + "loss": 0.1843, + "step": 15942 + }, + { + "epoch": 6.483529890199268, + "grad_norm": 0.033521037325876406, + "learning_rate": 1.052712004730821e-05, + "loss": 0.0005, + "step": 15943 + }, + { + "epoch": 6.483936559577064, + "grad_norm": 4.423724476492671, + "learning_rate": 1.0526108264982695e-05, + "loss": 0.0872, + "step": 15944 + }, + { + "epoch": 6.4843432289548595, + "grad_norm": 0.16443674428692234, + "learning_rate": 1.0525096477256417e-05, + "loss": 0.0015, + "step": 15945 + }, + { + "epoch": 6.484749898332655, + "grad_norm": 1.2412119149266874, + "learning_rate": 1.052408468413976e-05, + "loss": 0.0174, + "step": 15946 + }, + { + "epoch": 6.485156567710451, + "grad_norm": 1.2570722917594184, + "learning_rate": 1.0523072885643109e-05, + "loss": 0.0203, + "step": 15947 + }, + { + "epoch": 6.485563237088248, + "grad_norm": 0.6051334787937034, + "learning_rate": 1.0522061081776849e-05, + "loss": 0.0077, + "step": 15948 + }, + { + "epoch": 6.485969906466043, + "grad_norm": 2.9496589120800776, + "learning_rate": 1.052104927255137e-05, + "loss": 0.081, + "step": 15949 + }, + { + "epoch": 6.486376575843839, + "grad_norm": 1.6457231513002815, + "learning_rate": 1.0520037457977061e-05, + "loss": 0.0178, + "step": 15950 + }, + { + "epoch": 6.486783245221635, + "grad_norm": 5.778858276140726, + "learning_rate": 1.0519025638064302e-05, + "loss": 0.0719, + "step": 15951 + }, + { + "epoch": 6.487189914599431, + "grad_norm": 5.336729110346965, + "learning_rate": 1.0518013812823486e-05, + "loss": 0.0652, + "step": 
15952 + }, + { + "epoch": 6.4875965839772265, + "grad_norm": 8.384593710390968, + "learning_rate": 1.0517001982264999e-05, + "loss": 0.0888, + "step": 15953 + }, + { + "epoch": 6.488003253355022, + "grad_norm": 1.7339207267857022, + "learning_rate": 1.0515990146399221e-05, + "loss": 0.039, + "step": 15954 + }, + { + "epoch": 6.488409922732818, + "grad_norm": 6.449913132723336, + "learning_rate": 1.0514978305236549e-05, + "loss": 0.2306, + "step": 15955 + }, + { + "epoch": 6.488816592110614, + "grad_norm": 4.412917673980808, + "learning_rate": 1.0513966458787366e-05, + "loss": 0.0688, + "step": 15956 + }, + { + "epoch": 6.4892232614884096, + "grad_norm": 0.41633534468969896, + "learning_rate": 1.0512954607062055e-05, + "loss": 0.0034, + "step": 15957 + }, + { + "epoch": 6.489629930866206, + "grad_norm": 4.735290184289939, + "learning_rate": 1.0511942750071009e-05, + "loss": 0.1785, + "step": 15958 + }, + { + "epoch": 6.490036600244002, + "grad_norm": 3.087614312271644, + "learning_rate": 1.0510930887824616e-05, + "loss": 0.0882, + "step": 15959 + }, + { + "epoch": 6.490443269621798, + "grad_norm": 7.541314209799532, + "learning_rate": 1.0509919020333257e-05, + "loss": 0.5381, + "step": 15960 + }, + { + "epoch": 6.4908499389995935, + "grad_norm": 5.926122121870481, + "learning_rate": 1.0508907147607322e-05, + "loss": 0.1094, + "step": 15961 + }, + { + "epoch": 6.491256608377389, + "grad_norm": 2.6381191459012188, + "learning_rate": 1.0507895269657197e-05, + "loss": 0.0468, + "step": 15962 + }, + { + "epoch": 6.491663277755185, + "grad_norm": 7.367596337772873, + "learning_rate": 1.0506883386493276e-05, + "loss": 0.1604, + "step": 15963 + }, + { + "epoch": 6.492069947132981, + "grad_norm": 0.07135985814231069, + "learning_rate": 1.0505871498125939e-05, + "loss": 0.0011, + "step": 15964 + }, + { + "epoch": 6.492476616510777, + "grad_norm": 0.03918249303963068, + "learning_rate": 1.0504859604565578e-05, + "loss": 0.0005, + "step": 15965 + }, + { + "epoch": 6.492883285888572, + "grad_norm": 0.128512313586104, + "learning_rate": 1.0503847705822582e-05, + "loss": 0.0028, + "step": 15966 + }, + { + "epoch": 6.493289955266368, + "grad_norm": 7.196113627253815, + "learning_rate": 1.050283580190733e-05, + "loss": 0.1852, + "step": 15967 + }, + { + "epoch": 6.493696624644164, + "grad_norm": 31.31607887236841, + "learning_rate": 1.0501823892830216e-05, + "loss": 1.086, + "step": 15968 + }, + { + "epoch": 6.4941032940219605, + "grad_norm": 0.4201057360552305, + "learning_rate": 1.0500811978601631e-05, + "loss": 0.0053, + "step": 15969 + }, + { + "epoch": 6.494509963399756, + "grad_norm": 4.07637143594757, + "learning_rate": 1.0499800059231954e-05, + "loss": 0.2304, + "step": 15970 + }, + { + "epoch": 6.494916632777552, + "grad_norm": 1.7767067471192204, + "learning_rate": 1.0498788134731584e-05, + "loss": 0.0198, + "step": 15971 + }, + { + "epoch": 6.495323302155348, + "grad_norm": 0.07572996187682653, + "learning_rate": 1.0497776205110899e-05, + "loss": 0.0008, + "step": 15972 + }, + { + "epoch": 6.495729971533144, + "grad_norm": 5.472546420979557, + "learning_rate": 1.0496764270380293e-05, + "loss": 0.1603, + "step": 15973 + }, + { + "epoch": 6.496136640910939, + "grad_norm": 4.657244817658392, + "learning_rate": 1.0495752330550148e-05, + "loss": 0.1498, + "step": 15974 + }, + { + "epoch": 6.496543310288735, + "grad_norm": 1.4184789431023606, + "learning_rate": 1.0494740385630857e-05, + "loss": 0.0177, + "step": 15975 + }, + { + "epoch": 6.496949979666531, + "grad_norm": 11.09470949859153, + 
"learning_rate": 1.0493728435632808e-05, + "loss": 0.1185, + "step": 15976 + }, + { + "epoch": 6.497356649044327, + "grad_norm": 1.4629272955420842, + "learning_rate": 1.0492716480566388e-05, + "loss": 0.0214, + "step": 15977 + }, + { + "epoch": 6.497763318422122, + "grad_norm": 9.498321666503838, + "learning_rate": 1.0491704520441985e-05, + "loss": 0.199, + "step": 15978 + }, + { + "epoch": 6.498169987799919, + "grad_norm": 3.9404717471786332, + "learning_rate": 1.049069255526999e-05, + "loss": 0.1114, + "step": 15979 + }, + { + "epoch": 6.498576657177715, + "grad_norm": 1.3533183489655327, + "learning_rate": 1.0489680585060784e-05, + "loss": 0.0324, + "step": 15980 + }, + { + "epoch": 6.498983326555511, + "grad_norm": 6.7853733883396865, + "learning_rate": 1.0488668609824764e-05, + "loss": 0.1436, + "step": 15981 + }, + { + "epoch": 6.499389995933306, + "grad_norm": 0.6230116107062287, + "learning_rate": 1.0487656629572316e-05, + "loss": 0.0044, + "step": 15982 + }, + { + "epoch": 6.499796665311102, + "grad_norm": 0.029752739085715428, + "learning_rate": 1.0486644644313823e-05, + "loss": 0.0003, + "step": 15983 + }, + { + "epoch": 6.500203334688898, + "grad_norm": 0.5381922946550665, + "learning_rate": 1.0485632654059681e-05, + "loss": 0.0115, + "step": 15984 + }, + { + "epoch": 6.500610004066694, + "grad_norm": 3.764043240631023, + "learning_rate": 1.0484620658820274e-05, + "loss": 0.0657, + "step": 15985 + }, + { + "epoch": 6.501016673444489, + "grad_norm": 0.81145753251982, + "learning_rate": 1.0483608658605995e-05, + "loss": 0.0116, + "step": 15986 + }, + { + "epoch": 6.501423342822285, + "grad_norm": 1.3921425972305492, + "learning_rate": 1.0482596653427224e-05, + "loss": 0.0118, + "step": 15987 + }, + { + "epoch": 6.501830012200081, + "grad_norm": 8.482864416099357, + "learning_rate": 1.048158464329436e-05, + "loss": 0.2464, + "step": 15988 + }, + { + "epoch": 6.502236681577877, + "grad_norm": 5.6771892399950135, + "learning_rate": 1.0480572628217787e-05, + "loss": 0.1382, + "step": 15989 + }, + { + "epoch": 6.502643350955673, + "grad_norm": 2.0298577496597314, + "learning_rate": 1.0479560608207892e-05, + "loss": 0.0352, + "step": 15990 + }, + { + "epoch": 6.503050020333469, + "grad_norm": 6.264303628808492, + "learning_rate": 1.047854858327507e-05, + "loss": 0.1384, + "step": 15991 + }, + { + "epoch": 6.503456689711265, + "grad_norm": 5.962569486088632, + "learning_rate": 1.04775365534297e-05, + "loss": 0.2947, + "step": 15992 + }, + { + "epoch": 6.503863359089061, + "grad_norm": 6.81345101989267, + "learning_rate": 1.0476524518682181e-05, + "loss": 0.2623, + "step": 15993 + }, + { + "epoch": 6.504270028466856, + "grad_norm": 0.29271370470345964, + "learning_rate": 1.0475512479042898e-05, + "loss": 0.0042, + "step": 15994 + }, + { + "epoch": 6.504676697844652, + "grad_norm": 4.85053279266597, + "learning_rate": 1.047450043452224e-05, + "loss": 0.1347, + "step": 15995 + }, + { + "epoch": 6.505083367222448, + "grad_norm": 4.123490941096889, + "learning_rate": 1.0473488385130592e-05, + "loss": 0.0837, + "step": 15996 + }, + { + "epoch": 6.505490036600244, + "grad_norm": 1.3722338405916616, + "learning_rate": 1.0472476330878352e-05, + "loss": 0.022, + "step": 15997 + }, + { + "epoch": 6.5058967059780395, + "grad_norm": 0.1882593223613795, + "learning_rate": 1.0471464271775904e-05, + "loss": 0.0024, + "step": 15998 + }, + { + "epoch": 6.506303375355836, + "grad_norm": 0.39853321586360624, + "learning_rate": 1.0470452207833636e-05, + "loss": 0.0074, + "step": 15999 + }, + { + "epoch": 
6.506710044733632, + "grad_norm": 5.45487382283978, + "learning_rate": 1.0469440139061942e-05, + "loss": 0.1088, + "step": 16000 + }, + { + "epoch": 6.507116714111428, + "grad_norm": 0.7366058625370523, + "learning_rate": 1.0468428065471208e-05, + "loss": 0.0089, + "step": 16001 + }, + { + "epoch": 6.507523383489223, + "grad_norm": 0.3849088774037232, + "learning_rate": 1.046741598707182e-05, + "loss": 0.0047, + "step": 16002 + }, + { + "epoch": 6.507930052867019, + "grad_norm": 0.15420599314080644, + "learning_rate": 1.0466403903874176e-05, + "loss": 0.0021, + "step": 16003 + }, + { + "epoch": 6.508336722244815, + "grad_norm": 7.399904762828989, + "learning_rate": 1.046539181588866e-05, + "loss": 0.2205, + "step": 16004 + }, + { + "epoch": 6.508743391622611, + "grad_norm": 3.1647189146347894, + "learning_rate": 1.0464379723125666e-05, + "loss": 0.0742, + "step": 16005 + }, + { + "epoch": 6.5091500610004065, + "grad_norm": 10.10534132584789, + "learning_rate": 1.0463367625595575e-05, + "loss": 0.2741, + "step": 16006 + }, + { + "epoch": 6.509556730378202, + "grad_norm": 1.5512813942362287, + "learning_rate": 1.0462355523308786e-05, + "loss": 0.0234, + "step": 16007 + }, + { + "epoch": 6.509963399755998, + "grad_norm": 6.823448285357088, + "learning_rate": 1.0461343416275684e-05, + "loss": 0.1276, + "step": 16008 + }, + { + "epoch": 6.510370069133794, + "grad_norm": 2.5142082417939235, + "learning_rate": 1.0460331304506658e-05, + "loss": 0.0639, + "step": 16009 + }, + { + "epoch": 6.5107767385115904, + "grad_norm": 4.550558535094971, + "learning_rate": 1.04593191880121e-05, + "loss": 0.096, + "step": 16010 + }, + { + "epoch": 6.511183407889386, + "grad_norm": 0.17252960151925303, + "learning_rate": 1.0458307066802402e-05, + "loss": 0.003, + "step": 16011 + }, + { + "epoch": 6.511590077267182, + "grad_norm": 5.935938120722958, + "learning_rate": 1.0457294940887948e-05, + "loss": 0.265, + "step": 16012 + }, + { + "epoch": 6.511996746644978, + "grad_norm": 8.295139664116547, + "learning_rate": 1.0456282810279133e-05, + "loss": 0.2088, + "step": 16013 + }, + { + "epoch": 6.5124034160227735, + "grad_norm": 7.819948427337551, + "learning_rate": 1.0455270674986343e-05, + "loss": 0.4623, + "step": 16014 + }, + { + "epoch": 6.512810085400569, + "grad_norm": 0.34401875570776475, + "learning_rate": 1.0454258535019971e-05, + "loss": 0.0043, + "step": 16015 + }, + { + "epoch": 6.513216754778365, + "grad_norm": 7.718256716226735, + "learning_rate": 1.045324639039041e-05, + "loss": 0.1669, + "step": 16016 + }, + { + "epoch": 6.513623424156161, + "grad_norm": 7.328717996851961, + "learning_rate": 1.0452234241108046e-05, + "loss": 0.2663, + "step": 16017 + }, + { + "epoch": 6.514030093533957, + "grad_norm": 4.195455333717495, + "learning_rate": 1.0451222087183268e-05, + "loss": 0.0764, + "step": 16018 + }, + { + "epoch": 6.514436762911753, + "grad_norm": 0.3329905360563261, + "learning_rate": 1.0450209928626467e-05, + "loss": 0.0054, + "step": 16019 + }, + { + "epoch": 6.514843432289549, + "grad_norm": 12.73571527249992, + "learning_rate": 1.0449197765448036e-05, + "loss": 0.5522, + "step": 16020 + }, + { + "epoch": 6.515250101667345, + "grad_norm": 5.992681852771372, + "learning_rate": 1.0448185597658365e-05, + "loss": 0.1575, + "step": 16021 + }, + { + "epoch": 6.5156567710451405, + "grad_norm": 2.60161839906963, + "learning_rate": 1.044717342526784e-05, + "loss": 0.0156, + "step": 16022 + }, + { + "epoch": 6.516063440422936, + "grad_norm": 0.7188189873749865, + "learning_rate": 1.044616124828686e-05, + 
"loss": 0.0106, + "step": 16023 + }, + { + "epoch": 6.516470109800732, + "grad_norm": 4.77586890323825, + "learning_rate": 1.0445149066725807e-05, + "loss": 0.0934, + "step": 16024 + }, + { + "epoch": 6.516876779178528, + "grad_norm": 0.10765745300252832, + "learning_rate": 1.0444136880595073e-05, + "loss": 0.0019, + "step": 16025 + }, + { + "epoch": 6.517283448556324, + "grad_norm": 2.9042680402782106, + "learning_rate": 1.0443124689905053e-05, + "loss": 0.0922, + "step": 16026 + }, + { + "epoch": 6.517690117934119, + "grad_norm": 1.8224976872007501, + "learning_rate": 1.0442112494666134e-05, + "loss": 0.0298, + "step": 16027 + }, + { + "epoch": 6.518096787311915, + "grad_norm": 1.6783955682379568, + "learning_rate": 1.0441100294888706e-05, + "loss": 0.029, + "step": 16028 + }, + { + "epoch": 6.518503456689711, + "grad_norm": 0.09274311144315414, + "learning_rate": 1.0440088090583164e-05, + "loss": 0.0011, + "step": 16029 + }, + { + "epoch": 6.518910126067507, + "grad_norm": 0.9638154326414808, + "learning_rate": 1.0439075881759897e-05, + "loss": 0.0195, + "step": 16030 + }, + { + "epoch": 6.519316795445303, + "grad_norm": 0.31935292860967757, + "learning_rate": 1.0438063668429294e-05, + "loss": 0.0076, + "step": 16031 + }, + { + "epoch": 6.519723464823099, + "grad_norm": 6.909949640441281, + "learning_rate": 1.0437051450601748e-05, + "loss": 0.1769, + "step": 16032 + }, + { + "epoch": 6.520130134200895, + "grad_norm": 1.0410077448838757, + "learning_rate": 1.0436039228287647e-05, + "loss": 0.0117, + "step": 16033 + }, + { + "epoch": 6.520536803578691, + "grad_norm": 5.403614032069621, + "learning_rate": 1.0435027001497386e-05, + "loss": 0.1228, + "step": 16034 + }, + { + "epoch": 6.520943472956486, + "grad_norm": 0.039722599776437606, + "learning_rate": 1.0434014770241349e-05, + "loss": 0.0005, + "step": 16035 + }, + { + "epoch": 6.521350142334282, + "grad_norm": 1.9039376670164179, + "learning_rate": 1.0433002534529937e-05, + "loss": 0.0255, + "step": 16036 + }, + { + "epoch": 6.521756811712078, + "grad_norm": 0.31613414881155383, + "learning_rate": 1.0431990294373535e-05, + "loss": 0.0048, + "step": 16037 + }, + { + "epoch": 6.522163481089874, + "grad_norm": 0.09566544567298252, + "learning_rate": 1.0430978049782534e-05, + "loss": 0.0018, + "step": 16038 + }, + { + "epoch": 6.522570150467669, + "grad_norm": 0.633270778228847, + "learning_rate": 1.0429965800767328e-05, + "loss": 0.0101, + "step": 16039 + }, + { + "epoch": 6.522976819845466, + "grad_norm": 10.04729094833593, + "learning_rate": 1.0428953547338309e-05, + "loss": 0.3197, + "step": 16040 + }, + { + "epoch": 6.523383489223262, + "grad_norm": 3.8503872057998336, + "learning_rate": 1.042794128950586e-05, + "loss": 0.0571, + "step": 16041 + }, + { + "epoch": 6.523790158601058, + "grad_norm": 0.47545834630774325, + "learning_rate": 1.0426929027280383e-05, + "loss": 0.0079, + "step": 16042 + }, + { + "epoch": 6.524196827978853, + "grad_norm": 3.1368731680644837, + "learning_rate": 1.0425916760672263e-05, + "loss": 0.0652, + "step": 16043 + }, + { + "epoch": 6.524603497356649, + "grad_norm": 6.212677099073996, + "learning_rate": 1.0424904489691892e-05, + "loss": 0.3949, + "step": 16044 + }, + { + "epoch": 6.525010166734445, + "grad_norm": 5.46851043847018, + "learning_rate": 1.0423892214349663e-05, + "loss": 0.3619, + "step": 16045 + }, + { + "epoch": 6.525416836112241, + "grad_norm": 7.129445414730277, + "learning_rate": 1.042287993465597e-05, + "loss": 0.1682, + "step": 16046 + }, + { + "epoch": 6.525823505490036, + "grad_norm": 
9.358876964518034, + "learning_rate": 1.0421867650621196e-05, + "loss": 0.2203, + "step": 16047 + }, + { + "epoch": 6.526230174867832, + "grad_norm": 2.9680704975172145, + "learning_rate": 1.0420855362255741e-05, + "loss": 0.0442, + "step": 16048 + }, + { + "epoch": 6.526636844245628, + "grad_norm": 2.442926613529229, + "learning_rate": 1.0419843069569996e-05, + "loss": 0.0354, + "step": 16049 + }, + { + "epoch": 6.527043513623424, + "grad_norm": 1.5567936823339463, + "learning_rate": 1.0418830772574346e-05, + "loss": 0.0201, + "step": 16050 + }, + { + "epoch": 6.52745018300122, + "grad_norm": 1.6808094362878818, + "learning_rate": 1.0417818471279192e-05, + "loss": 0.0265, + "step": 16051 + }, + { + "epoch": 6.527856852379016, + "grad_norm": 0.07042859638507408, + "learning_rate": 1.041680616569492e-05, + "loss": 0.001, + "step": 16052 + }, + { + "epoch": 6.528263521756812, + "grad_norm": 6.628846321912659, + "learning_rate": 1.0415793855831924e-05, + "loss": 0.2025, + "step": 16053 + }, + { + "epoch": 6.528670191134608, + "grad_norm": 11.361913712827608, + "learning_rate": 1.0414781541700589e-05, + "loss": 0.3846, + "step": 16054 + }, + { + "epoch": 6.529076860512403, + "grad_norm": 0.43351873906212124, + "learning_rate": 1.041376922331132e-05, + "loss": 0.0107, + "step": 16055 + }, + { + "epoch": 6.529483529890199, + "grad_norm": 1.1002936899223215, + "learning_rate": 1.0412756900674496e-05, + "loss": 0.0141, + "step": 16056 + }, + { + "epoch": 6.529890199267995, + "grad_norm": 0.44573638210361827, + "learning_rate": 1.0411744573800518e-05, + "loss": 0.0028, + "step": 16057 + }, + { + "epoch": 6.530296868645791, + "grad_norm": 6.670941477935351, + "learning_rate": 1.0410732242699772e-05, + "loss": 0.1152, + "step": 16058 + }, + { + "epoch": 6.5307035380235865, + "grad_norm": 4.877979769368486, + "learning_rate": 1.0409719907382656e-05, + "loss": 0.1816, + "step": 16059 + }, + { + "epoch": 6.531110207401383, + "grad_norm": 9.018322760218114, + "learning_rate": 1.0408707567859558e-05, + "loss": 0.1328, + "step": 16060 + }, + { + "epoch": 6.531516876779179, + "grad_norm": 5.367134887667881, + "learning_rate": 1.040769522414087e-05, + "loss": 0.076, + "step": 16061 + }, + { + "epoch": 6.531923546156975, + "grad_norm": 12.09067869738942, + "learning_rate": 1.0406682876236987e-05, + "loss": 0.2544, + "step": 16062 + }, + { + "epoch": 6.5323302155347704, + "grad_norm": 8.439110382145888, + "learning_rate": 1.04056705241583e-05, + "loss": 0.2131, + "step": 16063 + }, + { + "epoch": 6.532736884912566, + "grad_norm": 4.201288109931116, + "learning_rate": 1.0404658167915199e-05, + "loss": 0.0713, + "step": 16064 + }, + { + "epoch": 6.533143554290362, + "grad_norm": 1.528305228927481, + "learning_rate": 1.0403645807518077e-05, + "loss": 0.0172, + "step": 16065 + }, + { + "epoch": 6.533550223668158, + "grad_norm": 0.04404233005622113, + "learning_rate": 1.040263344297733e-05, + "loss": 0.0006, + "step": 16066 + }, + { + "epoch": 6.5339568930459535, + "grad_norm": 4.900271732983176, + "learning_rate": 1.0401621074303348e-05, + "loss": 0.1748, + "step": 16067 + }, + { + "epoch": 6.534363562423749, + "grad_norm": 0.19123010020305728, + "learning_rate": 1.0400608701506522e-05, + "loss": 0.0031, + "step": 16068 + }, + { + "epoch": 6.534770231801545, + "grad_norm": 0.18074878982523576, + "learning_rate": 1.0399596324597247e-05, + "loss": 0.0029, + "step": 16069 + }, + { + "epoch": 6.535176901179341, + "grad_norm": 6.7751508311847815, + "learning_rate": 1.0398583943585914e-05, + "loss": 0.1491, + "step": 
16070 + }, + { + "epoch": 6.535583570557137, + "grad_norm": 5.867945316657326, + "learning_rate": 1.0397571558482918e-05, + "loss": 0.0855, + "step": 16071 + }, + { + "epoch": 6.535990239934933, + "grad_norm": 0.09139097217940054, + "learning_rate": 1.0396559169298648e-05, + "loss": 0.0011, + "step": 16072 + }, + { + "epoch": 6.536396909312729, + "grad_norm": 2.5904345523561614, + "learning_rate": 1.0395546776043499e-05, + "loss": 0.0458, + "step": 16073 + }, + { + "epoch": 6.536803578690525, + "grad_norm": 2.4738182945533693, + "learning_rate": 1.0394534378727863e-05, + "loss": 0.0448, + "step": 16074 + }, + { + "epoch": 6.5372102480683205, + "grad_norm": 0.3359249777664008, + "learning_rate": 1.0393521977362135e-05, + "loss": 0.0038, + "step": 16075 + }, + { + "epoch": 6.537616917446116, + "grad_norm": 0.3840875423471504, + "learning_rate": 1.0392509571956706e-05, + "loss": 0.0065, + "step": 16076 + }, + { + "epoch": 6.538023586823912, + "grad_norm": 0.004743964506538568, + "learning_rate": 1.0391497162521964e-05, + "loss": 0.0001, + "step": 16077 + }, + { + "epoch": 6.538430256201708, + "grad_norm": 0.12817832735231424, + "learning_rate": 1.0390484749068311e-05, + "loss": 0.0015, + "step": 16078 + }, + { + "epoch": 6.538836925579504, + "grad_norm": 0.03322562728148185, + "learning_rate": 1.0389472331606134e-05, + "loss": 0.0005, + "step": 16079 + }, + { + "epoch": 6.539243594957299, + "grad_norm": 0.2437855893134827, + "learning_rate": 1.0388459910145828e-05, + "loss": 0.0029, + "step": 16080 + }, + { + "epoch": 6.539650264335096, + "grad_norm": 2.211266069673596, + "learning_rate": 1.0387447484697785e-05, + "loss": 0.0197, + "step": 16081 + }, + { + "epoch": 6.540056933712892, + "grad_norm": 0.4071101071531791, + "learning_rate": 1.03864350552724e-05, + "loss": 0.0056, + "step": 16082 + }, + { + "epoch": 6.5404636030906875, + "grad_norm": 1.2503909919790965, + "learning_rate": 1.038542262188006e-05, + "loss": 0.0127, + "step": 16083 + }, + { + "epoch": 6.540870272468483, + "grad_norm": 9.462003603718362, + "learning_rate": 1.0384410184531168e-05, + "loss": 0.3141, + "step": 16084 + }, + { + "epoch": 6.541276941846279, + "grad_norm": 0.8416473210038833, + "learning_rate": 1.0383397743236112e-05, + "loss": 0.0121, + "step": 16085 + }, + { + "epoch": 6.541683611224075, + "grad_norm": 2.364722442735136, + "learning_rate": 1.0382385298005283e-05, + "loss": 0.0477, + "step": 16086 + }, + { + "epoch": 6.542090280601871, + "grad_norm": 0.531792687135381, + "learning_rate": 1.0381372848849076e-05, + "loss": 0.0091, + "step": 16087 + }, + { + "epoch": 6.542496949979666, + "grad_norm": 4.507577813772937, + "learning_rate": 1.0380360395777886e-05, + "loss": 0.046, + "step": 16088 + }, + { + "epoch": 6.542903619357462, + "grad_norm": 5.590743090416939, + "learning_rate": 1.0379347938802107e-05, + "loss": 0.0897, + "step": 16089 + }, + { + "epoch": 6.543310288735258, + "grad_norm": 11.516758037614608, + "learning_rate": 1.037833547793213e-05, + "loss": 0.9471, + "step": 16090 + }, + { + "epoch": 6.543716958113054, + "grad_norm": 12.89305323705017, + "learning_rate": 1.0377323013178346e-05, + "loss": 0.1493, + "step": 16091 + }, + { + "epoch": 6.54412362749085, + "grad_norm": 5.227693634778392, + "learning_rate": 1.0376310544551151e-05, + "loss": 0.17, + "step": 16092 + }, + { + "epoch": 6.544530296868646, + "grad_norm": 27.40957483112878, + "learning_rate": 1.0375298072060942e-05, + "loss": 0.517, + "step": 16093 + }, + { + "epoch": 6.544936966246442, + "grad_norm": 0.48267696450021413, + 
"learning_rate": 1.0374285595718108e-05, + "loss": 0.0073, + "step": 16094 + }, + { + "epoch": 6.545343635624238, + "grad_norm": 0.14699902942678222, + "learning_rate": 1.0373273115533043e-05, + "loss": 0.0018, + "step": 16095 + }, + { + "epoch": 6.545750305002033, + "grad_norm": 0.5341090558756758, + "learning_rate": 1.0372260631516142e-05, + "loss": 0.0068, + "step": 16096 + }, + { + "epoch": 6.546156974379829, + "grad_norm": 0.13763828331891295, + "learning_rate": 1.0371248143677799e-05, + "loss": 0.0018, + "step": 16097 + }, + { + "epoch": 6.546563643757625, + "grad_norm": 7.3505634629590295, + "learning_rate": 1.0370235652028406e-05, + "loss": 0.2991, + "step": 16098 + }, + { + "epoch": 6.546970313135421, + "grad_norm": 7.566506783689469, + "learning_rate": 1.0369223156578359e-05, + "loss": 0.3517, + "step": 16099 + }, + { + "epoch": 6.547376982513216, + "grad_norm": 2.7995953459991636, + "learning_rate": 1.036821065733805e-05, + "loss": 0.0719, + "step": 16100 + }, + { + "epoch": 6.547783651891013, + "grad_norm": 0.41038031773811673, + "learning_rate": 1.0367198154317874e-05, + "loss": 0.0075, + "step": 16101 + }, + { + "epoch": 6.548190321268809, + "grad_norm": 2.020497666087332, + "learning_rate": 1.0366185647528223e-05, + "loss": 0.0389, + "step": 16102 + }, + { + "epoch": 6.548596990646605, + "grad_norm": 4.354141568544173, + "learning_rate": 1.0365173136979491e-05, + "loss": 0.1938, + "step": 16103 + }, + { + "epoch": 6.5490036600244, + "grad_norm": 1.228776786147542, + "learning_rate": 1.0364160622682075e-05, + "loss": 0.0158, + "step": 16104 + }, + { + "epoch": 6.549410329402196, + "grad_norm": 4.054925676151151, + "learning_rate": 1.0363148104646366e-05, + "loss": 0.0533, + "step": 16105 + }, + { + "epoch": 6.549816998779992, + "grad_norm": 0.2534274653203956, + "learning_rate": 1.0362135582882759e-05, + "loss": 0.0034, + "step": 16106 + }, + { + "epoch": 6.550223668157788, + "grad_norm": 2.8187841182215494, + "learning_rate": 1.0361123057401647e-05, + "loss": 0.0677, + "step": 16107 + }, + { + "epoch": 6.550630337535583, + "grad_norm": 0.14944900593284688, + "learning_rate": 1.0360110528213425e-05, + "loss": 0.0018, + "step": 16108 + }, + { + "epoch": 6.551037006913379, + "grad_norm": 5.600943451689193, + "learning_rate": 1.0359097995328491e-05, + "loss": 0.0836, + "step": 16109 + }, + { + "epoch": 6.551443676291175, + "grad_norm": 5.2054554976923395, + "learning_rate": 1.0358085458757233e-05, + "loss": 0.1047, + "step": 16110 + }, + { + "epoch": 6.551850345668971, + "grad_norm": 0.01925332421109419, + "learning_rate": 1.0357072918510047e-05, + "loss": 0.0002, + "step": 16111 + }, + { + "epoch": 6.5522570150467665, + "grad_norm": 10.681995831999089, + "learning_rate": 1.0356060374597327e-05, + "loss": 0.4391, + "step": 16112 + }, + { + "epoch": 6.552663684424563, + "grad_norm": 6.21371195375286, + "learning_rate": 1.035504782702947e-05, + "loss": 0.1811, + "step": 16113 + }, + { + "epoch": 6.553070353802359, + "grad_norm": 1.6290249901134, + "learning_rate": 1.0354035275816867e-05, + "loss": 0.0339, + "step": 16114 + }, + { + "epoch": 6.553477023180155, + "grad_norm": 4.1473675578783205, + "learning_rate": 1.0353022720969914e-05, + "loss": 0.1486, + "step": 16115 + }, + { + "epoch": 6.5538836925579504, + "grad_norm": 2.3874721237513117, + "learning_rate": 1.0352010162499004e-05, + "loss": 0.0744, + "step": 16116 + }, + { + "epoch": 6.554290361935746, + "grad_norm": 0.07003832904529574, + "learning_rate": 1.0350997600414536e-05, + "loss": 0.0007, + "step": 16117 + }, + { + 
"epoch": 6.554697031313542, + "grad_norm": 2.218295413499661, + "learning_rate": 1.0349985034726897e-05, + "loss": 0.0329, + "step": 16118 + }, + { + "epoch": 6.555103700691338, + "grad_norm": 4.851413225213289, + "learning_rate": 1.0348972465446487e-05, + "loss": 0.0866, + "step": 16119 + }, + { + "epoch": 6.5555103700691335, + "grad_norm": 0.0853708928347285, + "learning_rate": 1.03479598925837e-05, + "loss": 0.0016, + "step": 16120 + }, + { + "epoch": 6.555917039446929, + "grad_norm": 0.26624887794097385, + "learning_rate": 1.034694731614893e-05, + "loss": 0.0054, + "step": 16121 + }, + { + "epoch": 6.556323708824726, + "grad_norm": 1.8080518767645333, + "learning_rate": 1.034593473615257e-05, + "loss": 0.0118, + "step": 16122 + }, + { + "epoch": 6.556730378202522, + "grad_norm": 7.208830100056934, + "learning_rate": 1.0344922152605015e-05, + "loss": 0.1647, + "step": 16123 + }, + { + "epoch": 6.5571370475803175, + "grad_norm": 0.6802840444443958, + "learning_rate": 1.0343909565516664e-05, + "loss": 0.0129, + "step": 16124 + }, + { + "epoch": 6.557543716958113, + "grad_norm": 5.092639689458509, + "learning_rate": 1.0342896974897903e-05, + "loss": 0.0391, + "step": 16125 + }, + { + "epoch": 6.557950386335909, + "grad_norm": 0.006136017823506233, + "learning_rate": 1.0341884380759134e-05, + "loss": 0.0001, + "step": 16126 + }, + { + "epoch": 6.558357055713705, + "grad_norm": 2.381224277623285, + "learning_rate": 1.034087178311075e-05, + "loss": 0.0622, + "step": 16127 + }, + { + "epoch": 6.5587637250915005, + "grad_norm": 1.5450763908452112, + "learning_rate": 1.0339859181963145e-05, + "loss": 0.0246, + "step": 16128 + }, + { + "epoch": 6.559170394469296, + "grad_norm": 5.032320903518926, + "learning_rate": 1.0338846577326717e-05, + "loss": 0.2181, + "step": 16129 + }, + { + "epoch": 6.559577063847092, + "grad_norm": 3.3168092834213367, + "learning_rate": 1.0337833969211858e-05, + "loss": 0.0694, + "step": 16130 + }, + { + "epoch": 6.559983733224888, + "grad_norm": 0.02429717346226416, + "learning_rate": 1.0336821357628959e-05, + "loss": 0.0004, + "step": 16131 + }, + { + "epoch": 6.560390402602684, + "grad_norm": 5.058915636507113, + "learning_rate": 1.0335808742588425e-05, + "loss": 0.1649, + "step": 16132 + }, + { + "epoch": 6.56079707198048, + "grad_norm": 0.3282348343968587, + "learning_rate": 1.033479612410064e-05, + "loss": 0.003, + "step": 16133 + }, + { + "epoch": 6.561203741358276, + "grad_norm": 1.5277660361105796, + "learning_rate": 1.033378350217601e-05, + "loss": 0.0202, + "step": 16134 + }, + { + "epoch": 6.561610410736072, + "grad_norm": 0.34122827202030054, + "learning_rate": 1.0332770876824917e-05, + "loss": 0.0043, + "step": 16135 + }, + { + "epoch": 6.5620170801138675, + "grad_norm": 5.1339574303319395, + "learning_rate": 1.0331758248057771e-05, + "loss": 0.0762, + "step": 16136 + }, + { + "epoch": 6.562423749491663, + "grad_norm": 5.963954720324026, + "learning_rate": 1.0330745615884956e-05, + "loss": 0.1777, + "step": 16137 + }, + { + "epoch": 6.562830418869459, + "grad_norm": 0.01588627619819985, + "learning_rate": 1.032973298031687e-05, + "loss": 0.0003, + "step": 16138 + }, + { + "epoch": 6.563237088247255, + "grad_norm": 3.650313842279974, + "learning_rate": 1.032872034136391e-05, + "loss": 0.0832, + "step": 16139 + }, + { + "epoch": 6.563643757625051, + "grad_norm": 2.5584709323641897, + "learning_rate": 1.0327707699036471e-05, + "loss": 0.0194, + "step": 16140 + }, + { + "epoch": 6.564050427002846, + "grad_norm": 0.5722874166941917, + "learning_rate": 
1.0326695053344946e-05, + "loss": 0.0092, + "step": 16141 + }, + { + "epoch": 6.564457096380643, + "grad_norm": 2.5853769872677783, + "learning_rate": 1.0325682404299735e-05, + "loss": 0.043, + "step": 16142 + }, + { + "epoch": 6.564863765758439, + "grad_norm": 5.151390949935705, + "learning_rate": 1.0324669751911227e-05, + "loss": 0.1122, + "step": 16143 + }, + { + "epoch": 6.5652704351362345, + "grad_norm": 0.7851201278276524, + "learning_rate": 1.032365709618982e-05, + "loss": 0.0119, + "step": 16144 + }, + { + "epoch": 6.56567710451403, + "grad_norm": 7.511119625223708, + "learning_rate": 1.0322644437145912e-05, + "loss": 0.0734, + "step": 16145 + }, + { + "epoch": 6.566083773891826, + "grad_norm": 6.805931907417121, + "learning_rate": 1.0321631774789898e-05, + "loss": 0.2695, + "step": 16146 + }, + { + "epoch": 6.566490443269622, + "grad_norm": 4.045368029151188, + "learning_rate": 1.0320619109132168e-05, + "loss": 0.0585, + "step": 16147 + }, + { + "epoch": 6.566897112647418, + "grad_norm": 0.5561488415605297, + "learning_rate": 1.0319606440183123e-05, + "loss": 0.018, + "step": 16148 + }, + { + "epoch": 6.567303782025213, + "grad_norm": 15.018096398916073, + "learning_rate": 1.031859376795316e-05, + "loss": 0.2381, + "step": 16149 + }, + { + "epoch": 6.567710451403009, + "grad_norm": 2.9574497195314264, + "learning_rate": 1.0317581092452666e-05, + "loss": 0.0616, + "step": 16150 + }, + { + "epoch": 6.568117120780805, + "grad_norm": 0.15118210515400077, + "learning_rate": 1.0316568413692048e-05, + "loss": 0.003, + "step": 16151 + }, + { + "epoch": 6.568523790158601, + "grad_norm": 1.0340326830488011, + "learning_rate": 1.0315555731681692e-05, + "loss": 0.0126, + "step": 16152 + }, + { + "epoch": 6.568930459536396, + "grad_norm": 0.03273844373845569, + "learning_rate": 1.0314543046431997e-05, + "loss": 0.0005, + "step": 16153 + }, + { + "epoch": 6.569337128914193, + "grad_norm": 11.968427519276057, + "learning_rate": 1.0313530357953363e-05, + "loss": 0.3997, + "step": 16154 + }, + { + "epoch": 6.569743798291989, + "grad_norm": 0.8470002273182633, + "learning_rate": 1.031251766625618e-05, + "loss": 0.0123, + "step": 16155 + }, + { + "epoch": 6.570150467669785, + "grad_norm": 2.1830541934354786, + "learning_rate": 1.0311504971350847e-05, + "loss": 0.0441, + "step": 16156 + }, + { + "epoch": 6.57055713704758, + "grad_norm": 0.15566968986714735, + "learning_rate": 1.0310492273247757e-05, + "loss": 0.0025, + "step": 16157 + }, + { + "epoch": 6.570963806425376, + "grad_norm": 0.49232144037602965, + "learning_rate": 1.0309479571957308e-05, + "loss": 0.0061, + "step": 16158 + }, + { + "epoch": 6.571370475803172, + "grad_norm": 10.625810132275955, + "learning_rate": 1.0308466867489897e-05, + "loss": 0.253, + "step": 16159 + }, + { + "epoch": 6.571777145180968, + "grad_norm": 2.949857882594698, + "learning_rate": 1.0307454159855913e-05, + "loss": 0.0581, + "step": 16160 + }, + { + "epoch": 6.572183814558763, + "grad_norm": 0.038632881229548506, + "learning_rate": 1.0306441449065765e-05, + "loss": 0.0005, + "step": 16161 + }, + { + "epoch": 6.572590483936559, + "grad_norm": 6.907968789973548, + "learning_rate": 1.0305428735129837e-05, + "loss": 0.1644, + "step": 16162 + }, + { + "epoch": 6.572997153314356, + "grad_norm": 1.1516031423766162, + "learning_rate": 1.030441601805853e-05, + "loss": 0.0171, + "step": 16163 + }, + { + "epoch": 6.573403822692152, + "grad_norm": 5.228003676616463, + "learning_rate": 1.0303403297862238e-05, + "loss": 0.1204, + "step": 16164 + }, + { + "epoch": 
6.573810492069947, + "grad_norm": 12.232313714789472, + "learning_rate": 1.0302390574551361e-05, + "loss": 0.751, + "step": 16165 + }, + { + "epoch": 6.574217161447743, + "grad_norm": 0.03407090456468013, + "learning_rate": 1.0301377848136289e-05, + "loss": 0.0004, + "step": 16166 + }, + { + "epoch": 6.574623830825539, + "grad_norm": 0.3819493181126941, + "learning_rate": 1.0300365118627427e-05, + "loss": 0.0055, + "step": 16167 + }, + { + "epoch": 6.575030500203335, + "grad_norm": 6.300834426430814, + "learning_rate": 1.0299352386035161e-05, + "loss": 0.0655, + "step": 16168 + }, + { + "epoch": 6.5754371695811304, + "grad_norm": 0.01005688661522706, + "learning_rate": 1.0298339650369896e-05, + "loss": 0.0002, + "step": 16169 + }, + { + "epoch": 6.575843838958926, + "grad_norm": 6.328762320815036, + "learning_rate": 1.029732691164202e-05, + "loss": 0.1486, + "step": 16170 + }, + { + "epoch": 6.576250508336722, + "grad_norm": 2.204075481655987, + "learning_rate": 1.0296314169861939e-05, + "loss": 0.019, + "step": 16171 + }, + { + "epoch": 6.576657177714518, + "grad_norm": 0.016815348836295443, + "learning_rate": 1.029530142504004e-05, + "loss": 0.0003, + "step": 16172 + }, + { + "epoch": 6.5770638470923135, + "grad_norm": 2.6245633539842457, + "learning_rate": 1.0294288677186724e-05, + "loss": 0.0369, + "step": 16173 + }, + { + "epoch": 6.57747051647011, + "grad_norm": 8.573721729528076, + "learning_rate": 1.0293275926312387e-05, + "loss": 0.1905, + "step": 16174 + }, + { + "epoch": 6.577877185847906, + "grad_norm": 18.69834195948824, + "learning_rate": 1.0292263172427426e-05, + "loss": 0.2508, + "step": 16175 + }, + { + "epoch": 6.578283855225702, + "grad_norm": 7.778578746882255, + "learning_rate": 1.0291250415542233e-05, + "loss": 0.238, + "step": 16176 + }, + { + "epoch": 6.5786905246034975, + "grad_norm": 0.4704528362409902, + "learning_rate": 1.029023765566721e-05, + "loss": 0.0071, + "step": 16177 + }, + { + "epoch": 6.579097193981293, + "grad_norm": 6.042179336288319, + "learning_rate": 1.0289224892812752e-05, + "loss": 0.1086, + "step": 16178 + }, + { + "epoch": 6.579503863359089, + "grad_norm": 3.74212220919938, + "learning_rate": 1.0288212126989257e-05, + "loss": 0.0953, + "step": 16179 + }, + { + "epoch": 6.579910532736885, + "grad_norm": 0.30259162400025885, + "learning_rate": 1.0287199358207114e-05, + "loss": 0.0042, + "step": 16180 + }, + { + "epoch": 6.5803172021146805, + "grad_norm": 12.227072709074825, + "learning_rate": 1.0286186586476729e-05, + "loss": 0.3663, + "step": 16181 + }, + { + "epoch": 6.580723871492476, + "grad_norm": 0.7400578222230473, + "learning_rate": 1.0285173811808493e-05, + "loss": 0.0085, + "step": 16182 + }, + { + "epoch": 6.581130540870273, + "grad_norm": 10.04015445670743, + "learning_rate": 1.0284161034212805e-05, + "loss": 0.4082, + "step": 16183 + }, + { + "epoch": 6.581537210248069, + "grad_norm": 2.9523701108984386, + "learning_rate": 1.0283148253700062e-05, + "loss": 0.0382, + "step": 16184 + }, + { + "epoch": 6.5819438796258645, + "grad_norm": 4.112661212224605, + "learning_rate": 1.0282135470280658e-05, + "loss": 0.1844, + "step": 16185 + }, + { + "epoch": 6.58235054900366, + "grad_norm": 1.1350877871798477, + "learning_rate": 1.028112268396499e-05, + "loss": 0.0172, + "step": 16186 + }, + { + "epoch": 6.582757218381456, + "grad_norm": 12.699232689888849, + "learning_rate": 1.0280109894763458e-05, + "loss": 0.2007, + "step": 16187 + }, + { + "epoch": 6.583163887759252, + "grad_norm": 6.800939575074576, + "learning_rate": 
1.027909710268646e-05, + "loss": 0.0551, + "step": 16188 + }, + { + "epoch": 6.5835705571370475, + "grad_norm": 1.4948893467825792, + "learning_rate": 1.0278084307744385e-05, + "loss": 0.0135, + "step": 16189 + }, + { + "epoch": 6.583977226514843, + "grad_norm": 0.3366949929502446, + "learning_rate": 1.0277071509947639e-05, + "loss": 0.0035, + "step": 16190 + }, + { + "epoch": 6.584383895892639, + "grad_norm": 0.06553169545765182, + "learning_rate": 1.0276058709306613e-05, + "loss": 0.0008, + "step": 16191 + }, + { + "epoch": 6.584790565270435, + "grad_norm": 4.120357760024205, + "learning_rate": 1.0275045905831702e-05, + "loss": 0.0589, + "step": 16192 + }, + { + "epoch": 6.585197234648231, + "grad_norm": 1.715854520373958, + "learning_rate": 1.0274033099533309e-05, + "loss": 0.0285, + "step": 16193 + }, + { + "epoch": 6.585603904026026, + "grad_norm": 0.1659654069779465, + "learning_rate": 1.0273020290421829e-05, + "loss": 0.0017, + "step": 16194 + }, + { + "epoch": 6.586010573403823, + "grad_norm": 3.755220085357183, + "learning_rate": 1.0272007478507657e-05, + "loss": 0.1199, + "step": 16195 + }, + { + "epoch": 6.586417242781619, + "grad_norm": 7.478926079246688, + "learning_rate": 1.0270994663801192e-05, + "loss": 0.1653, + "step": 16196 + }, + { + "epoch": 6.5868239121594145, + "grad_norm": 4.144725467605701, + "learning_rate": 1.0269981846312832e-05, + "loss": 0.0878, + "step": 16197 + }, + { + "epoch": 6.58723058153721, + "grad_norm": 3.7089310291655093, + "learning_rate": 1.0268969026052968e-05, + "loss": 0.1224, + "step": 16198 + }, + { + "epoch": 6.587637250915006, + "grad_norm": 35.94490542862206, + "learning_rate": 1.0267956203032004e-05, + "loss": 0.2604, + "step": 16199 + }, + { + "epoch": 6.588043920292802, + "grad_norm": 0.22910210370623244, + "learning_rate": 1.0266943377260337e-05, + "loss": 0.0033, + "step": 16200 + }, + { + "epoch": 6.588450589670598, + "grad_norm": 5.484174192181227, + "learning_rate": 1.026593054874836e-05, + "loss": 0.0512, + "step": 16201 + }, + { + "epoch": 6.588857259048393, + "grad_norm": 0.26490983560705844, + "learning_rate": 1.0264917717506468e-05, + "loss": 0.0021, + "step": 16202 + }, + { + "epoch": 6.589263928426189, + "grad_norm": 5.01080005323923, + "learning_rate": 1.0263904883545068e-05, + "loss": 0.1466, + "step": 16203 + }, + { + "epoch": 6.589670597803986, + "grad_norm": 4.037203707432619, + "learning_rate": 1.026289204687455e-05, + "loss": 0.0822, + "step": 16204 + }, + { + "epoch": 6.5900772671817816, + "grad_norm": 0.24383402540957347, + "learning_rate": 1.0261879207505311e-05, + "loss": 0.0032, + "step": 16205 + }, + { + "epoch": 6.590483936559577, + "grad_norm": 0.3800957746358337, + "learning_rate": 1.0260866365447753e-05, + "loss": 0.0032, + "step": 16206 + }, + { + "epoch": 6.590890605937373, + "grad_norm": 13.82611218798057, + "learning_rate": 1.0259853520712269e-05, + "loss": 0.4898, + "step": 16207 + }, + { + "epoch": 6.591297275315169, + "grad_norm": 4.068081978631822, + "learning_rate": 1.0258840673309256e-05, + "loss": 0.054, + "step": 16208 + }, + { + "epoch": 6.591703944692965, + "grad_norm": 0.2129544585665392, + "learning_rate": 1.0257827823249115e-05, + "loss": 0.0023, + "step": 16209 + }, + { + "epoch": 6.59211061407076, + "grad_norm": 12.012606636354556, + "learning_rate": 1.0256814970542243e-05, + "loss": 0.3906, + "step": 16210 + }, + { + "epoch": 6.592517283448556, + "grad_norm": 5.779880817685492, + "learning_rate": 1.0255802115199034e-05, + "loss": 0.2398, + "step": 16211 + }, + { + "epoch": 
6.592923952826352, + "grad_norm": 1.7918022138217904, + "learning_rate": 1.025478925722989e-05, + "loss": 0.0434, + "step": 16212 + }, + { + "epoch": 6.593330622204148, + "grad_norm": 2.4049383399192488, + "learning_rate": 1.0253776396645205e-05, + "loss": 0.0318, + "step": 16213 + }, + { + "epoch": 6.593737291581943, + "grad_norm": 0.2916786488841749, + "learning_rate": 1.0252763533455377e-05, + "loss": 0.0048, + "step": 16214 + }, + { + "epoch": 6.59414396095974, + "grad_norm": 0.37340542889317396, + "learning_rate": 1.0251750667670804e-05, + "loss": 0.0055, + "step": 16215 + }, + { + "epoch": 6.594550630337536, + "grad_norm": 5.892278047228564, + "learning_rate": 1.0250737799301885e-05, + "loss": 0.1644, + "step": 16216 + }, + { + "epoch": 6.594957299715332, + "grad_norm": 5.575373098345464, + "learning_rate": 1.0249724928359015e-05, + "loss": 0.1143, + "step": 16217 + }, + { + "epoch": 6.595363969093127, + "grad_norm": 12.088859119901022, + "learning_rate": 1.024871205485259e-05, + "loss": 0.6433, + "step": 16218 + }, + { + "epoch": 6.595770638470923, + "grad_norm": 3.544767642943437, + "learning_rate": 1.0247699178793016e-05, + "loss": 0.144, + "step": 16219 + }, + { + "epoch": 6.596177307848719, + "grad_norm": 3.0593513154091085, + "learning_rate": 1.0246686300190682e-05, + "loss": 0.0608, + "step": 16220 + }, + { + "epoch": 6.596583977226515, + "grad_norm": 6.246498082596596, + "learning_rate": 1.0245673419055988e-05, + "loss": 0.0937, + "step": 16221 + }, + { + "epoch": 6.5969906466043104, + "grad_norm": 0.04068505230136325, + "learning_rate": 1.0244660535399335e-05, + "loss": 0.0005, + "step": 16222 + }, + { + "epoch": 6.597397315982106, + "grad_norm": 6.597283720069392, + "learning_rate": 1.0243647649231118e-05, + "loss": 0.2009, + "step": 16223 + }, + { + "epoch": 6.597803985359903, + "grad_norm": 6.126321645373382, + "learning_rate": 1.0242634760561738e-05, + "loss": 0.2025, + "step": 16224 + }, + { + "epoch": 6.598210654737699, + "grad_norm": 12.89789087533978, + "learning_rate": 1.0241621869401586e-05, + "loss": 0.672, + "step": 16225 + }, + { + "epoch": 6.598617324115494, + "grad_norm": 2.1610004562659344, + "learning_rate": 1.0240608975761066e-05, + "loss": 0.0656, + "step": 16226 + }, + { + "epoch": 6.59902399349329, + "grad_norm": 8.135028138338443, + "learning_rate": 1.0239596079650574e-05, + "loss": 0.2099, + "step": 16227 + }, + { + "epoch": 6.599430662871086, + "grad_norm": 3.4319448079364183, + "learning_rate": 1.0238583181080503e-05, + "loss": 0.1431, + "step": 16228 + }, + { + "epoch": 6.599837332248882, + "grad_norm": 0.11856669610094377, + "learning_rate": 1.0237570280061262e-05, + "loss": 0.0017, + "step": 16229 + }, + { + "epoch": 6.6002440016266775, + "grad_norm": 2.283151572824727, + "learning_rate": 1.023655737660324e-05, + "loss": 0.0244, + "step": 16230 + }, + { + "epoch": 6.600650671004473, + "grad_norm": 6.614093298989984, + "learning_rate": 1.0235544470716833e-05, + "loss": 0.2321, + "step": 16231 + }, + { + "epoch": 6.601057340382269, + "grad_norm": 11.687698931836982, + "learning_rate": 1.0234531562412452e-05, + "loss": 0.2974, + "step": 16232 + }, + { + "epoch": 6.601464009760065, + "grad_norm": 3.0480805596978775, + "learning_rate": 1.023351865170048e-05, + "loss": 0.0597, + "step": 16233 + }, + { + "epoch": 6.6018706791378605, + "grad_norm": 4.952775824059191, + "learning_rate": 1.0232505738591323e-05, + "loss": 0.0765, + "step": 16234 + }, + { + "epoch": 6.602277348515656, + "grad_norm": 6.580944143279074, + "learning_rate": 1.023149282309538e-05, 
+ "loss": 0.1574, + "step": 16235 + }, + { + "epoch": 6.602684017893453, + "grad_norm": 0.22133357473557558, + "learning_rate": 1.0230479905223046e-05, + "loss": 0.004, + "step": 16236 + }, + { + "epoch": 6.603090687271249, + "grad_norm": 1.6257935467705078, + "learning_rate": 1.0229466984984717e-05, + "loss": 0.0213, + "step": 16237 + }, + { + "epoch": 6.6034973566490445, + "grad_norm": 2.5704029341904535, + "learning_rate": 1.0228454062390796e-05, + "loss": 0.0988, + "step": 16238 + }, + { + "epoch": 6.60390402602684, + "grad_norm": 2.523452361442513, + "learning_rate": 1.0227441137451679e-05, + "loss": 0.0298, + "step": 16239 + }, + { + "epoch": 6.604310695404636, + "grad_norm": 0.6096416752996215, + "learning_rate": 1.0226428210177764e-05, + "loss": 0.0049, + "step": 16240 + }, + { + "epoch": 6.604717364782432, + "grad_norm": 3.286692495114898, + "learning_rate": 1.022541528057945e-05, + "loss": 0.0868, + "step": 16241 + }, + { + "epoch": 6.6051240341602275, + "grad_norm": 0.6624806109074052, + "learning_rate": 1.0224402348667135e-05, + "loss": 0.0249, + "step": 16242 + }, + { + "epoch": 6.605530703538023, + "grad_norm": 0.5804312084019174, + "learning_rate": 1.0223389414451215e-05, + "loss": 0.0057, + "step": 16243 + }, + { + "epoch": 6.605937372915819, + "grad_norm": 3.2853788101691648, + "learning_rate": 1.0222376477942091e-05, + "loss": 0.0547, + "step": 16244 + }, + { + "epoch": 6.606344042293616, + "grad_norm": 4.810575621873702, + "learning_rate": 1.0221363539150161e-05, + "loss": 0.13, + "step": 16245 + }, + { + "epoch": 6.6067507116714115, + "grad_norm": 7.088945883390168, + "learning_rate": 1.0220350598085825e-05, + "loss": 0.1857, + "step": 16246 + }, + { + "epoch": 6.607157381049207, + "grad_norm": 0.374847829859672, + "learning_rate": 1.0219337654759474e-05, + "loss": 0.0049, + "step": 16247 + }, + { + "epoch": 6.607564050427003, + "grad_norm": 8.236821465601219, + "learning_rate": 1.0218324709181516e-05, + "loss": 0.2105, + "step": 16248 + }, + { + "epoch": 6.607970719804799, + "grad_norm": 0.12028628004489991, + "learning_rate": 1.0217311761362342e-05, + "loss": 0.0021, + "step": 16249 + }, + { + "epoch": 6.6083773891825945, + "grad_norm": 0.2556474608715289, + "learning_rate": 1.0216298811312355e-05, + "loss": 0.0035, + "step": 16250 + }, + { + "epoch": 6.60878405856039, + "grad_norm": 7.788633814467655, + "learning_rate": 1.0215285859041953e-05, + "loss": 0.1636, + "step": 16251 + }, + { + "epoch": 6.609190727938186, + "grad_norm": 5.103924824933986, + "learning_rate": 1.0214272904561533e-05, + "loss": 0.126, + "step": 16252 + }, + { + "epoch": 6.609597397315982, + "grad_norm": 0.35219740759793833, + "learning_rate": 1.0213259947881492e-05, + "loss": 0.0059, + "step": 16253 + }, + { + "epoch": 6.610004066693778, + "grad_norm": 4.014771375975289, + "learning_rate": 1.0212246989012228e-05, + "loss": 0.0671, + "step": 16254 + }, + { + "epoch": 6.610410736071573, + "grad_norm": 0.3826373061578833, + "learning_rate": 1.0211234027964147e-05, + "loss": 0.0049, + "step": 16255 + }, + { + "epoch": 6.61081740544937, + "grad_norm": 5.8704863452668, + "learning_rate": 1.0210221064747639e-05, + "loss": 0.1859, + "step": 16256 + }, + { + "epoch": 6.611224074827166, + "grad_norm": 8.433550996658777, + "learning_rate": 1.020920809937311e-05, + "loss": 0.3184, + "step": 16257 + }, + { + "epoch": 6.6116307442049616, + "grad_norm": 10.37447977916185, + "learning_rate": 1.020819513185095e-05, + "loss": 0.4128, + "step": 16258 + }, + { + "epoch": 6.612037413582757, + "grad_norm": 
15.2038917720595, + "learning_rate": 1.0207182162191565e-05, + "loss": 0.3737, + "step": 16259 + }, + { + "epoch": 6.612444082960553, + "grad_norm": 3.72903243395253, + "learning_rate": 1.0206169190405349e-05, + "loss": 0.0436, + "step": 16260 + }, + { + "epoch": 6.612850752338349, + "grad_norm": 2.586397853943238, + "learning_rate": 1.0205156216502704e-05, + "loss": 0.0273, + "step": 16261 + }, + { + "epoch": 6.613257421716145, + "grad_norm": 0.3326315047652212, + "learning_rate": 1.0204143240494026e-05, + "loss": 0.004, + "step": 16262 + }, + { + "epoch": 6.61366409109394, + "grad_norm": 0.7725814630062325, + "learning_rate": 1.0203130262389715e-05, + "loss": 0.0092, + "step": 16263 + }, + { + "epoch": 6.614070760471736, + "grad_norm": 0.03781808469492268, + "learning_rate": 1.020211728220017e-05, + "loss": 0.0005, + "step": 16264 + }, + { + "epoch": 6.614477429849533, + "grad_norm": 6.353489775917295, + "learning_rate": 1.0201104299935792e-05, + "loss": 0.2921, + "step": 16265 + }, + { + "epoch": 6.614884099227329, + "grad_norm": 1.2041921939216493, + "learning_rate": 1.0200091315606973e-05, + "loss": 0.02, + "step": 16266 + }, + { + "epoch": 6.615290768605124, + "grad_norm": 0.15729839174890992, + "learning_rate": 1.0199078329224117e-05, + "loss": 0.0018, + "step": 16267 + }, + { + "epoch": 6.61569743798292, + "grad_norm": 3.8158105208614246, + "learning_rate": 1.0198065340797622e-05, + "loss": 0.0667, + "step": 16268 + }, + { + "epoch": 6.616104107360716, + "grad_norm": 15.336481280264227, + "learning_rate": 1.0197052350337885e-05, + "loss": 0.4734, + "step": 16269 + }, + { + "epoch": 6.616510776738512, + "grad_norm": 0.13412725078725948, + "learning_rate": 1.019603935785531e-05, + "loss": 0.0022, + "step": 16270 + }, + { + "epoch": 6.616917446116307, + "grad_norm": 5.265849640162085, + "learning_rate": 1.0195026363360291e-05, + "loss": 0.1283, + "step": 16271 + }, + { + "epoch": 6.617324115494103, + "grad_norm": 4.757935267933151, + "learning_rate": 1.0194013366863229e-05, + "loss": 0.1306, + "step": 16272 + }, + { + "epoch": 6.617730784871899, + "grad_norm": 0.024949644048740376, + "learning_rate": 1.0193000368374519e-05, + "loss": 0.0004, + "step": 16273 + }, + { + "epoch": 6.618137454249695, + "grad_norm": 6.464218798622614, + "learning_rate": 1.0191987367904566e-05, + "loss": 0.149, + "step": 16274 + }, + { + "epoch": 6.6185441236274904, + "grad_norm": 5.944289431476992, + "learning_rate": 1.0190974365463766e-05, + "loss": 0.1369, + "step": 16275 + }, + { + "epoch": 6.618950793005286, + "grad_norm": 2.2265803550225995, + "learning_rate": 1.0189961361062514e-05, + "loss": 0.0337, + "step": 16276 + }, + { + "epoch": 6.619357462383083, + "grad_norm": 4.460136184920372, + "learning_rate": 1.018894835471122e-05, + "loss": 0.0909, + "step": 16277 + }, + { + "epoch": 6.619764131760879, + "grad_norm": 0.0017153838702179732, + "learning_rate": 1.0187935346420272e-05, + "loss": 0.0, + "step": 16278 + }, + { + "epoch": 6.620170801138674, + "grad_norm": 0.43833361029582846, + "learning_rate": 1.0186922336200071e-05, + "loss": 0.0061, + "step": 16279 + }, + { + "epoch": 6.62057747051647, + "grad_norm": 2.614443267944997, + "learning_rate": 1.018590932406102e-05, + "loss": 0.0284, + "step": 16280 + }, + { + "epoch": 6.620984139894266, + "grad_norm": 4.288406184189223, + "learning_rate": 1.0184896310013519e-05, + "loss": 0.0785, + "step": 16281 + }, + { + "epoch": 6.621390809272062, + "grad_norm": 9.104229363524107, + "learning_rate": 1.0183883294067961e-05, + "loss": 0.2875, + "step": 16282 + 
}, + { + "epoch": 6.6217974786498575, + "grad_norm": 1.8669974770047268, + "learning_rate": 1.0182870276234749e-05, + "loss": 0.0488, + "step": 16283 + }, + { + "epoch": 6.622204148027653, + "grad_norm": 0.3602685097875557, + "learning_rate": 1.0181857256524283e-05, + "loss": 0.0038, + "step": 16284 + }, + { + "epoch": 6.622610817405449, + "grad_norm": 13.625420896346094, + "learning_rate": 1.0180844234946961e-05, + "loss": 0.1635, + "step": 16285 + }, + { + "epoch": 6.623017486783246, + "grad_norm": 5.33870763415546, + "learning_rate": 1.0179831211513178e-05, + "loss": 0.1261, + "step": 16286 + }, + { + "epoch": 6.623424156161041, + "grad_norm": 0.982460728076353, + "learning_rate": 1.0178818186233342e-05, + "loss": 0.0128, + "step": 16287 + }, + { + "epoch": 6.623830825538837, + "grad_norm": 10.116022620467703, + "learning_rate": 1.0177805159117844e-05, + "loss": 0.3633, + "step": 16288 + }, + { + "epoch": 6.624237494916633, + "grad_norm": 17.320489782717434, + "learning_rate": 1.0176792130177087e-05, + "loss": 0.2726, + "step": 16289 + }, + { + "epoch": 6.624644164294429, + "grad_norm": 0.7363567220259755, + "learning_rate": 1.0175779099421472e-05, + "loss": 0.0077, + "step": 16290 + }, + { + "epoch": 6.6250508336722245, + "grad_norm": 0.7279981158398423, + "learning_rate": 1.0174766066861396e-05, + "loss": 0.0105, + "step": 16291 + }, + { + "epoch": 6.62545750305002, + "grad_norm": 1.0088943573514206, + "learning_rate": 1.0173753032507256e-05, + "loss": 0.0123, + "step": 16292 + }, + { + "epoch": 6.625864172427816, + "grad_norm": 10.127109482237762, + "learning_rate": 1.0172739996369454e-05, + "loss": 0.3236, + "step": 16293 + }, + { + "epoch": 6.626270841805612, + "grad_norm": 9.914558859931981, + "learning_rate": 1.0171726958458392e-05, + "loss": 0.2866, + "step": 16294 + }, + { + "epoch": 6.6266775111834075, + "grad_norm": 1.4144072492753503, + "learning_rate": 1.017071391878446e-05, + "loss": 0.0235, + "step": 16295 + }, + { + "epoch": 6.627084180561203, + "grad_norm": 2.588601522420051, + "learning_rate": 1.016970087735807e-05, + "loss": 0.0368, + "step": 16296 + }, + { + "epoch": 6.627490849939, + "grad_norm": 7.288040346409138, + "learning_rate": 1.0168687834189613e-05, + "loss": 0.3101, + "step": 16297 + }, + { + "epoch": 6.627897519316796, + "grad_norm": 0.9999588626211853, + "learning_rate": 1.0167674789289488e-05, + "loss": 0.0205, + "step": 16298 + }, + { + "epoch": 6.6283041886945915, + "grad_norm": 13.992422277098559, + "learning_rate": 1.01666617426681e-05, + "loss": 0.0647, + "step": 16299 + }, + { + "epoch": 6.628710858072387, + "grad_norm": 0.787634857242791, + "learning_rate": 1.0165648694335846e-05, + "loss": 0.0072, + "step": 16300 + }, + { + "epoch": 6.629117527450183, + "grad_norm": 1.1993607226751533, + "learning_rate": 1.0164635644303123e-05, + "loss": 0.0176, + "step": 16301 + }, + { + "epoch": 6.629524196827979, + "grad_norm": 0.32694789054824963, + "learning_rate": 1.016362259258033e-05, + "loss": 0.0036, + "step": 16302 + }, + { + "epoch": 6.6299308662057745, + "grad_norm": 1.0407156430421847, + "learning_rate": 1.0162609539177872e-05, + "loss": 0.0158, + "step": 16303 + }, + { + "epoch": 6.63033753558357, + "grad_norm": 12.244300706493593, + "learning_rate": 1.0161596484106147e-05, + "loss": 0.4766, + "step": 16304 + }, + { + "epoch": 6.630744204961366, + "grad_norm": 3.0847107911740204, + "learning_rate": 1.0160583427375548e-05, + "loss": 0.0268, + "step": 16305 + }, + { + "epoch": 6.631150874339163, + "grad_norm": 6.055675168466312, + "learning_rate": 
1.0159570368996483e-05, + "loss": 0.1507, + "step": 16306 + }, + { + "epoch": 6.6315575437169585, + "grad_norm": 2.5376948319121033, + "learning_rate": 1.0158557308979347e-05, + "loss": 0.0341, + "step": 16307 + }, + { + "epoch": 6.631964213094754, + "grad_norm": 8.65346494690925, + "learning_rate": 1.0157544247334536e-05, + "loss": 0.153, + "step": 16308 + }, + { + "epoch": 6.63237088247255, + "grad_norm": 20.259243977464443, + "learning_rate": 1.015653118407246e-05, + "loss": 0.28, + "step": 16309 + }, + { + "epoch": 6.632777551850346, + "grad_norm": 2.1441124198232817, + "learning_rate": 1.0155518119203511e-05, + "loss": 0.034, + "step": 16310 + }, + { + "epoch": 6.6331842212281416, + "grad_norm": 0.037348516389531156, + "learning_rate": 1.0154505052738088e-05, + "loss": 0.0005, + "step": 16311 + }, + { + "epoch": 6.633590890605937, + "grad_norm": 12.445680827756627, + "learning_rate": 1.0153491984686595e-05, + "loss": 0.2916, + "step": 16312 + }, + { + "epoch": 6.633997559983733, + "grad_norm": 7.587469155402664, + "learning_rate": 1.015247891505943e-05, + "loss": 0.2017, + "step": 16313 + }, + { + "epoch": 6.634404229361529, + "grad_norm": 0.021176064559926374, + "learning_rate": 1.015146584386699e-05, + "loss": 0.0003, + "step": 16314 + }, + { + "epoch": 6.634810898739325, + "grad_norm": 5.900533226235214, + "learning_rate": 1.0150452771119678e-05, + "loss": 0.0662, + "step": 16315 + }, + { + "epoch": 6.63521756811712, + "grad_norm": 6.356488455823636, + "learning_rate": 1.0149439696827895e-05, + "loss": 0.1437, + "step": 16316 + }, + { + "epoch": 6.635624237494916, + "grad_norm": 6.851074585375268, + "learning_rate": 1.0148426621002035e-05, + "loss": 0.1532, + "step": 16317 + }, + { + "epoch": 6.636030906872713, + "grad_norm": 9.926233557551907, + "learning_rate": 1.01474135436525e-05, + "loss": 0.2649, + "step": 16318 + }, + { + "epoch": 6.636437576250509, + "grad_norm": 7.294042135145825, + "learning_rate": 1.0146400464789694e-05, + "loss": 0.448, + "step": 16319 + }, + { + "epoch": 6.636844245628304, + "grad_norm": 3.4019602362065324, + "learning_rate": 1.0145387384424013e-05, + "loss": 0.057, + "step": 16320 + }, + { + "epoch": 6.6372509150061, + "grad_norm": 0.6303587983599569, + "learning_rate": 1.0144374302565855e-05, + "loss": 0.0076, + "step": 16321 + }, + { + "epoch": 6.637657584383896, + "grad_norm": 9.571706996838834, + "learning_rate": 1.0143361219225625e-05, + "loss": 0.5705, + "step": 16322 + }, + { + "epoch": 6.638064253761692, + "grad_norm": 0.6963385419978558, + "learning_rate": 1.014234813441372e-05, + "loss": 0.0098, + "step": 16323 + }, + { + "epoch": 6.638470923139487, + "grad_norm": 6.469929453709411, + "learning_rate": 1.0141335048140538e-05, + "loss": 0.2263, + "step": 16324 + }, + { + "epoch": 6.638877592517283, + "grad_norm": 4.2352356810194, + "learning_rate": 1.0140321960416479e-05, + "loss": 0.0587, + "step": 16325 + }, + { + "epoch": 6.639284261895079, + "grad_norm": 0.08223637001528733, + "learning_rate": 1.0139308871251947e-05, + "loss": 0.0011, + "step": 16326 + }, + { + "epoch": 6.639690931272876, + "grad_norm": 4.422265965024189, + "learning_rate": 1.0138295780657339e-05, + "loss": 0.121, + "step": 16327 + }, + { + "epoch": 6.640097600650671, + "grad_norm": 1.0081401892618993, + "learning_rate": 1.0137282688643056e-05, + "loss": 0.0141, + "step": 16328 + }, + { + "epoch": 6.640504270028467, + "grad_norm": 4.197539844711909, + "learning_rate": 1.0136269595219496e-05, + "loss": 0.088, + "step": 16329 + }, + { + "epoch": 6.640910939406263, + 
"grad_norm": 1.73377428677931, + "learning_rate": 1.0135256500397062e-05, + "loss": 0.0332, + "step": 16330 + }, + { + "epoch": 6.641317608784059, + "grad_norm": 1.3392418265327426, + "learning_rate": 1.0134243404186148e-05, + "loss": 0.0302, + "step": 16331 + }, + { + "epoch": 6.641724278161854, + "grad_norm": 0.07641872967692104, + "learning_rate": 1.0133230306597158e-05, + "loss": 0.0008, + "step": 16332 + }, + { + "epoch": 6.64213094753965, + "grad_norm": 0.10081831981436475, + "learning_rate": 1.0132217207640497e-05, + "loss": 0.0018, + "step": 16333 + }, + { + "epoch": 6.642537616917446, + "grad_norm": 0.7195209217416506, + "learning_rate": 1.0131204107326555e-05, + "loss": 0.0107, + "step": 16334 + }, + { + "epoch": 6.642944286295242, + "grad_norm": 9.303602274651249, + "learning_rate": 1.013019100566574e-05, + "loss": 0.1822, + "step": 16335 + }, + { + "epoch": 6.6433509556730375, + "grad_norm": 0.7696601302146642, + "learning_rate": 1.0129177902668448e-05, + "loss": 0.0135, + "step": 16336 + }, + { + "epoch": 6.643757625050833, + "grad_norm": 1.0013369202922333, + "learning_rate": 1.0128164798345077e-05, + "loss": 0.0133, + "step": 16337 + }, + { + "epoch": 6.64416429442863, + "grad_norm": 2.6383205605154174, + "learning_rate": 1.0127151692706032e-05, + "loss": 0.0432, + "step": 16338 + }, + { + "epoch": 6.644570963806426, + "grad_norm": 0.16287160440091578, + "learning_rate": 1.0126138585761713e-05, + "loss": 0.0027, + "step": 16339 + }, + { + "epoch": 6.644977633184221, + "grad_norm": 1.4237781867968782, + "learning_rate": 1.0125125477522514e-05, + "loss": 0.0094, + "step": 16340 + }, + { + "epoch": 6.645384302562017, + "grad_norm": 8.632374420097722, + "learning_rate": 1.0124112367998844e-05, + "loss": 0.2246, + "step": 16341 + }, + { + "epoch": 6.645790971939813, + "grad_norm": 4.227173134671213, + "learning_rate": 1.0123099257201094e-05, + "loss": 0.0903, + "step": 16342 + }, + { + "epoch": 6.646197641317609, + "grad_norm": 0.530239102425315, + "learning_rate": 1.0122086145139669e-05, + "loss": 0.0109, + "step": 16343 + }, + { + "epoch": 6.6466043106954045, + "grad_norm": 0.5021684210347197, + "learning_rate": 1.0121073031824969e-05, + "loss": 0.0073, + "step": 16344 + }, + { + "epoch": 6.6470109800732, + "grad_norm": 0.2841315001961665, + "learning_rate": 1.0120059917267392e-05, + "loss": 0.003, + "step": 16345 + }, + { + "epoch": 6.647417649450996, + "grad_norm": 0.677651572734681, + "learning_rate": 1.0119046801477338e-05, + "loss": 0.0107, + "step": 16346 + }, + { + "epoch": 6.647824318828793, + "grad_norm": 0.639457863275892, + "learning_rate": 1.0118033684465211e-05, + "loss": 0.0069, + "step": 16347 + }, + { + "epoch": 6.648230988206588, + "grad_norm": 3.7513274126736698, + "learning_rate": 1.0117020566241411e-05, + "loss": 0.1686, + "step": 16348 + }, + { + "epoch": 6.648637657584384, + "grad_norm": 0.9629901742701391, + "learning_rate": 1.0116007446816334e-05, + "loss": 0.0201, + "step": 16349 + }, + { + "epoch": 6.64904432696218, + "grad_norm": 4.147888633999571, + "learning_rate": 1.011499432620038e-05, + "loss": 0.1127, + "step": 16350 + }, + { + "epoch": 6.649450996339976, + "grad_norm": 0.06486319178876737, + "learning_rate": 1.0113981204403956e-05, + "loss": 0.0011, + "step": 16351 + }, + { + "epoch": 6.6498576657177715, + "grad_norm": 6.874134167964703, + "learning_rate": 1.0112968081437455e-05, + "loss": 0.2634, + "step": 16352 + }, + { + "epoch": 6.650264335095567, + "grad_norm": 8.325530587434152, + "learning_rate": 1.0111954957311277e-05, + "loss": 0.2249, 
+ "step": 16353 + }, + { + "epoch": 6.650671004473363, + "grad_norm": 4.830533085662542, + "learning_rate": 1.0110941832035828e-05, + "loss": 0.0584, + "step": 16354 + }, + { + "epoch": 6.651077673851159, + "grad_norm": 4.959938500848004, + "learning_rate": 1.0109928705621506e-05, + "loss": 0.08, + "step": 16355 + }, + { + "epoch": 6.6514843432289545, + "grad_norm": 1.3642139026686038, + "learning_rate": 1.0108915578078709e-05, + "loss": 0.0492, + "step": 16356 + }, + { + "epoch": 6.65189101260675, + "grad_norm": 6.863646957395001, + "learning_rate": 1.0107902449417839e-05, + "loss": 0.1449, + "step": 16357 + }, + { + "epoch": 6.652297681984546, + "grad_norm": 0.33651466309975137, + "learning_rate": 1.0106889319649299e-05, + "loss": 0.0051, + "step": 16358 + }, + { + "epoch": 6.652704351362343, + "grad_norm": 1.7999530510907729, + "learning_rate": 1.0105876188783478e-05, + "loss": 0.0413, + "step": 16359 + }, + { + "epoch": 6.6531110207401385, + "grad_norm": 5.4128953130484545, + "learning_rate": 1.0104863056830794e-05, + "loss": 0.1374, + "step": 16360 + }, + { + "epoch": 6.653517690117934, + "grad_norm": 0.6785954244494634, + "learning_rate": 1.0103849923801634e-05, + "loss": 0.0106, + "step": 16361 + }, + { + "epoch": 6.65392435949573, + "grad_norm": 3.152044711537182, + "learning_rate": 1.0102836789706402e-05, + "loss": 0.0555, + "step": 16362 + }, + { + "epoch": 6.654331028873526, + "grad_norm": 0.970467177002592, + "learning_rate": 1.0101823654555498e-05, + "loss": 0.0059, + "step": 16363 + }, + { + "epoch": 6.6547376982513216, + "grad_norm": 6.154330806387432, + "learning_rate": 1.0100810518359326e-05, + "loss": 0.2799, + "step": 16364 + }, + { + "epoch": 6.655144367629117, + "grad_norm": 0.204715469580191, + "learning_rate": 1.009979738112828e-05, + "loss": 0.0025, + "step": 16365 + }, + { + "epoch": 6.655551037006913, + "grad_norm": 0.02297927973434816, + "learning_rate": 1.0098784242872764e-05, + "loss": 0.0003, + "step": 16366 + }, + { + "epoch": 6.655957706384709, + "grad_norm": 9.989047683085458, + "learning_rate": 1.009777110360318e-05, + "loss": 0.4724, + "step": 16367 + }, + { + "epoch": 6.6563643757625055, + "grad_norm": 0.26285420558400907, + "learning_rate": 1.0096757963329926e-05, + "loss": 0.005, + "step": 16368 + }, + { + "epoch": 6.656771045140301, + "grad_norm": 0.3472186326322269, + "learning_rate": 1.00957448220634e-05, + "loss": 0.0047, + "step": 16369 + }, + { + "epoch": 6.657177714518097, + "grad_norm": 0.310160447219349, + "learning_rate": 1.0094731679814007e-05, + "loss": 0.0023, + "step": 16370 + }, + { + "epoch": 6.657584383895893, + "grad_norm": 7.344522614272876, + "learning_rate": 1.0093718536592147e-05, + "loss": 0.3081, + "step": 16371 + }, + { + "epoch": 6.657991053273689, + "grad_norm": 8.847609377046343, + "learning_rate": 1.0092705392408217e-05, + "loss": 0.1772, + "step": 16372 + }, + { + "epoch": 6.658397722651484, + "grad_norm": 4.310559131771685, + "learning_rate": 1.0091692247272622e-05, + "loss": 0.0646, + "step": 16373 + }, + { + "epoch": 6.65880439202928, + "grad_norm": 1.4657426322072915, + "learning_rate": 1.0090679101195759e-05, + "loss": 0.0297, + "step": 16374 + }, + { + "epoch": 6.659211061407076, + "grad_norm": 3.2331221399304626, + "learning_rate": 1.0089665954188028e-05, + "loss": 0.0712, + "step": 16375 + }, + { + "epoch": 6.659617730784872, + "grad_norm": 0.8845107381746597, + "learning_rate": 1.0088652806259831e-05, + "loss": 0.0099, + "step": 16376 + }, + { + "epoch": 6.660024400162667, + "grad_norm": 2.2241943089800773, + 
"learning_rate": 1.008763965742157e-05, + "loss": 0.0268, + "step": 16377 + }, + { + "epoch": 6.660431069540463, + "grad_norm": 0.9059578540264696, + "learning_rate": 1.0086626507683643e-05, + "loss": 0.0174, + "step": 16378 + }, + { + "epoch": 6.66083773891826, + "grad_norm": 3.9720542805866748, + "learning_rate": 1.008561335705645e-05, + "loss": 0.0796, + "step": 16379 + }, + { + "epoch": 6.661244408296056, + "grad_norm": 4.767471782034241, + "learning_rate": 1.0084600205550395e-05, + "loss": 0.114, + "step": 16380 + }, + { + "epoch": 6.661651077673851, + "grad_norm": 3.2504051969340146, + "learning_rate": 1.0083587053175877e-05, + "loss": 0.1333, + "step": 16381 + }, + { + "epoch": 6.662057747051647, + "grad_norm": 3.654312905906428, + "learning_rate": 1.008257389994329e-05, + "loss": 0.0833, + "step": 16382 + }, + { + "epoch": 6.662464416429443, + "grad_norm": 11.335701195313073, + "learning_rate": 1.0081560745863046e-05, + "loss": 0.1751, + "step": 16383 + }, + { + "epoch": 6.662871085807239, + "grad_norm": 4.39968382032699, + "learning_rate": 1.0080547590945539e-05, + "loss": 0.1157, + "step": 16384 + }, + { + "epoch": 6.663277755185034, + "grad_norm": 6.12383347300151, + "learning_rate": 1.0079534435201168e-05, + "loss": 0.1419, + "step": 16385 + }, + { + "epoch": 6.66368442456283, + "grad_norm": 4.0435811226317915, + "learning_rate": 1.0078521278640339e-05, + "loss": 0.0727, + "step": 16386 + }, + { + "epoch": 6.664091093940626, + "grad_norm": 3.070626500625484, + "learning_rate": 1.0077508121273449e-05, + "loss": 0.0451, + "step": 16387 + }, + { + "epoch": 6.664497763318423, + "grad_norm": 0.027586264573372953, + "learning_rate": 1.0076494963110899e-05, + "loss": 0.0002, + "step": 16388 + }, + { + "epoch": 6.664904432696218, + "grad_norm": 4.158301049602894, + "learning_rate": 1.007548180416309e-05, + "loss": 0.1266, + "step": 16389 + }, + { + "epoch": 6.665311102074014, + "grad_norm": 0.025343317156650794, + "learning_rate": 1.0074468644440422e-05, + "loss": 0.0005, + "step": 16390 + }, + { + "epoch": 6.66571777145181, + "grad_norm": 2.0680259528520915, + "learning_rate": 1.0073455483953294e-05, + "loss": 0.0319, + "step": 16391 + }, + { + "epoch": 6.666124440829606, + "grad_norm": 0.46242998048895695, + "learning_rate": 1.0072442322712111e-05, + "loss": 0.0068, + "step": 16392 + }, + { + "epoch": 6.666531110207401, + "grad_norm": 8.24256276286571, + "learning_rate": 1.007142916072727e-05, + "loss": 0.2854, + "step": 16393 + }, + { + "epoch": 6.666937779585197, + "grad_norm": 3.960155760962026, + "learning_rate": 1.0070415998009172e-05, + "loss": 0.0275, + "step": 16394 + }, + { + "epoch": 6.667344448962993, + "grad_norm": 15.857172832842704, + "learning_rate": 1.0069402834568218e-05, + "loss": 0.5732, + "step": 16395 + }, + { + "epoch": 6.667751118340789, + "grad_norm": 1.9834349858157962, + "learning_rate": 1.006838967041481e-05, + "loss": 0.034, + "step": 16396 + }, + { + "epoch": 6.6681577877185845, + "grad_norm": 4.933703266915647, + "learning_rate": 1.006737650555935e-05, + "loss": 0.0777, + "step": 16397 + }, + { + "epoch": 6.66856445709638, + "grad_norm": 3.7793592081917726, + "learning_rate": 1.0066363340012233e-05, + "loss": 0.1136, + "step": 16398 + }, + { + "epoch": 6.668971126474176, + "grad_norm": 0.1168798745238821, + "learning_rate": 1.0065350173783863e-05, + "loss": 0.0014, + "step": 16399 + }, + { + "epoch": 6.669377795851973, + "grad_norm": 0.13801799775662618, + "learning_rate": 1.006433700688464e-05, + "loss": 0.0022, + "step": 16400 + }, + { + "epoch": 
6.669784465229768, + "grad_norm": 2.4563379382238995, + "learning_rate": 1.0063323839324968e-05, + "loss": 0.0514, + "step": 16401 + }, + { + "epoch": 6.670191134607564, + "grad_norm": 0.5877843447175539, + "learning_rate": 1.0062310671115244e-05, + "loss": 0.0076, + "step": 16402 + }, + { + "epoch": 6.67059780398536, + "grad_norm": 7.61512624960288, + "learning_rate": 1.006129750226587e-05, + "loss": 0.1703, + "step": 16403 + }, + { + "epoch": 6.671004473363156, + "grad_norm": 6.1648422017285585, + "learning_rate": 1.0060284332787243e-05, + "loss": 0.1339, + "step": 16404 + }, + { + "epoch": 6.6714111427409515, + "grad_norm": 2.276544500551352, + "learning_rate": 1.005927116268977e-05, + "loss": 0.0336, + "step": 16405 + }, + { + "epoch": 6.671817812118747, + "grad_norm": 8.757651331545372, + "learning_rate": 1.0058257991983848e-05, + "loss": 0.4795, + "step": 16406 + }, + { + "epoch": 6.672224481496543, + "grad_norm": 18.947949795766526, + "learning_rate": 1.0057244820679878e-05, + "loss": 0.8867, + "step": 16407 + }, + { + "epoch": 6.672631150874339, + "grad_norm": 1.3594497515253143, + "learning_rate": 1.0056231648788262e-05, + "loss": 0.0178, + "step": 16408 + }, + { + "epoch": 6.673037820252135, + "grad_norm": 1.2111384857569605, + "learning_rate": 1.0055218476319399e-05, + "loss": 0.0213, + "step": 16409 + }, + { + "epoch": 6.673444489629931, + "grad_norm": 1.4641002839002504, + "learning_rate": 1.0054205303283692e-05, + "loss": 0.0146, + "step": 16410 + }, + { + "epoch": 6.673851159007727, + "grad_norm": 5.355543265765114, + "learning_rate": 1.0053192129691537e-05, + "loss": 0.1491, + "step": 16411 + }, + { + "epoch": 6.674257828385523, + "grad_norm": 0.025126058685303033, + "learning_rate": 1.0052178955553343e-05, + "loss": 0.0004, + "step": 16412 + }, + { + "epoch": 6.6746644977633185, + "grad_norm": 1.685191319828132, + "learning_rate": 1.0051165780879503e-05, + "loss": 0.0588, + "step": 16413 + }, + { + "epoch": 6.675071167141114, + "grad_norm": 0.032898356242858307, + "learning_rate": 1.0050152605680419e-05, + "loss": 0.0006, + "step": 16414 + }, + { + "epoch": 6.67547783651891, + "grad_norm": 0.11061307205890336, + "learning_rate": 1.0049139429966494e-05, + "loss": 0.0017, + "step": 16415 + }, + { + "epoch": 6.675884505896706, + "grad_norm": 3.17719757111324, + "learning_rate": 1.0048126253748132e-05, + "loss": 0.0404, + "step": 16416 + }, + { + "epoch": 6.6762911752745016, + "grad_norm": 2.809273481600083, + "learning_rate": 1.0047113077035723e-05, + "loss": 0.0275, + "step": 16417 + }, + { + "epoch": 6.676697844652297, + "grad_norm": 0.026661187185138453, + "learning_rate": 1.0046099899839679e-05, + "loss": 0.0003, + "step": 16418 + }, + { + "epoch": 6.677104514030093, + "grad_norm": 0.6712100398528771, + "learning_rate": 1.0045086722170396e-05, + "loss": 0.0092, + "step": 16419 + }, + { + "epoch": 6.67751118340789, + "grad_norm": 0.7597212241988819, + "learning_rate": 1.0044073544038275e-05, + "loss": 0.0124, + "step": 16420 + }, + { + "epoch": 6.6779178527856855, + "grad_norm": 6.962218837160983, + "learning_rate": 1.0043060365453716e-05, + "loss": 0.1488, + "step": 16421 + }, + { + "epoch": 6.678324522163481, + "grad_norm": 1.0929140227104535, + "learning_rate": 1.0042047186427122e-05, + "loss": 0.0205, + "step": 16422 + }, + { + "epoch": 6.678731191541277, + "grad_norm": 1.5609092584634507, + "learning_rate": 1.0041034006968891e-05, + "loss": 0.0167, + "step": 16423 + }, + { + "epoch": 6.679137860919073, + "grad_norm": 2.9152615925500873, + "learning_rate": 
1.0040020827089425e-05, + "loss": 0.0279, + "step": 16424 + }, + { + "epoch": 6.679544530296869, + "grad_norm": 0.2611526082304212, + "learning_rate": 1.0039007646799126e-05, + "loss": 0.0036, + "step": 16425 + }, + { + "epoch": 6.679951199674664, + "grad_norm": 4.388205479311251, + "learning_rate": 1.0037994466108393e-05, + "loss": 0.151, + "step": 16426 + }, + { + "epoch": 6.68035786905246, + "grad_norm": 0.17284580608610509, + "learning_rate": 1.0036981285027629e-05, + "loss": 0.0025, + "step": 16427 + }, + { + "epoch": 6.680764538430256, + "grad_norm": 0.05156107793777747, + "learning_rate": 1.0035968103567233e-05, + "loss": 0.0008, + "step": 16428 + }, + { + "epoch": 6.6811712078080525, + "grad_norm": 9.541112618358428, + "learning_rate": 1.0034954921737608e-05, + "loss": 0.3626, + "step": 16429 + }, + { + "epoch": 6.681577877185848, + "grad_norm": 13.26861419069179, + "learning_rate": 1.003394173954915e-05, + "loss": 0.5648, + "step": 16430 + }, + { + "epoch": 6.681984546563644, + "grad_norm": 14.262849875595048, + "learning_rate": 1.0032928557012264e-05, + "loss": 1.0269, + "step": 16431 + }, + { + "epoch": 6.68239121594144, + "grad_norm": 5.985853461385357, + "learning_rate": 1.003191537413735e-05, + "loss": 0.1378, + "step": 16432 + }, + { + "epoch": 6.682797885319236, + "grad_norm": 13.908337793179035, + "learning_rate": 1.003090219093481e-05, + "loss": 0.49, + "step": 16433 + }, + { + "epoch": 6.683204554697031, + "grad_norm": 0.005498515000878426, + "learning_rate": 1.0029889007415042e-05, + "loss": 0.0001, + "step": 16434 + }, + { + "epoch": 6.683611224074827, + "grad_norm": 8.117662396756947, + "learning_rate": 1.0028875823588449e-05, + "loss": 0.1533, + "step": 16435 + }, + { + "epoch": 6.684017893452623, + "grad_norm": 12.936521606462462, + "learning_rate": 1.0027862639465432e-05, + "loss": 0.874, + "step": 16436 + }, + { + "epoch": 6.684424562830419, + "grad_norm": 3.709199343483574, + "learning_rate": 1.0026849455056387e-05, + "loss": 0.1004, + "step": 16437 + }, + { + "epoch": 6.684831232208214, + "grad_norm": 0.12718834779862898, + "learning_rate": 1.0025836270371722e-05, + "loss": 0.0011, + "step": 16438 + }, + { + "epoch": 6.68523790158601, + "grad_norm": 5.566010458410605, + "learning_rate": 1.0024823085421838e-05, + "loss": 0.0923, + "step": 16439 + }, + { + "epoch": 6.685644570963806, + "grad_norm": 0.41828126748175587, + "learning_rate": 1.0023809900217125e-05, + "loss": 0.0051, + "step": 16440 + }, + { + "epoch": 6.686051240341603, + "grad_norm": 0.006113780299942808, + "learning_rate": 1.0022796714767998e-05, + "loss": 0.0001, + "step": 16441 + }, + { + "epoch": 6.686457909719398, + "grad_norm": 9.15118931121279, + "learning_rate": 1.0021783529084847e-05, + "loss": 0.2729, + "step": 16442 + }, + { + "epoch": 6.686864579097194, + "grad_norm": 8.83180527395873, + "learning_rate": 1.0020770343178077e-05, + "loss": 0.1762, + "step": 16443 + }, + { + "epoch": 6.68727124847499, + "grad_norm": 3.6107149266642162, + "learning_rate": 1.0019757157058092e-05, + "loss": 0.0546, + "step": 16444 + }, + { + "epoch": 6.687677917852786, + "grad_norm": 8.593551253816601, + "learning_rate": 1.001874397073529e-05, + "loss": 0.47, + "step": 16445 + }, + { + "epoch": 6.688084587230581, + "grad_norm": 2.782233308758495, + "learning_rate": 1.0017730784220068e-05, + "loss": 0.0367, + "step": 16446 + }, + { + "epoch": 6.688491256608377, + "grad_norm": 7.71636357076934, + "learning_rate": 1.0016717597522831e-05, + "loss": 0.3365, + "step": 16447 + }, + { + "epoch": 6.688897925986173, + 
"grad_norm": 0.6609667047649945, + "learning_rate": 1.0015704410653984e-05, + "loss": 0.0116, + "step": 16448 + }, + { + "epoch": 6.689304595363969, + "grad_norm": 2.3508313342543294, + "learning_rate": 1.0014691223623918e-05, + "loss": 0.0375, + "step": 16449 + }, + { + "epoch": 6.689711264741765, + "grad_norm": 7.541389570669257, + "learning_rate": 1.0013678036443042e-05, + "loss": 0.3614, + "step": 16450 + }, + { + "epoch": 6.690117934119561, + "grad_norm": 8.541899216886168, + "learning_rate": 1.0012664849121754e-05, + "loss": 0.3413, + "step": 16451 + }, + { + "epoch": 6.690524603497357, + "grad_norm": 0.09590235635210681, + "learning_rate": 1.0011651661670456e-05, + "loss": 0.0024, + "step": 16452 + }, + { + "epoch": 6.690931272875153, + "grad_norm": 5.706046263805898, + "learning_rate": 1.0010638474099544e-05, + "loss": 0.2686, + "step": 16453 + }, + { + "epoch": 6.691337942252948, + "grad_norm": 5.03330817795744, + "learning_rate": 1.0009625286419427e-05, + "loss": 0.1131, + "step": 16454 + }, + { + "epoch": 6.691744611630744, + "grad_norm": 0.05381879612401651, + "learning_rate": 1.00086120986405e-05, + "loss": 0.0007, + "step": 16455 + }, + { + "epoch": 6.69215128100854, + "grad_norm": 7.918247664537584, + "learning_rate": 1.0007598910773163e-05, + "loss": 0.193, + "step": 16456 + }, + { + "epoch": 6.692557950386336, + "grad_norm": 11.4352365485873, + "learning_rate": 1.0006585722827821e-05, + "loss": 0.3547, + "step": 16457 + }, + { + "epoch": 6.6929646197641315, + "grad_norm": 0.8305537422469945, + "learning_rate": 1.0005572534814876e-05, + "loss": 0.0164, + "step": 16458 + }, + { + "epoch": 6.693371289141927, + "grad_norm": 2.775989575767532, + "learning_rate": 1.0004559346744723e-05, + "loss": 0.0666, + "step": 16459 + }, + { + "epoch": 6.693777958519723, + "grad_norm": 0.8573202136927994, + "learning_rate": 1.0003546158627766e-05, + "loss": 0.0176, + "step": 16460 + }, + { + "epoch": 6.69418462789752, + "grad_norm": 3.4170663713272234, + "learning_rate": 1.0002532970474408e-05, + "loss": 0.0766, + "step": 16461 + }, + { + "epoch": 6.694591297275315, + "grad_norm": 0.44042541656199574, + "learning_rate": 1.0001519782295043e-05, + "loss": 0.0075, + "step": 16462 + }, + { + "epoch": 6.694997966653111, + "grad_norm": 6.6335701150308015, + "learning_rate": 1.0000506594100083e-05, + "loss": 0.1526, + "step": 16463 + }, + { + "epoch": 6.695404636030907, + "grad_norm": 10.687778331767168, + "learning_rate": 9.999493405899922e-06, + "loss": 0.3303, + "step": 16464 + }, + { + "epoch": 6.695811305408703, + "grad_norm": 5.3091697220764384, + "learning_rate": 9.998480217704958e-06, + "loss": 0.1154, + "step": 16465 + }, + { + "epoch": 6.6962179747864985, + "grad_norm": 2.9721382980423026, + "learning_rate": 9.997467029525597e-06, + "loss": 0.0689, + "step": 16466 + }, + { + "epoch": 6.696624644164294, + "grad_norm": 0.11189491064176423, + "learning_rate": 9.99645384137224e-06, + "loss": 0.0014, + "step": 16467 + }, + { + "epoch": 6.69703131354209, + "grad_norm": 0.6197112640859274, + "learning_rate": 9.99544065325528e-06, + "loss": 0.0092, + "step": 16468 + }, + { + "epoch": 6.697437982919886, + "grad_norm": 5.487524532266537, + "learning_rate": 9.994427465185129e-06, + "loss": 0.1132, + "step": 16469 + }, + { + "epoch": 6.697844652297682, + "grad_norm": 1.7464573696104184, + "learning_rate": 9.993414277172184e-06, + "loss": 0.0506, + "step": 16470 + }, + { + "epoch": 6.698251321675478, + "grad_norm": 2.821086223965169, + "learning_rate": 9.992401089226839e-06, + "loss": 0.0318, + 
"step": 16471 + }, + { + "epoch": 6.698657991053274, + "grad_norm": 5.987825525094674, + "learning_rate": 9.991387901359506e-06, + "loss": 0.1406, + "step": 16472 + }, + { + "epoch": 6.69906466043107, + "grad_norm": 4.65105315975846, + "learning_rate": 9.99037471358058e-06, + "loss": 0.0835, + "step": 16473 + }, + { + "epoch": 6.6994713298088655, + "grad_norm": 1.8629667430406915, + "learning_rate": 9.989361525900461e-06, + "loss": 0.0333, + "step": 16474 + }, + { + "epoch": 6.699877999186661, + "grad_norm": 4.007011259237921, + "learning_rate": 9.98834833832955e-06, + "loss": 0.1207, + "step": 16475 + }, + { + "epoch": 6.700284668564457, + "grad_norm": 4.7480539442870455, + "learning_rate": 9.987335150878251e-06, + "loss": 0.1237, + "step": 16476 + }, + { + "epoch": 6.700691337942253, + "grad_norm": 3.230663892082457, + "learning_rate": 9.986321963556963e-06, + "loss": 0.0807, + "step": 16477 + }, + { + "epoch": 6.701098007320049, + "grad_norm": 5.610407726103833, + "learning_rate": 9.985308776376086e-06, + "loss": 0.1228, + "step": 16478 + }, + { + "epoch": 6.701504676697844, + "grad_norm": 10.988217417371894, + "learning_rate": 9.984295589346023e-06, + "loss": 0.6617, + "step": 16479 + }, + { + "epoch": 6.70191134607564, + "grad_norm": 4.570201602786517, + "learning_rate": 9.983282402477172e-06, + "loss": 0.1328, + "step": 16480 + }, + { + "epoch": 6.702318015453436, + "grad_norm": 14.875172013001457, + "learning_rate": 9.982269215779937e-06, + "loss": 0.2986, + "step": 16481 + }, + { + "epoch": 6.7027246848312325, + "grad_norm": 1.6003942280261614, + "learning_rate": 9.981256029264716e-06, + "loss": 0.0365, + "step": 16482 + }, + { + "epoch": 6.703131354209028, + "grad_norm": 3.869158792669709, + "learning_rate": 9.980242842941913e-06, + "loss": 0.0494, + "step": 16483 + }, + { + "epoch": 6.703538023586824, + "grad_norm": 0.9671819656429471, + "learning_rate": 9.979229656821928e-06, + "loss": 0.0149, + "step": 16484 + }, + { + "epoch": 6.70394469296462, + "grad_norm": 13.892758387985301, + "learning_rate": 9.978216470915158e-06, + "loss": 0.3332, + "step": 16485 + }, + { + "epoch": 6.704351362342416, + "grad_norm": 4.075251109435765, + "learning_rate": 9.977203285232007e-06, + "loss": 0.0752, + "step": 16486 + }, + { + "epoch": 6.704758031720211, + "grad_norm": 0.027394074957043144, + "learning_rate": 9.976190099782879e-06, + "loss": 0.0004, + "step": 16487 + }, + { + "epoch": 6.705164701098007, + "grad_norm": 3.869154125654149, + "learning_rate": 9.975176914578168e-06, + "loss": 0.0376, + "step": 16488 + }, + { + "epoch": 6.705571370475803, + "grad_norm": 7.858868517029498, + "learning_rate": 9.974163729628281e-06, + "loss": 0.3283, + "step": 16489 + }, + { + "epoch": 6.705978039853599, + "grad_norm": 4.492345170643717, + "learning_rate": 9.973150544943617e-06, + "loss": 0.1774, + "step": 16490 + }, + { + "epoch": 6.706384709231395, + "grad_norm": 5.441640514488214, + "learning_rate": 9.972137360534572e-06, + "loss": 0.1077, + "step": 16491 + }, + { + "epoch": 6.706791378609191, + "grad_norm": 0.4118240506295141, + "learning_rate": 9.971124176411554e-06, + "loss": 0.0072, + "step": 16492 + }, + { + "epoch": 6.707198047986987, + "grad_norm": 0.17852221071658056, + "learning_rate": 9.970110992584961e-06, + "loss": 0.003, + "step": 16493 + }, + { + "epoch": 6.707604717364783, + "grad_norm": 2.4857233856989063, + "learning_rate": 9.969097809065193e-06, + "loss": 0.0551, + "step": 16494 + }, + { + "epoch": 6.708011386742578, + "grad_norm": 0.17074274204313716, + "learning_rate": 
9.968084625862651e-06, + "loss": 0.0024, + "step": 16495 + }, + { + "epoch": 6.708418056120374, + "grad_norm": 3.753245010400722, + "learning_rate": 9.967071442987739e-06, + "loss": 0.0291, + "step": 16496 + }, + { + "epoch": 6.70882472549817, + "grad_norm": 1.244879714441483, + "learning_rate": 9.966058260450854e-06, + "loss": 0.0215, + "step": 16497 + }, + { + "epoch": 6.709231394875966, + "grad_norm": 5.5362977998198115, + "learning_rate": 9.965045078262395e-06, + "loss": 0.0773, + "step": 16498 + }, + { + "epoch": 6.709638064253761, + "grad_norm": 1.0961712470303717, + "learning_rate": 9.96403189643277e-06, + "loss": 0.018, + "step": 16499 + }, + { + "epoch": 6.710044733631557, + "grad_norm": 1.4130481004685018, + "learning_rate": 9.963018714972374e-06, + "loss": 0.033, + "step": 16500 + }, + { + "epoch": 6.710451403009353, + "grad_norm": 0.12235559450868387, + "learning_rate": 9.962005533891608e-06, + "loss": 0.0026, + "step": 16501 + }, + { + "epoch": 6.71085807238715, + "grad_norm": 0.023185383782084342, + "learning_rate": 9.960992353200877e-06, + "loss": 0.0004, + "step": 16502 + }, + { + "epoch": 6.711264741764945, + "grad_norm": 10.956796956489796, + "learning_rate": 9.959979172910577e-06, + "loss": 0.4513, + "step": 16503 + }, + { + "epoch": 6.711671411142741, + "grad_norm": 2.288333702043834, + "learning_rate": 9.95896599303111e-06, + "loss": 0.023, + "step": 16504 + }, + { + "epoch": 6.712078080520537, + "grad_norm": 9.484021105008363, + "learning_rate": 9.957952813572881e-06, + "loss": 0.4159, + "step": 16505 + }, + { + "epoch": 6.712484749898333, + "grad_norm": 3.6712488758988204, + "learning_rate": 9.956939634546287e-06, + "loss": 0.0691, + "step": 16506 + }, + { + "epoch": 6.712891419276128, + "grad_norm": 11.518802783100508, + "learning_rate": 9.955926455961727e-06, + "loss": 0.4371, + "step": 16507 + }, + { + "epoch": 6.713298088653924, + "grad_norm": 14.57493298778538, + "learning_rate": 9.954913277829606e-06, + "loss": 0.4601, + "step": 16508 + }, + { + "epoch": 6.71370475803172, + "grad_norm": 4.495072992361579, + "learning_rate": 9.953900100160323e-06, + "loss": 0.0524, + "step": 16509 + }, + { + "epoch": 6.714111427409516, + "grad_norm": 0.21401485416447302, + "learning_rate": 9.952886922964278e-06, + "loss": 0.0041, + "step": 16510 + }, + { + "epoch": 6.714518096787312, + "grad_norm": 10.007330266239212, + "learning_rate": 9.951873746251873e-06, + "loss": 0.3026, + "step": 16511 + }, + { + "epoch": 6.714924766165108, + "grad_norm": 4.3423780084627035, + "learning_rate": 9.950860570033508e-06, + "loss": 0.1827, + "step": 16512 + }, + { + "epoch": 6.715331435542904, + "grad_norm": 7.722437896859123, + "learning_rate": 9.949847394319584e-06, + "loss": 0.3269, + "step": 16513 + }, + { + "epoch": 6.7157381049207, + "grad_norm": 5.06425392424448, + "learning_rate": 9.9488342191205e-06, + "loss": 0.0941, + "step": 16514 + }, + { + "epoch": 6.716144774298495, + "grad_norm": 8.999198221381453, + "learning_rate": 9.94782104444666e-06, + "loss": 0.0764, + "step": 16515 + }, + { + "epoch": 6.716551443676291, + "grad_norm": 5.333375530684063, + "learning_rate": 9.946807870308464e-06, + "loss": 0.1393, + "step": 16516 + }, + { + "epoch": 6.716958113054087, + "grad_norm": 4.917931843650206, + "learning_rate": 9.94579469671631e-06, + "loss": 0.1062, + "step": 16517 + }, + { + "epoch": 6.717364782431883, + "grad_norm": 10.886270073773515, + "learning_rate": 9.944781523680603e-06, + "loss": 0.4127, + "step": 16518 + }, + { + "epoch": 6.7177714518096785, + "grad_norm": 
3.0124799363528063, + "learning_rate": 9.943768351211741e-06, + "loss": 0.0895, + "step": 16519 + }, + { + "epoch": 6.718178121187474, + "grad_norm": 4.901082317235314, + "learning_rate": 9.942755179320123e-06, + "loss": 0.1151, + "step": 16520 + }, + { + "epoch": 6.71858479056527, + "grad_norm": 0.3961968395235771, + "learning_rate": 9.941742008016154e-06, + "loss": 0.0054, + "step": 16521 + }, + { + "epoch": 6.718991459943066, + "grad_norm": 1.4025260011291956, + "learning_rate": 9.940728837310233e-06, + "loss": 0.0277, + "step": 16522 + }, + { + "epoch": 6.7193981293208624, + "grad_norm": 0.15371126654042508, + "learning_rate": 9.939715667212759e-06, + "loss": 0.0028, + "step": 16523 + }, + { + "epoch": 6.719804798698658, + "grad_norm": 7.251041958863997, + "learning_rate": 9.938702497734135e-06, + "loss": 0.1249, + "step": 16524 + }, + { + "epoch": 6.720211468076454, + "grad_norm": 7.088288217069392, + "learning_rate": 9.93768932888476e-06, + "loss": 0.2559, + "step": 16525 + }, + { + "epoch": 6.72061813745425, + "grad_norm": 3.014781148558651, + "learning_rate": 9.936676160675035e-06, + "loss": 0.0748, + "step": 16526 + }, + { + "epoch": 6.7210248068320455, + "grad_norm": 4.150861888272193, + "learning_rate": 9.935662993115361e-06, + "loss": 0.1447, + "step": 16527 + }, + { + "epoch": 6.721431476209841, + "grad_norm": 2.170910510852427, + "learning_rate": 9.934649826216139e-06, + "loss": 0.0517, + "step": 16528 + }, + { + "epoch": 6.721838145587637, + "grad_norm": 2.453824829072599, + "learning_rate": 9.93363665998777e-06, + "loss": 0.1046, + "step": 16529 + }, + { + "epoch": 6.722244814965433, + "grad_norm": 6.085741044434347, + "learning_rate": 9.932623494440653e-06, + "loss": 0.1871, + "step": 16530 + }, + { + "epoch": 6.722651484343229, + "grad_norm": 0.04022774481231696, + "learning_rate": 9.931610329585193e-06, + "loss": 0.0007, + "step": 16531 + }, + { + "epoch": 6.723058153721025, + "grad_norm": 1.9076867565897409, + "learning_rate": 9.930597165431785e-06, + "loss": 0.0553, + "step": 16532 + }, + { + "epoch": 6.723464823098821, + "grad_norm": 0.12571117512539484, + "learning_rate": 9.929584001990829e-06, + "loss": 0.0015, + "step": 16533 + }, + { + "epoch": 6.723871492476617, + "grad_norm": 1.5346066708145976, + "learning_rate": 9.928570839272733e-06, + "loss": 0.0611, + "step": 16534 + }, + { + "epoch": 6.7242781618544125, + "grad_norm": 1.449580135016188, + "learning_rate": 9.927557677287892e-06, + "loss": 0.0192, + "step": 16535 + }, + { + "epoch": 6.724684831232208, + "grad_norm": 9.857102292649065, + "learning_rate": 9.92654451604671e-06, + "loss": 0.3447, + "step": 16536 + }, + { + "epoch": 6.725091500610004, + "grad_norm": 4.22874695363176, + "learning_rate": 9.925531355559581e-06, + "loss": 0.0754, + "step": 16537 + }, + { + "epoch": 6.7254981699878, + "grad_norm": 0.6682078205414737, + "learning_rate": 9.924518195836913e-06, + "loss": 0.0128, + "step": 16538 + }, + { + "epoch": 6.725904839365596, + "grad_norm": 2.1751917609733145, + "learning_rate": 9.923505036889104e-06, + "loss": 0.0416, + "step": 16539 + }, + { + "epoch": 6.726311508743391, + "grad_norm": 12.545178400820292, + "learning_rate": 9.92249187872655e-06, + "loss": 0.5526, + "step": 16540 + }, + { + "epoch": 6.726718178121187, + "grad_norm": 0.11882589059253064, + "learning_rate": 9.921478721359661e-06, + "loss": 0.0018, + "step": 16541 + }, + { + "epoch": 6.727124847498983, + "grad_norm": 6.764744830198528, + "learning_rate": 9.920465564798832e-06, + "loss": 0.1696, + "step": 16542 + }, + { + "epoch": 
6.7275315168767795, + "grad_norm": 0.07368203869865753, + "learning_rate": 9.919452409054461e-06, + "loss": 0.0011, + "step": 16543 + }, + { + "epoch": 6.727938186254575, + "grad_norm": 1.5057598470582314, + "learning_rate": 9.918439254136955e-06, + "loss": 0.0264, + "step": 16544 + }, + { + "epoch": 6.728344855632371, + "grad_norm": 0.1193130225893308, + "learning_rate": 9.91742610005671e-06, + "loss": 0.0023, + "step": 16545 + }, + { + "epoch": 6.728751525010167, + "grad_norm": 4.014582246250044, + "learning_rate": 9.916412946824125e-06, + "loss": 0.0836, + "step": 16546 + }, + { + "epoch": 6.729158194387963, + "grad_norm": 3.889481883322458, + "learning_rate": 9.915399794449607e-06, + "loss": 0.0635, + "step": 16547 + }, + { + "epoch": 6.729564863765758, + "grad_norm": 9.83260927141842, + "learning_rate": 9.914386642943551e-06, + "loss": 0.3863, + "step": 16548 + }, + { + "epoch": 6.729971533143554, + "grad_norm": 6.89196477000209, + "learning_rate": 9.913373492316359e-06, + "loss": 0.1368, + "step": 16549 + }, + { + "epoch": 6.73037820252135, + "grad_norm": 4.690567866280966, + "learning_rate": 9.91236034257843e-06, + "loss": 0.0943, + "step": 16550 + }, + { + "epoch": 6.730784871899146, + "grad_norm": 5.502399897141654, + "learning_rate": 9.91134719374017e-06, + "loss": 0.1337, + "step": 16551 + }, + { + "epoch": 6.731191541276942, + "grad_norm": 2.8806837266578595, + "learning_rate": 9.910334045811974e-06, + "loss": 0.0581, + "step": 16552 + }, + { + "epoch": 6.731598210654738, + "grad_norm": 2.537931391940459, + "learning_rate": 9.909320898804243e-06, + "loss": 0.059, + "step": 16553 + }, + { + "epoch": 6.732004880032534, + "grad_norm": 0.5234737466049842, + "learning_rate": 9.90830775272738e-06, + "loss": 0.0067, + "step": 16554 + }, + { + "epoch": 6.73241154941033, + "grad_norm": 0.3195624284928502, + "learning_rate": 9.907294607591783e-06, + "loss": 0.0046, + "step": 16555 + }, + { + "epoch": 6.732818218788125, + "grad_norm": 3.8893977536256643, + "learning_rate": 9.906281463407853e-06, + "loss": 0.0782, + "step": 16556 + }, + { + "epoch": 6.733224888165921, + "grad_norm": 0.36671701065207307, + "learning_rate": 9.905268320185992e-06, + "loss": 0.0064, + "step": 16557 + }, + { + "epoch": 6.733631557543717, + "grad_norm": 1.2402678084607295, + "learning_rate": 9.9042551779366e-06, + "loss": 0.0254, + "step": 16558 + }, + { + "epoch": 6.734038226921513, + "grad_norm": 1.109793853329421, + "learning_rate": 9.903242036670078e-06, + "loss": 0.0155, + "step": 16559 + }, + { + "epoch": 6.734444896299308, + "grad_norm": 1.1782020166826683, + "learning_rate": 9.902228896396821e-06, + "loss": 0.0206, + "step": 16560 + }, + { + "epoch": 6.734851565677104, + "grad_norm": 0.33095724532914045, + "learning_rate": 9.901215757127238e-06, + "loss": 0.0044, + "step": 16561 + }, + { + "epoch": 6.7352582350549, + "grad_norm": 2.7312341303007655, + "learning_rate": 9.900202618871722e-06, + "loss": 0.0432, + "step": 16562 + }, + { + "epoch": 6.735664904432696, + "grad_norm": 1.684547341715338, + "learning_rate": 9.899189481640676e-06, + "loss": 0.0153, + "step": 16563 + }, + { + "epoch": 6.736071573810492, + "grad_norm": 3.455048732001791, + "learning_rate": 9.898176345444507e-06, + "loss": 0.0763, + "step": 16564 + }, + { + "epoch": 6.736478243188288, + "grad_norm": 8.127408963038697, + "learning_rate": 9.897163210293603e-06, + "loss": 0.1587, + "step": 16565 + }, + { + "epoch": 6.736884912566084, + "grad_norm": 0.7485772543750141, + "learning_rate": 9.896150076198373e-06, + "loss": 0.0114, + "step": 
16566 + }, + { + "epoch": 6.73729158194388, + "grad_norm": 1.4481841034376226, + "learning_rate": 9.895136943169213e-06, + "loss": 0.0166, + "step": 16567 + }, + { + "epoch": 6.737698251321675, + "grad_norm": 4.932712690988026, + "learning_rate": 9.894123811216524e-06, + "loss": 0.186, + "step": 16568 + }, + { + "epoch": 6.738104920699471, + "grad_norm": 0.14086511690537115, + "learning_rate": 9.893110680350708e-06, + "loss": 0.0017, + "step": 16569 + }, + { + "epoch": 6.738511590077267, + "grad_norm": 6.421568585441822, + "learning_rate": 9.892097550582166e-06, + "loss": 0.2112, + "step": 16570 + }, + { + "epoch": 6.738918259455063, + "grad_norm": 0.004945053013817612, + "learning_rate": 9.891084421921296e-06, + "loss": 0.0001, + "step": 16571 + }, + { + "epoch": 6.739324928832859, + "grad_norm": 0.9641673521917544, + "learning_rate": 9.890071294378499e-06, + "loss": 0.0216, + "step": 16572 + }, + { + "epoch": 6.739731598210655, + "grad_norm": 7.238662770530974, + "learning_rate": 9.889058167964175e-06, + "loss": 0.2481, + "step": 16573 + }, + { + "epoch": 6.740138267588451, + "grad_norm": 4.9664521914349935, + "learning_rate": 9.888045042688725e-06, + "loss": 0.099, + "step": 16574 + }, + { + "epoch": 6.740544936966247, + "grad_norm": 8.042317574362409, + "learning_rate": 9.88703191856255e-06, + "loss": 0.155, + "step": 16575 + }, + { + "epoch": 6.7409516063440424, + "grad_norm": 0.24323616262919406, + "learning_rate": 9.88601879559605e-06, + "loss": 0.0035, + "step": 16576 + }, + { + "epoch": 6.741358275721838, + "grad_norm": 9.96508145992764, + "learning_rate": 9.885005673799625e-06, + "loss": 0.2598, + "step": 16577 + }, + { + "epoch": 6.741764945099634, + "grad_norm": 7.762636369978596, + "learning_rate": 9.88399255318367e-06, + "loss": 0.2364, + "step": 16578 + }, + { + "epoch": 6.74217161447743, + "grad_norm": 15.620623204440987, + "learning_rate": 9.882979433758594e-06, + "loss": 0.2658, + "step": 16579 + }, + { + "epoch": 6.7425782838552255, + "grad_norm": 0.42771383825522885, + "learning_rate": 9.881966315534792e-06, + "loss": 0.0055, + "step": 16580 + }, + { + "epoch": 6.742984953233021, + "grad_norm": 3.2664952884515803, + "learning_rate": 9.880953198522663e-06, + "loss": 0.0537, + "step": 16581 + }, + { + "epoch": 6.743391622610817, + "grad_norm": 6.349601986806556, + "learning_rate": 9.879940082732613e-06, + "loss": 0.1502, + "step": 16582 + }, + { + "epoch": 6.743798291988613, + "grad_norm": 5.05024491557142, + "learning_rate": 9.878926968175036e-06, + "loss": 0.2065, + "step": 16583 + }, + { + "epoch": 6.7442049613664095, + "grad_norm": 15.43364324298667, + "learning_rate": 9.877913854860334e-06, + "loss": 0.5477, + "step": 16584 + }, + { + "epoch": 6.744611630744205, + "grad_norm": 6.693086936921477, + "learning_rate": 9.876900742798911e-06, + "loss": 0.1893, + "step": 16585 + }, + { + "epoch": 6.745018300122001, + "grad_norm": 0.09851826945196052, + "learning_rate": 9.875887632001163e-06, + "loss": 0.002, + "step": 16586 + }, + { + "epoch": 6.745424969499797, + "grad_norm": 8.07863088036952, + "learning_rate": 9.874874522477488e-06, + "loss": 0.2821, + "step": 16587 + }, + { + "epoch": 6.7458316388775925, + "grad_norm": 2.2095946063192393, + "learning_rate": 9.87386141423829e-06, + "loss": 0.0292, + "step": 16588 + }, + { + "epoch": 6.746238308255388, + "grad_norm": 11.898687759073173, + "learning_rate": 9.872848307293971e-06, + "loss": 0.5056, + "step": 16589 + }, + { + "epoch": 6.746644977633184, + "grad_norm": 0.7151313314253829, + "learning_rate": 
9.871835201654926e-06, + "loss": 0.0111, + "step": 16590 + }, + { + "epoch": 6.74705164701098, + "grad_norm": 5.105993856607518, + "learning_rate": 9.870822097331555e-06, + "loss": 0.0675, + "step": 16591 + }, + { + "epoch": 6.747458316388776, + "grad_norm": 1.255331106586056, + "learning_rate": 9.869808994334264e-06, + "loss": 0.0329, + "step": 16592 + }, + { + "epoch": 6.747864985766572, + "grad_norm": 3.5791208206845204, + "learning_rate": 9.868795892673446e-06, + "loss": 0.0701, + "step": 16593 + }, + { + "epoch": 6.748271655144368, + "grad_norm": 6.030379597891595, + "learning_rate": 9.867782792359506e-06, + "loss": 0.1324, + "step": 16594 + }, + { + "epoch": 6.748678324522164, + "grad_norm": 9.305618231575746, + "learning_rate": 9.866769693402843e-06, + "loss": 0.21, + "step": 16595 + }, + { + "epoch": 6.7490849938999595, + "grad_norm": 0.3502558842100438, + "learning_rate": 9.865756595813855e-06, + "loss": 0.0052, + "step": 16596 + }, + { + "epoch": 6.749491663277755, + "grad_norm": 6.311833904196598, + "learning_rate": 9.864743499602943e-06, + "loss": 0.0849, + "step": 16597 + }, + { + "epoch": 6.749898332655551, + "grad_norm": 17.0770413629946, + "learning_rate": 9.863730404780507e-06, + "loss": 0.4861, + "step": 16598 + }, + { + "epoch": 6.750305002033347, + "grad_norm": 8.549697130442803, + "learning_rate": 9.862717311356948e-06, + "loss": 0.2769, + "step": 16599 + }, + { + "epoch": 6.750711671411143, + "grad_norm": 3.092050236621575, + "learning_rate": 9.861704219342663e-06, + "loss": 0.0336, + "step": 16600 + }, + { + "epoch": 6.751118340788938, + "grad_norm": 13.140341958661566, + "learning_rate": 9.860691128748055e-06, + "loss": 0.4043, + "step": 16601 + }, + { + "epoch": 6.751525010166734, + "grad_norm": 5.303921486821711, + "learning_rate": 9.859678039583523e-06, + "loss": 0.1787, + "step": 16602 + }, + { + "epoch": 6.75193167954453, + "grad_norm": 7.037165923517375, + "learning_rate": 9.858664951859467e-06, + "loss": 0.2111, + "step": 16603 + }, + { + "epoch": 6.752338348922326, + "grad_norm": 2.796239728812827, + "learning_rate": 9.857651865586284e-06, + "loss": 0.0619, + "step": 16604 + }, + { + "epoch": 6.752745018300122, + "grad_norm": 7.172835093713685, + "learning_rate": 9.856638780774378e-06, + "loss": 0.2005, + "step": 16605 + }, + { + "epoch": 6.753151687677918, + "grad_norm": 4.223868080488358, + "learning_rate": 9.855625697434146e-06, + "loss": 0.0955, + "step": 16606 + }, + { + "epoch": 6.753558357055714, + "grad_norm": 3.902186325737199, + "learning_rate": 9.854612615575989e-06, + "loss": 0.1834, + "step": 16607 + }, + { + "epoch": 6.75396502643351, + "grad_norm": 13.11254496656823, + "learning_rate": 9.85359953521031e-06, + "loss": 0.2322, + "step": 16608 + }, + { + "epoch": 6.754371695811305, + "grad_norm": 2.7866717732167303, + "learning_rate": 9.852586456347502e-06, + "loss": 0.0421, + "step": 16609 + }, + { + "epoch": 6.754778365189101, + "grad_norm": 1.6821714319741825, + "learning_rate": 9.851573378997967e-06, + "loss": 0.0231, + "step": 16610 + }, + { + "epoch": 6.755185034566897, + "grad_norm": 0.4939621426183518, + "learning_rate": 9.850560303172108e-06, + "loss": 0.0089, + "step": 16611 + }, + { + "epoch": 6.755591703944693, + "grad_norm": 0.0061221202332406525, + "learning_rate": 9.849547228880325e-06, + "loss": 0.0001, + "step": 16612 + }, + { + "epoch": 6.755998373322489, + "grad_norm": 1.7696124377433853, + "learning_rate": 9.848534156133013e-06, + "loss": 0.0227, + "step": 16613 + }, + { + "epoch": 6.756405042700285, + "grad_norm": 
4.119901223262933, + "learning_rate": 9.847521084940573e-06, + "loss": 0.0946, + "step": 16614 + }, + { + "epoch": 6.756811712078081, + "grad_norm": 2.8185182340019304, + "learning_rate": 9.846508015313407e-06, + "loss": 0.0589, + "step": 16615 + }, + { + "epoch": 6.757218381455877, + "grad_norm": 6.727268366379673, + "learning_rate": 9.845494947261915e-06, + "loss": 0.1595, + "step": 16616 + }, + { + "epoch": 6.757625050833672, + "grad_norm": 4.344483437394669, + "learning_rate": 9.844481880796492e-06, + "loss": 0.0753, + "step": 16617 + }, + { + "epoch": 6.758031720211468, + "grad_norm": 9.466676580921272, + "learning_rate": 9.843468815927543e-06, + "loss": 0.2247, + "step": 16618 + }, + { + "epoch": 6.758438389589264, + "grad_norm": 5.117826635635765, + "learning_rate": 9.842455752665465e-06, + "loss": 0.4435, + "step": 16619 + }, + { + "epoch": 6.75884505896706, + "grad_norm": 3.1751199174612794, + "learning_rate": 9.841442691020656e-06, + "loss": 0.0578, + "step": 16620 + }, + { + "epoch": 6.759251728344855, + "grad_norm": 4.780859006383804, + "learning_rate": 9.84042963100352e-06, + "loss": 0.1759, + "step": 16621 + }, + { + "epoch": 6.759658397722651, + "grad_norm": 3.775893282989358, + "learning_rate": 9.839416572624455e-06, + "loss": 0.0245, + "step": 16622 + }, + { + "epoch": 6.760065067100447, + "grad_norm": 0.7523815221244651, + "learning_rate": 9.838403515893857e-06, + "loss": 0.0139, + "step": 16623 + }, + { + "epoch": 6.760471736478243, + "grad_norm": 1.1120020290936503, + "learning_rate": 9.83739046082213e-06, + "loss": 0.0138, + "step": 16624 + }, + { + "epoch": 6.760878405856039, + "grad_norm": 4.5377748560456235, + "learning_rate": 9.836377407419671e-06, + "loss": 0.088, + "step": 16625 + }, + { + "epoch": 6.761285075233835, + "grad_norm": 0.06798881902880746, + "learning_rate": 9.835364355696882e-06, + "loss": 0.0012, + "step": 16626 + }, + { + "epoch": 6.761691744611631, + "grad_norm": 1.3706591778953623, + "learning_rate": 9.834351305664158e-06, + "loss": 0.0189, + "step": 16627 + }, + { + "epoch": 6.762098413989427, + "grad_norm": 9.54665872565737, + "learning_rate": 9.833338257331903e-06, + "loss": 0.2033, + "step": 16628 + }, + { + "epoch": 6.7625050833672224, + "grad_norm": 2.905144421483677, + "learning_rate": 9.832325210710514e-06, + "loss": 0.0178, + "step": 16629 + }, + { + "epoch": 6.762911752745018, + "grad_norm": 7.232742843670564, + "learning_rate": 9.831312165810389e-06, + "loss": 0.2707, + "step": 16630 + }, + { + "epoch": 6.763318422122814, + "grad_norm": 0.18340653501458778, + "learning_rate": 9.830299122641933e-06, + "loss": 0.0022, + "step": 16631 + }, + { + "epoch": 6.76372509150061, + "grad_norm": 5.2664451409666535, + "learning_rate": 9.829286081215541e-06, + "loss": 0.0977, + "step": 16632 + }, + { + "epoch": 6.7641317608784055, + "grad_norm": 8.35656126119548, + "learning_rate": 9.828273041541611e-06, + "loss": 0.3064, + "step": 16633 + }, + { + "epoch": 6.764538430256202, + "grad_norm": 0.3658747064356377, + "learning_rate": 9.827260003630548e-06, + "loss": 0.0065, + "step": 16634 + }, + { + "epoch": 6.764945099633998, + "grad_norm": 5.040724967629764, + "learning_rate": 9.826246967492747e-06, + "loss": 0.188, + "step": 16635 + }, + { + "epoch": 6.765351769011794, + "grad_norm": 12.418490823365197, + "learning_rate": 9.825233933138607e-06, + "loss": 0.3469, + "step": 16636 + }, + { + "epoch": 6.7657584383895895, + "grad_norm": 4.398206581997907, + "learning_rate": 9.82422090057853e-06, + "loss": 0.0906, + "step": 16637 + }, + { + "epoch": 
6.766165107767385, + "grad_norm": 3.2677303060446588, + "learning_rate": 9.823207869822914e-06, + "loss": 0.0615, + "step": 16638 + }, + { + "epoch": 6.766571777145181, + "grad_norm": 5.278963357359084, + "learning_rate": 9.822194840882157e-06, + "loss": 0.1311, + "step": 16639 + }, + { + "epoch": 6.766978446522977, + "grad_norm": 7.797423953369058, + "learning_rate": 9.82118181376666e-06, + "loss": 0.2065, + "step": 16640 + }, + { + "epoch": 6.7673851159007725, + "grad_norm": 7.379751460203865, + "learning_rate": 9.820168788486823e-06, + "loss": 0.4602, + "step": 16641 + }, + { + "epoch": 6.767791785278568, + "grad_norm": 0.8360345039948854, + "learning_rate": 9.819155765053042e-06, + "loss": 0.0123, + "step": 16642 + }, + { + "epoch": 6.768198454656364, + "grad_norm": 2.0519850727932383, + "learning_rate": 9.818142743475719e-06, + "loss": 0.0445, + "step": 16643 + }, + { + "epoch": 6.76860512403416, + "grad_norm": 5.869248669434353, + "learning_rate": 9.817129723765253e-06, + "loss": 0.0849, + "step": 16644 + }, + { + "epoch": 6.7690117934119565, + "grad_norm": 4.3416587799692055, + "learning_rate": 9.81611670593204e-06, + "loss": 0.0759, + "step": 16645 + }, + { + "epoch": 6.769418462789752, + "grad_norm": 5.50595477154496, + "learning_rate": 9.815103689986481e-06, + "loss": 0.1303, + "step": 16646 + }, + { + "epoch": 6.769825132167548, + "grad_norm": 0.07258988956575745, + "learning_rate": 9.814090675938979e-06, + "loss": 0.001, + "step": 16647 + }, + { + "epoch": 6.770231801545344, + "grad_norm": 5.332716695687505, + "learning_rate": 9.813077663799929e-06, + "loss": 0.171, + "step": 16648 + }, + { + "epoch": 6.7706384709231395, + "grad_norm": 0.06957669959027535, + "learning_rate": 9.81206465357973e-06, + "loss": 0.0013, + "step": 16649 + }, + { + "epoch": 6.771045140300935, + "grad_norm": 9.463552912791318, + "learning_rate": 9.811051645288782e-06, + "loss": 0.1708, + "step": 16650 + }, + { + "epoch": 6.771451809678731, + "grad_norm": 11.251656723554989, + "learning_rate": 9.810038638937484e-06, + "loss": 0.204, + "step": 16651 + }, + { + "epoch": 6.771858479056527, + "grad_norm": 2.77741891421241, + "learning_rate": 9.809025634536237e-06, + "loss": 0.0538, + "step": 16652 + }, + { + "epoch": 6.772265148434323, + "grad_norm": 6.646085270633761, + "learning_rate": 9.808012632095434e-06, + "loss": 0.1668, + "step": 16653 + }, + { + "epoch": 6.772671817812119, + "grad_norm": 1.923351363709023, + "learning_rate": 9.80699963162548e-06, + "loss": 0.0317, + "step": 16654 + }, + { + "epoch": 6.773078487189915, + "grad_norm": 9.542373607439064, + "learning_rate": 9.805986633136773e-06, + "loss": 0.3203, + "step": 16655 + }, + { + "epoch": 6.773485156567711, + "grad_norm": 0.46208902784793543, + "learning_rate": 9.80497363663971e-06, + "loss": 0.0086, + "step": 16656 + }, + { + "epoch": 6.7738918259455065, + "grad_norm": 14.795392992431378, + "learning_rate": 9.803960642144692e-06, + "loss": 0.304, + "step": 16657 + }, + { + "epoch": 6.774298495323302, + "grad_norm": 0.4933500490333955, + "learning_rate": 9.802947649662115e-06, + "loss": 0.0107, + "step": 16658 + }, + { + "epoch": 6.774705164701098, + "grad_norm": 9.521956459862219, + "learning_rate": 9.801934659202378e-06, + "loss": 0.4972, + "step": 16659 + }, + { + "epoch": 6.775111834078894, + "grad_norm": 12.16631363959802, + "learning_rate": 9.800921670775884e-06, + "loss": 0.4932, + "step": 16660 + }, + { + "epoch": 6.77551850345669, + "grad_norm": 3.7614293571419712, + "learning_rate": 9.799908684393029e-06, + "loss": 0.0564, + "step": 
16661 + }, + { + "epoch": 6.775925172834485, + "grad_norm": 4.232430475333629, + "learning_rate": 9.79889570006421e-06, + "loss": 0.0857, + "step": 16662 + }, + { + "epoch": 6.776331842212281, + "grad_norm": 0.6325965633374869, + "learning_rate": 9.797882717799835e-06, + "loss": 0.0064, + "step": 16663 + }, + { + "epoch": 6.776738511590077, + "grad_norm": 12.792411315071755, + "learning_rate": 9.796869737610288e-06, + "loss": 0.4079, + "step": 16664 + }, + { + "epoch": 6.777145180967873, + "grad_norm": 0.187928496040034, + "learning_rate": 9.795856759505979e-06, + "loss": 0.003, + "step": 16665 + }, + { + "epoch": 6.777551850345669, + "grad_norm": 0.9265621700020418, + "learning_rate": 9.794843783497301e-06, + "loss": 0.0111, + "step": 16666 + }, + { + "epoch": 6.777958519723465, + "grad_norm": 3.589859824798383, + "learning_rate": 9.793830809594657e-06, + "loss": 0.0175, + "step": 16667 + }, + { + "epoch": 6.778365189101261, + "grad_norm": 3.85451227960407, + "learning_rate": 9.792817837808439e-06, + "loss": 0.0314, + "step": 16668 + }, + { + "epoch": 6.778771858479057, + "grad_norm": 1.0075756780535594, + "learning_rate": 9.791804868149054e-06, + "loss": 0.0176, + "step": 16669 + }, + { + "epoch": 6.779178527856852, + "grad_norm": 2.9375992504202855, + "learning_rate": 9.790791900626897e-06, + "loss": 0.0538, + "step": 16670 + }, + { + "epoch": 6.779585197234648, + "grad_norm": 3.767938173893174, + "learning_rate": 9.789778935252363e-06, + "loss": 0.1451, + "step": 16671 + }, + { + "epoch": 6.779991866612444, + "grad_norm": 0.138129837833183, + "learning_rate": 9.788765972035857e-06, + "loss": 0.0013, + "step": 16672 + }, + { + "epoch": 6.78039853599024, + "grad_norm": 0.6774596757483666, + "learning_rate": 9.787753010987775e-06, + "loss": 0.0125, + "step": 16673 + }, + { + "epoch": 6.780805205368035, + "grad_norm": 1.6321348758876615, + "learning_rate": 9.786740052118513e-06, + "loss": 0.0253, + "step": 16674 + }, + { + "epoch": 6.781211874745832, + "grad_norm": 0.11205567504303002, + "learning_rate": 9.785727095438472e-06, + "loss": 0.0016, + "step": 16675 + }, + { + "epoch": 6.781618544123628, + "grad_norm": 14.78625076253955, + "learning_rate": 9.784714140958052e-06, + "loss": 0.116, + "step": 16676 + }, + { + "epoch": 6.782025213501424, + "grad_norm": 4.820505696469683, + "learning_rate": 9.783701188687648e-06, + "loss": 0.1772, + "step": 16677 + }, + { + "epoch": 6.782431882879219, + "grad_norm": 6.267511461948921, + "learning_rate": 9.782688238637661e-06, + "loss": 0.0741, + "step": 16678 + }, + { + "epoch": 6.782838552257015, + "grad_norm": 12.695233703368745, + "learning_rate": 9.78167529081849e-06, + "loss": 0.2164, + "step": 16679 + }, + { + "epoch": 6.783245221634811, + "grad_norm": 7.95255012181656, + "learning_rate": 9.78066234524053e-06, + "loss": 0.1841, + "step": 16680 + }, + { + "epoch": 6.783651891012607, + "grad_norm": 3.834161268274248, + "learning_rate": 9.77964940191418e-06, + "loss": 0.1278, + "step": 16681 + }, + { + "epoch": 6.7840585603904024, + "grad_norm": 1.3649602925791946, + "learning_rate": 9.778636460849844e-06, + "loss": 0.0164, + "step": 16682 + }, + { + "epoch": 6.784465229768198, + "grad_norm": 3.6735738069757504, + "learning_rate": 9.777623522057914e-06, + "loss": 0.0571, + "step": 16683 + }, + { + "epoch": 6.784871899145994, + "grad_norm": 13.49948402529161, + "learning_rate": 9.776610585548789e-06, + "loss": 0.495, + "step": 16684 + }, + { + "epoch": 6.78527856852379, + "grad_norm": 0.4296030846730657, + "learning_rate": 9.77559765133287e-06, + 
"loss": 0.0046, + "step": 16685 + }, + { + "epoch": 6.785685237901586, + "grad_norm": 7.163813589799296, + "learning_rate": 9.774584719420555e-06, + "loss": 0.1743, + "step": 16686 + }, + { + "epoch": 6.786091907279382, + "grad_norm": 0.6731091865680615, + "learning_rate": 9.773571789822238e-06, + "loss": 0.0121, + "step": 16687 + }, + { + "epoch": 6.786498576657178, + "grad_norm": 4.874935919045526, + "learning_rate": 9.772558862548325e-06, + "loss": 0.0447, + "step": 16688 + }, + { + "epoch": 6.786905246034974, + "grad_norm": 8.43213041233626, + "learning_rate": 9.771545937609209e-06, + "loss": 0.3697, + "step": 16689 + }, + { + "epoch": 6.7873119154127695, + "grad_norm": 0.2061765180395315, + "learning_rate": 9.770533015015285e-06, + "loss": 0.0021, + "step": 16690 + }, + { + "epoch": 6.787718584790565, + "grad_norm": 1.1350321448522582, + "learning_rate": 9.76952009477696e-06, + "loss": 0.0163, + "step": 16691 + }, + { + "epoch": 6.788125254168361, + "grad_norm": 2.1586135694317106, + "learning_rate": 9.768507176904624e-06, + "loss": 0.0502, + "step": 16692 + }, + { + "epoch": 6.788531923546157, + "grad_norm": 1.4851723381059652, + "learning_rate": 9.76749426140868e-06, + "loss": 0.0175, + "step": 16693 + }, + { + "epoch": 6.7889385929239525, + "grad_norm": 2.0965261851235315, + "learning_rate": 9.766481348299521e-06, + "loss": 0.0365, + "step": 16694 + }, + { + "epoch": 6.789345262301749, + "grad_norm": 0.6939961247426526, + "learning_rate": 9.765468437587553e-06, + "loss": 0.0104, + "step": 16695 + }, + { + "epoch": 6.789751931679545, + "grad_norm": 1.2474318663456698, + "learning_rate": 9.764455529283168e-06, + "loss": 0.0189, + "step": 16696 + }, + { + "epoch": 6.790158601057341, + "grad_norm": 8.214795594294188, + "learning_rate": 9.763442623396764e-06, + "loss": 0.3103, + "step": 16697 + }, + { + "epoch": 6.7905652704351365, + "grad_norm": 6.0467054937197595, + "learning_rate": 9.762429719938743e-06, + "loss": 0.1953, + "step": 16698 + }, + { + "epoch": 6.790971939812932, + "grad_norm": 1.1903508055469416, + "learning_rate": 9.761416818919498e-06, + "loss": 0.0115, + "step": 16699 + }, + { + "epoch": 6.791378609190728, + "grad_norm": 7.766923919073816, + "learning_rate": 9.760403920349431e-06, + "loss": 0.2386, + "step": 16700 + }, + { + "epoch": 6.791785278568524, + "grad_norm": 0.6626815690055399, + "learning_rate": 9.759391024238938e-06, + "loss": 0.0117, + "step": 16701 + }, + { + "epoch": 6.7921919479463195, + "grad_norm": 3.4907469570516256, + "learning_rate": 9.758378130598417e-06, + "loss": 0.0441, + "step": 16702 + }, + { + "epoch": 6.792598617324115, + "grad_norm": 0.14466389968563487, + "learning_rate": 9.757365239438266e-06, + "loss": 0.0021, + "step": 16703 + }, + { + "epoch": 6.793005286701911, + "grad_norm": 7.170248599078378, + "learning_rate": 9.756352350768884e-06, + "loss": 0.3845, + "step": 16704 + }, + { + "epoch": 6.793411956079707, + "grad_norm": 3.9296264333227127, + "learning_rate": 9.755339464600668e-06, + "loss": 0.0563, + "step": 16705 + }, + { + "epoch": 6.793818625457503, + "grad_norm": 8.36109212191662, + "learning_rate": 9.754326580944015e-06, + "loss": 0.3054, + "step": 16706 + }, + { + "epoch": 6.794225294835299, + "grad_norm": 6.019347036415764, + "learning_rate": 9.75331369980932e-06, + "loss": 0.3014, + "step": 16707 + }, + { + "epoch": 6.794631964213095, + "grad_norm": 2.638977254542912, + "learning_rate": 9.752300821206989e-06, + "loss": 0.1063, + "step": 16708 + }, + { + "epoch": 6.795038633590891, + "grad_norm": 2.3957412084966148, + 
"learning_rate": 9.751287945147412e-06, + "loss": 0.0416, + "step": 16709 + }, + { + "epoch": 6.7954453029686865, + "grad_norm": 5.588107787938821, + "learning_rate": 9.750275071640988e-06, + "loss": 0.0916, + "step": 16710 + }, + { + "epoch": 6.795851972346482, + "grad_norm": 3.034501364658265, + "learning_rate": 9.74926220069812e-06, + "loss": 0.0609, + "step": 16711 + }, + { + "epoch": 6.796258641724278, + "grad_norm": 4.994415221946811, + "learning_rate": 9.748249332329201e-06, + "loss": 0.0749, + "step": 16712 + }, + { + "epoch": 6.796665311102074, + "grad_norm": 0.014260830694665228, + "learning_rate": 9.747236466544626e-06, + "loss": 0.0003, + "step": 16713 + }, + { + "epoch": 6.79707198047987, + "grad_norm": 9.129298080536314, + "learning_rate": 9.746223603354799e-06, + "loss": 0.2166, + "step": 16714 + }, + { + "epoch": 6.797478649857665, + "grad_norm": 3.851856278095248, + "learning_rate": 9.745210742770114e-06, + "loss": 0.1163, + "step": 16715 + }, + { + "epoch": 6.797885319235462, + "grad_norm": 0.5080762675455841, + "learning_rate": 9.744197884800968e-06, + "loss": 0.0076, + "step": 16716 + }, + { + "epoch": 6.798291988613258, + "grad_norm": 6.288900647168714, + "learning_rate": 9.743185029457759e-06, + "loss": 0.1093, + "step": 16717 + }, + { + "epoch": 6.7986986579910536, + "grad_norm": 0.025126946243202778, + "learning_rate": 9.742172176750887e-06, + "loss": 0.0004, + "step": 16718 + }, + { + "epoch": 6.799105327368849, + "grad_norm": 0.25890133547546185, + "learning_rate": 9.741159326690745e-06, + "loss": 0.0024, + "step": 16719 + }, + { + "epoch": 6.799511996746645, + "grad_norm": 4.106766260353083, + "learning_rate": 9.740146479287733e-06, + "loss": 0.1037, + "step": 16720 + }, + { + "epoch": 6.799918666124441, + "grad_norm": 7.503877648155594, + "learning_rate": 9.73913363455225e-06, + "loss": 0.1217, + "step": 16721 + }, + { + "epoch": 6.800325335502237, + "grad_norm": 10.282321519365214, + "learning_rate": 9.738120792494692e-06, + "loss": 0.4752, + "step": 16722 + }, + { + "epoch": 6.800732004880032, + "grad_norm": 4.417582874496245, + "learning_rate": 9.737107953125451e-06, + "loss": 0.075, + "step": 16723 + }, + { + "epoch": 6.801138674257828, + "grad_norm": 3.5299773198240154, + "learning_rate": 9.736095116454934e-06, + "loss": 0.054, + "step": 16724 + }, + { + "epoch": 6.801545343635624, + "grad_norm": 16.271205394424307, + "learning_rate": 9.735082282493533e-06, + "loss": 1.026, + "step": 16725 + }, + { + "epoch": 6.80195201301342, + "grad_norm": 0.5607321721727355, + "learning_rate": 9.734069451251643e-06, + "loss": 0.0072, + "step": 16726 + }, + { + "epoch": 6.802358682391216, + "grad_norm": 8.847838421813499, + "learning_rate": 9.733056622739668e-06, + "loss": 0.2011, + "step": 16727 + }, + { + "epoch": 6.802765351769012, + "grad_norm": 15.115309658274645, + "learning_rate": 9.732043796967998e-06, + "loss": 0.5537, + "step": 16728 + }, + { + "epoch": 6.803172021146808, + "grad_norm": 4.176742492141079, + "learning_rate": 9.731030973947035e-06, + "loss": 0.0417, + "step": 16729 + }, + { + "epoch": 6.803578690524604, + "grad_norm": 2.5010439916902705, + "learning_rate": 9.730018153687173e-06, + "loss": 0.0257, + "step": 16730 + }, + { + "epoch": 6.803985359902399, + "grad_norm": 10.551309775002501, + "learning_rate": 9.729005336198812e-06, + "loss": 0.2742, + "step": 16731 + }, + { + "epoch": 6.804392029280195, + "grad_norm": 0.15961487582475195, + "learning_rate": 9.727992521492347e-06, + "loss": 0.0019, + "step": 16732 + }, + { + "epoch": 6.804798698657991, + 
"grad_norm": 2.2531664721932776, + "learning_rate": 9.726979709578173e-06, + "loss": 0.0346, + "step": 16733 + }, + { + "epoch": 6.805205368035787, + "grad_norm": 3.4231776370927087, + "learning_rate": 9.725966900466693e-06, + "loss": 0.0899, + "step": 16734 + }, + { + "epoch": 6.8056120374135824, + "grad_norm": 0.42924335994112706, + "learning_rate": 9.7249540941683e-06, + "loss": 0.0073, + "step": 16735 + }, + { + "epoch": 6.806018706791379, + "grad_norm": 7.351949416568511, + "learning_rate": 9.72394129069339e-06, + "loss": 0.1711, + "step": 16736 + }, + { + "epoch": 6.806425376169175, + "grad_norm": 0.5409952957761389, + "learning_rate": 9.722928490052363e-06, + "loss": 0.004, + "step": 16737 + }, + { + "epoch": 6.806832045546971, + "grad_norm": 6.899900887019245, + "learning_rate": 9.721915692255617e-06, + "loss": 0.4844, + "step": 16738 + }, + { + "epoch": 6.807238714924766, + "grad_norm": 1.0739225877718002, + "learning_rate": 9.72090289731354e-06, + "loss": 0.0147, + "step": 16739 + }, + { + "epoch": 6.807645384302562, + "grad_norm": 1.5207314703470305, + "learning_rate": 9.719890105236541e-06, + "loss": 0.0187, + "step": 16740 + }, + { + "epoch": 6.808052053680358, + "grad_norm": 11.821764591584055, + "learning_rate": 9.718877316035011e-06, + "loss": 0.3671, + "step": 16741 + }, + { + "epoch": 6.808458723058154, + "grad_norm": 4.272039215854746, + "learning_rate": 9.717864529719344e-06, + "loss": 0.0683, + "step": 16742 + }, + { + "epoch": 6.8088653924359495, + "grad_norm": 7.372673941458431, + "learning_rate": 9.71685174629994e-06, + "loss": 0.1895, + "step": 16743 + }, + { + "epoch": 6.809272061813745, + "grad_norm": 16.231483585658413, + "learning_rate": 9.715838965787197e-06, + "loss": 0.5351, + "step": 16744 + }, + { + "epoch": 6.809678731191541, + "grad_norm": 4.190647205164876, + "learning_rate": 9.714826188191508e-06, + "loss": 0.1526, + "step": 16745 + }, + { + "epoch": 6.810085400569337, + "grad_norm": 0.8984469836725114, + "learning_rate": 9.713813413523273e-06, + "loss": 0.0089, + "step": 16746 + }, + { + "epoch": 6.8104920699471325, + "grad_norm": 6.275737922768917, + "learning_rate": 9.712800641792886e-06, + "loss": 0.2054, + "step": 16747 + }, + { + "epoch": 6.810898739324929, + "grad_norm": 4.160676030978881, + "learning_rate": 9.711787873010747e-06, + "loss": 0.0563, + "step": 16748 + }, + { + "epoch": 6.811305408702725, + "grad_norm": 0.025839950445537933, + "learning_rate": 9.710775107187248e-06, + "loss": 0.0003, + "step": 16749 + }, + { + "epoch": 6.811712078080521, + "grad_norm": 0.19054532895196855, + "learning_rate": 9.70976234433279e-06, + "loss": 0.003, + "step": 16750 + }, + { + "epoch": 6.8121187474583165, + "grad_norm": 13.853377806880033, + "learning_rate": 9.708749584457768e-06, + "loss": 0.8685, + "step": 16751 + }, + { + "epoch": 6.812525416836112, + "grad_norm": 5.909290250274367, + "learning_rate": 9.707736827572576e-06, + "loss": 0.1133, + "step": 16752 + }, + { + "epoch": 6.812932086213908, + "grad_norm": 7.97658521815249, + "learning_rate": 9.706724073687615e-06, + "loss": 0.2834, + "step": 16753 + }, + { + "epoch": 6.813338755591704, + "grad_norm": 0.174682677755347, + "learning_rate": 9.705711322813276e-06, + "loss": 0.0029, + "step": 16754 + }, + { + "epoch": 6.8137454249694995, + "grad_norm": 3.3254165665430104, + "learning_rate": 9.704698574959962e-06, + "loss": 0.0516, + "step": 16755 + }, + { + "epoch": 6.814152094347295, + "grad_norm": 9.836472153619422, + "learning_rate": 9.703685830138063e-06, + "loss": 0.4514, + "step": 16756 + }, + 
{ + "epoch": 6.814558763725092, + "grad_norm": 3.1702929282976737, + "learning_rate": 9.70267308835798e-06, + "loss": 0.0437, + "step": 16757 + }, + { + "epoch": 6.814965433102888, + "grad_norm": 0.6606783141656296, + "learning_rate": 9.701660349630106e-06, + "loss": 0.0115, + "step": 16758 + }, + { + "epoch": 6.8153721024806835, + "grad_norm": 4.510069233870833, + "learning_rate": 9.700647613964837e-06, + "loss": 0.0794, + "step": 16759 + }, + { + "epoch": 6.815778771858479, + "grad_norm": 15.335747847839444, + "learning_rate": 9.699634881372575e-06, + "loss": 0.3932, + "step": 16760 + }, + { + "epoch": 6.816185441236275, + "grad_norm": 2.7847401974440005, + "learning_rate": 9.69862215186371e-06, + "loss": 0.0626, + "step": 16761 + }, + { + "epoch": 6.816592110614071, + "grad_norm": 7.475449785267152, + "learning_rate": 9.69760942544864e-06, + "loss": 0.1539, + "step": 16762 + }, + { + "epoch": 6.8169987799918665, + "grad_norm": 6.152815322816963, + "learning_rate": 9.696596702137765e-06, + "loss": 0.1843, + "step": 16763 + }, + { + "epoch": 6.817405449369662, + "grad_norm": 0.6328674348165042, + "learning_rate": 9.695583981941475e-06, + "loss": 0.0107, + "step": 16764 + }, + { + "epoch": 6.817812118747458, + "grad_norm": 1.7140117472220597, + "learning_rate": 9.694571264870167e-06, + "loss": 0.0233, + "step": 16765 + }, + { + "epoch": 6.818218788125254, + "grad_norm": 0.44843562147397553, + "learning_rate": 9.69355855093424e-06, + "loss": 0.0084, + "step": 16766 + }, + { + "epoch": 6.81862545750305, + "grad_norm": 8.793156887678613, + "learning_rate": 9.692545840144088e-06, + "loss": 0.3948, + "step": 16767 + }, + { + "epoch": 6.819032126880846, + "grad_norm": 8.138262545092289, + "learning_rate": 9.69153313251011e-06, + "loss": 0.3467, + "step": 16768 + }, + { + "epoch": 6.819438796258642, + "grad_norm": 7.847621564826438, + "learning_rate": 9.690520428042697e-06, + "loss": 0.2258, + "step": 16769 + }, + { + "epoch": 6.819845465636438, + "grad_norm": 10.163974636967726, + "learning_rate": 9.689507726752248e-06, + "loss": 0.3971, + "step": 16770 + }, + { + "epoch": 6.8202521350142336, + "grad_norm": 0.16809988169893814, + "learning_rate": 9.688495028649158e-06, + "loss": 0.002, + "step": 16771 + }, + { + "epoch": 6.820658804392029, + "grad_norm": 10.728590720730045, + "learning_rate": 9.687482333743825e-06, + "loss": 0.3956, + "step": 16772 + }, + { + "epoch": 6.821065473769825, + "grad_norm": 6.778373638498948, + "learning_rate": 9.686469642046644e-06, + "loss": 0.138, + "step": 16773 + }, + { + "epoch": 6.821472143147621, + "grad_norm": 4.515280462212639, + "learning_rate": 9.685456953568005e-06, + "loss": 0.1927, + "step": 16774 + }, + { + "epoch": 6.821878812525417, + "grad_norm": 0.2509387158464289, + "learning_rate": 9.684444268318313e-06, + "loss": 0.0048, + "step": 16775 + }, + { + "epoch": 6.822285481903212, + "grad_norm": 1.556283098784252, + "learning_rate": 9.683431586307959e-06, + "loss": 0.0156, + "step": 16776 + }, + { + "epoch": 6.822692151281009, + "grad_norm": 3.433572659367368, + "learning_rate": 9.682418907547336e-06, + "loss": 0.0577, + "step": 16777 + }, + { + "epoch": 6.823098820658805, + "grad_norm": 4.462919343651454, + "learning_rate": 9.681406232046846e-06, + "loss": 0.0876, + "step": 16778 + }, + { + "epoch": 6.823505490036601, + "grad_norm": 1.252176796137267, + "learning_rate": 9.68039355981688e-06, + "loss": 0.0202, + "step": 16779 + }, + { + "epoch": 6.823912159414396, + "grad_norm": 19.03831286983642, + "learning_rate": 9.679380890867835e-06, + "loss": 
1.1721, + "step": 16780 + }, + { + "epoch": 6.824318828792192, + "grad_norm": 0.7134325211299035, + "learning_rate": 9.678368225210107e-06, + "loss": 0.0115, + "step": 16781 + }, + { + "epoch": 6.824725498169988, + "grad_norm": 0.8847455634963144, + "learning_rate": 9.677355562854092e-06, + "loss": 0.0131, + "step": 16782 + }, + { + "epoch": 6.825132167547784, + "grad_norm": 3.1248852693835216, + "learning_rate": 9.676342903810185e-06, + "loss": 0.0568, + "step": 16783 + }, + { + "epoch": 6.825538836925579, + "grad_norm": 7.619205661503198, + "learning_rate": 9.675330248088777e-06, + "loss": 0.4619, + "step": 16784 + }, + { + "epoch": 6.825945506303375, + "grad_norm": 4.681746584099409, + "learning_rate": 9.67431759570027e-06, + "loss": 0.3079, + "step": 16785 + }, + { + "epoch": 6.826352175681171, + "grad_norm": 1.4621194878978356, + "learning_rate": 9.673304946655057e-06, + "loss": 0.0254, + "step": 16786 + }, + { + "epoch": 6.826758845058967, + "grad_norm": 4.09143742846653, + "learning_rate": 9.672292300963532e-06, + "loss": 0.0859, + "step": 16787 + }, + { + "epoch": 6.8271655144367625, + "grad_norm": 5.600665082097924, + "learning_rate": 9.671279658636094e-06, + "loss": 0.1649, + "step": 16788 + }, + { + "epoch": 6.827572183814559, + "grad_norm": 0.02932140158141484, + "learning_rate": 9.670267019683134e-06, + "loss": 0.0006, + "step": 16789 + }, + { + "epoch": 6.827978853192355, + "grad_norm": 7.025988887652726, + "learning_rate": 9.669254384115047e-06, + "loss": 0.1329, + "step": 16790 + }, + { + "epoch": 6.828385522570151, + "grad_norm": 4.922887260551248, + "learning_rate": 9.668241751942234e-06, + "loss": 0.0726, + "step": 16791 + }, + { + "epoch": 6.828792191947946, + "grad_norm": 1.428081864678066, + "learning_rate": 9.667229123175085e-06, + "loss": 0.0213, + "step": 16792 + }, + { + "epoch": 6.829198861325742, + "grad_norm": 7.062366235097127, + "learning_rate": 9.666216497823994e-06, + "loss": 0.2034, + "step": 16793 + }, + { + "epoch": 6.829605530703538, + "grad_norm": 9.768207561423603, + "learning_rate": 9.665203875899361e-06, + "loss": 0.2751, + "step": 16794 + }, + { + "epoch": 6.830012200081334, + "grad_norm": 6.972452043967745, + "learning_rate": 9.66419125741158e-06, + "loss": 0.2319, + "step": 16795 + }, + { + "epoch": 6.8304188694591295, + "grad_norm": 0.5445110752771037, + "learning_rate": 9.663178642371043e-06, + "loss": 0.0078, + "step": 16796 + }, + { + "epoch": 6.830825538836925, + "grad_norm": 1.3408924574016032, + "learning_rate": 9.662166030788146e-06, + "loss": 0.0278, + "step": 16797 + }, + { + "epoch": 6.831232208214722, + "grad_norm": 4.066363569586702, + "learning_rate": 9.661153422673286e-06, + "loss": 0.1319, + "step": 16798 + }, + { + "epoch": 6.831638877592518, + "grad_norm": 3.3549746393221933, + "learning_rate": 9.660140818036857e-06, + "loss": 0.0503, + "step": 16799 + }, + { + "epoch": 6.832045546970313, + "grad_norm": 0.3346229878213413, + "learning_rate": 9.659128216889251e-06, + "loss": 0.0051, + "step": 16800 + }, + { + "epoch": 6.832452216348109, + "grad_norm": 8.059396578738422, + "learning_rate": 9.658115619240869e-06, + "loss": 0.3341, + "step": 16801 + }, + { + "epoch": 6.832858885725905, + "grad_norm": 2.364685893744783, + "learning_rate": 9.6571030251021e-06, + "loss": 0.0255, + "step": 16802 + }, + { + "epoch": 6.833265555103701, + "grad_norm": 7.122851323384512, + "learning_rate": 9.656090434483341e-06, + "loss": 0.136, + "step": 16803 + }, + { + "epoch": 6.8336722244814965, + "grad_norm": 5.554564791710931, + "learning_rate": 
9.655077847394988e-06, + "loss": 0.1058, + "step": 16804 + }, + { + "epoch": 6.834078893859292, + "grad_norm": 9.948828019301628, + "learning_rate": 9.654065263847435e-06, + "loss": 0.3716, + "step": 16805 + }, + { + "epoch": 6.834485563237088, + "grad_norm": 10.32005774452837, + "learning_rate": 9.653052683851074e-06, + "loss": 0.2594, + "step": 16806 + }, + { + "epoch": 6.834892232614884, + "grad_norm": 4.236498780240568, + "learning_rate": 9.652040107416301e-06, + "loss": 0.0529, + "step": 16807 + }, + { + "epoch": 6.8352989019926795, + "grad_norm": 2.83781966595238, + "learning_rate": 9.651027534553515e-06, + "loss": 0.0384, + "step": 16808 + }, + { + "epoch": 6.835705571370476, + "grad_norm": 0.5848180347256353, + "learning_rate": 9.650014965273106e-06, + "loss": 0.0067, + "step": 16809 + }, + { + "epoch": 6.836112240748272, + "grad_norm": 0.2143172519358641, + "learning_rate": 9.649002399585468e-06, + "loss": 0.0031, + "step": 16810 + }, + { + "epoch": 6.836518910126068, + "grad_norm": 3.078072691941337, + "learning_rate": 9.647989837500997e-06, + "loss": 0.0446, + "step": 16811 + }, + { + "epoch": 6.8369255795038635, + "grad_norm": 5.268009463082131, + "learning_rate": 9.64697727903009e-06, + "loss": 0.0909, + "step": 16812 + }, + { + "epoch": 6.837332248881659, + "grad_norm": 0.17857453009329569, + "learning_rate": 9.645964724183134e-06, + "loss": 0.0026, + "step": 16813 + }, + { + "epoch": 6.837738918259455, + "grad_norm": 6.5085986999195375, + "learning_rate": 9.644952172970533e-06, + "loss": 0.2008, + "step": 16814 + }, + { + "epoch": 6.838145587637251, + "grad_norm": 0.9467297097296331, + "learning_rate": 9.643939625402676e-06, + "loss": 0.0133, + "step": 16815 + }, + { + "epoch": 6.8385522570150465, + "grad_norm": 2.115057766659757, + "learning_rate": 9.642927081489956e-06, + "loss": 0.0318, + "step": 16816 + }, + { + "epoch": 6.838958926392842, + "grad_norm": 0.4971119287752589, + "learning_rate": 9.64191454124277e-06, + "loss": 0.0093, + "step": 16817 + }, + { + "epoch": 6.839365595770639, + "grad_norm": 1.0234319810066435, + "learning_rate": 9.640902004671514e-06, + "loss": 0.0183, + "step": 16818 + }, + { + "epoch": 6.839772265148435, + "grad_norm": 1.5107863728134538, + "learning_rate": 9.639889471786576e-06, + "loss": 0.0307, + "step": 16819 + }, + { + "epoch": 6.8401789345262305, + "grad_norm": 0.03151312538967867, + "learning_rate": 9.638876942598355e-06, + "loss": 0.0006, + "step": 16820 + }, + { + "epoch": 6.840585603904026, + "grad_norm": 0.09726452955120349, + "learning_rate": 9.637864417117244e-06, + "loss": 0.0012, + "step": 16821 + }, + { + "epoch": 6.840992273281822, + "grad_norm": 1.0348665073721026, + "learning_rate": 9.636851895353638e-06, + "loss": 0.0157, + "step": 16822 + }, + { + "epoch": 6.841398942659618, + "grad_norm": 4.861875255654191, + "learning_rate": 9.635839377317926e-06, + "loss": 0.2287, + "step": 16823 + }, + { + "epoch": 6.8418056120374136, + "grad_norm": 1.1500472754210576, + "learning_rate": 9.63482686302051e-06, + "loss": 0.0181, + "step": 16824 + }, + { + "epoch": 6.842212281415209, + "grad_norm": 3.726004290308859, + "learning_rate": 9.633814352471782e-06, + "loss": 0.0711, + "step": 16825 + }, + { + "epoch": 6.842618950793005, + "grad_norm": 6.823164005140098, + "learning_rate": 9.63280184568213e-06, + "loss": 0.3824, + "step": 16826 + }, + { + "epoch": 6.843025620170801, + "grad_norm": 6.826383047596253, + "learning_rate": 9.631789342661952e-06, + "loss": 0.1297, + "step": 16827 + }, + { + "epoch": 6.843432289548597, + "grad_norm": 
1.3945968319327382, + "learning_rate": 9.630776843421645e-06, + "loss": 0.0174, + "step": 16828 + }, + { + "epoch": 6.843838958926392, + "grad_norm": 9.05972018482341, + "learning_rate": 9.629764347971596e-06, + "loss": 0.1081, + "step": 16829 + }, + { + "epoch": 6.844245628304189, + "grad_norm": 5.319107986245565, + "learning_rate": 9.628751856322204e-06, + "loss": 0.181, + "step": 16830 + }, + { + "epoch": 6.844652297681985, + "grad_norm": 2.074460247774271, + "learning_rate": 9.627739368483861e-06, + "loss": 0.0418, + "step": 16831 + }, + { + "epoch": 6.845058967059781, + "grad_norm": 8.027938512361958, + "learning_rate": 9.626726884466962e-06, + "loss": 0.1196, + "step": 16832 + }, + { + "epoch": 6.845465636437576, + "grad_norm": 0.34456788794398713, + "learning_rate": 9.625714404281895e-06, + "loss": 0.0051, + "step": 16833 + }, + { + "epoch": 6.845872305815372, + "grad_norm": 6.825976350409857, + "learning_rate": 9.624701927939062e-06, + "loss": 0.3687, + "step": 16834 + }, + { + "epoch": 6.846278975193168, + "grad_norm": 5.006657357450949, + "learning_rate": 9.623689455448852e-06, + "loss": 0.1841, + "step": 16835 + }, + { + "epoch": 6.846685644570964, + "grad_norm": 0.22248028179089543, + "learning_rate": 9.622676986821657e-06, + "loss": 0.0034, + "step": 16836 + }, + { + "epoch": 6.847092313948759, + "grad_norm": 0.00834708213515877, + "learning_rate": 9.621664522067876e-06, + "loss": 0.0002, + "step": 16837 + }, + { + "epoch": 6.847498983326555, + "grad_norm": 12.259392445587956, + "learning_rate": 9.620652061197896e-06, + "loss": 0.5434, + "step": 16838 + }, + { + "epoch": 6.847905652704352, + "grad_norm": 12.337851360488644, + "learning_rate": 9.619639604222113e-06, + "loss": 0.6497, + "step": 16839 + }, + { + "epoch": 6.848312322082148, + "grad_norm": 5.356914523359093, + "learning_rate": 9.618627151150924e-06, + "loss": 0.1193, + "step": 16840 + }, + { + "epoch": 6.848718991459943, + "grad_norm": 0.3249694617015834, + "learning_rate": 9.617614701994719e-06, + "loss": 0.0041, + "step": 16841 + }, + { + "epoch": 6.849125660837739, + "grad_norm": 0.5756022809652211, + "learning_rate": 9.61660225676389e-06, + "loss": 0.0079, + "step": 16842 + }, + { + "epoch": 6.849532330215535, + "grad_norm": 3.195835860681306, + "learning_rate": 9.615589815468832e-06, + "loss": 0.0466, + "step": 16843 + }, + { + "epoch": 6.849938999593331, + "grad_norm": 3.4870189312921633, + "learning_rate": 9.614577378119939e-06, + "loss": 0.0638, + "step": 16844 + }, + { + "epoch": 6.850345668971126, + "grad_norm": 1.2222343506618711, + "learning_rate": 9.613564944727603e-06, + "loss": 0.0105, + "step": 16845 + }, + { + "epoch": 6.850752338348922, + "grad_norm": 1.9083251350482493, + "learning_rate": 9.612552515302216e-06, + "loss": 0.0316, + "step": 16846 + }, + { + "epoch": 6.851159007726718, + "grad_norm": 1.147449047142422, + "learning_rate": 9.611540089854174e-06, + "loss": 0.0199, + "step": 16847 + }, + { + "epoch": 6.851565677104514, + "grad_norm": 2.208722263755145, + "learning_rate": 9.610527668393867e-06, + "loss": 0.0359, + "step": 16848 + }, + { + "epoch": 6.8519723464823095, + "grad_norm": 0.7026272295974814, + "learning_rate": 9.609515250931689e-06, + "loss": 0.0153, + "step": 16849 + }, + { + "epoch": 6.852379015860106, + "grad_norm": 2.761257839847193, + "learning_rate": 9.608502837478036e-06, + "loss": 0.0524, + "step": 16850 + }, + { + "epoch": 6.852785685237902, + "grad_norm": 4.735935249551472, + "learning_rate": 9.607490428043297e-06, + "loss": 0.0888, + "step": 16851 + }, + { + "epoch": 
6.853192354615698, + "grad_norm": 3.3792348317600287, + "learning_rate": 9.606478022637867e-06, + "loss": 0.0799, + "step": 16852 + }, + { + "epoch": 6.853599023993493, + "grad_norm": 3.149661233946499, + "learning_rate": 9.605465621272137e-06, + "loss": 0.111, + "step": 16853 + }, + { + "epoch": 6.854005693371289, + "grad_norm": 1.0729998102895115, + "learning_rate": 9.604453223956503e-06, + "loss": 0.0224, + "step": 16854 + }, + { + "epoch": 6.854412362749085, + "grad_norm": 2.108028734595141, + "learning_rate": 9.603440830701352e-06, + "loss": 0.0222, + "step": 16855 + }, + { + "epoch": 6.854819032126881, + "grad_norm": 9.952923429460181, + "learning_rate": 9.602428441517084e-06, + "loss": 0.2758, + "step": 16856 + }, + { + "epoch": 6.8552257015046765, + "grad_norm": 10.834701596870042, + "learning_rate": 9.601416056414086e-06, + "loss": 0.6085, + "step": 16857 + }, + { + "epoch": 6.855632370882472, + "grad_norm": 3.1271658084721783, + "learning_rate": 9.600403675402756e-06, + "loss": 0.1069, + "step": 16858 + }, + { + "epoch": 6.856039040260269, + "grad_norm": 13.577273696621232, + "learning_rate": 9.59939129849348e-06, + "loss": 0.617, + "step": 16859 + }, + { + "epoch": 6.856445709638065, + "grad_norm": 0.20099334848420047, + "learning_rate": 9.598378925696654e-06, + "loss": 0.004, + "step": 16860 + }, + { + "epoch": 6.85685237901586, + "grad_norm": 13.417579171023734, + "learning_rate": 9.597366557022672e-06, + "loss": 0.5795, + "step": 16861 + }, + { + "epoch": 6.857259048393656, + "grad_norm": 0.09720899362396747, + "learning_rate": 9.596354192481923e-06, + "loss": 0.0012, + "step": 16862 + }, + { + "epoch": 6.857665717771452, + "grad_norm": 2.4132392650106094, + "learning_rate": 9.595341832084808e-06, + "loss": 0.0463, + "step": 16863 + }, + { + "epoch": 6.858072387149248, + "grad_norm": 2.098765443510058, + "learning_rate": 9.594329475841707e-06, + "loss": 0.0462, + "step": 16864 + }, + { + "epoch": 6.8584790565270435, + "grad_norm": 2.559985484056624, + "learning_rate": 9.593317123763018e-06, + "loss": 0.0439, + "step": 16865 + }, + { + "epoch": 6.858885725904839, + "grad_norm": 0.24311820527952677, + "learning_rate": 9.592304775859134e-06, + "loss": 0.0029, + "step": 16866 + }, + { + "epoch": 6.859292395282635, + "grad_norm": 8.327922174777832, + "learning_rate": 9.591292432140445e-06, + "loss": 0.2667, + "step": 16867 + }, + { + "epoch": 6.859699064660431, + "grad_norm": 1.1620347220671166, + "learning_rate": 9.590280092617348e-06, + "loss": 0.0155, + "step": 16868 + }, + { + "epoch": 6.8601057340382265, + "grad_norm": 2.3383175185097915, + "learning_rate": 9.589267757300231e-06, + "loss": 0.0411, + "step": 16869 + }, + { + "epoch": 6.860512403416022, + "grad_norm": 6.030209280109042, + "learning_rate": 9.588255426199486e-06, + "loss": 0.2469, + "step": 16870 + }, + { + "epoch": 6.860919072793819, + "grad_norm": 1.6297770613941271, + "learning_rate": 9.587243099325507e-06, + "loss": 0.0213, + "step": 16871 + }, + { + "epoch": 6.861325742171615, + "grad_norm": 0.4298080327265972, + "learning_rate": 9.586230776688686e-06, + "loss": 0.0071, + "step": 16872 + }, + { + "epoch": 6.8617324115494105, + "grad_norm": 5.480860207430989, + "learning_rate": 9.585218458299414e-06, + "loss": 0.1546, + "step": 16873 + }, + { + "epoch": 6.862139080927206, + "grad_norm": 2.8703894977593842, + "learning_rate": 9.584206144168083e-06, + "loss": 0.0477, + "step": 16874 + }, + { + "epoch": 6.862545750305002, + "grad_norm": 0.020251075568273182, + "learning_rate": 9.583193834305085e-06, + "loss": 
0.0004, + "step": 16875 + }, + { + "epoch": 6.862952419682798, + "grad_norm": 5.853482414637619, + "learning_rate": 9.582181528720813e-06, + "loss": 0.1853, + "step": 16876 + }, + { + "epoch": 6.8633590890605936, + "grad_norm": 4.818450070622746, + "learning_rate": 9.581169227425655e-06, + "loss": 0.0832, + "step": 16877 + }, + { + "epoch": 6.863765758438389, + "grad_norm": 9.068265688492547, + "learning_rate": 9.580156930430009e-06, + "loss": 0.3637, + "step": 16878 + }, + { + "epoch": 6.864172427816185, + "grad_norm": 2.0879023974696063, + "learning_rate": 9.579144637744264e-06, + "loss": 0.0315, + "step": 16879 + }, + { + "epoch": 6.864579097193982, + "grad_norm": 3.1145558844656915, + "learning_rate": 9.578132349378807e-06, + "loss": 0.048, + "step": 16880 + }, + { + "epoch": 6.8649857665717775, + "grad_norm": 0.5026368102538913, + "learning_rate": 9.577120065344036e-06, + "loss": 0.0082, + "step": 16881 + }, + { + "epoch": 6.865392435949573, + "grad_norm": 5.053462034352502, + "learning_rate": 9.576107785650342e-06, + "loss": 0.0862, + "step": 16882 + }, + { + "epoch": 6.865799105327369, + "grad_norm": 2.308263716821945, + "learning_rate": 9.575095510308112e-06, + "loss": 0.0474, + "step": 16883 + }, + { + "epoch": 6.866205774705165, + "grad_norm": 5.140181681382531, + "learning_rate": 9.574083239327742e-06, + "loss": 0.2725, + "step": 16884 + }, + { + "epoch": 6.866612444082961, + "grad_norm": 0.6466868273013412, + "learning_rate": 9.573070972719622e-06, + "loss": 0.0058, + "step": 16885 + }, + { + "epoch": 6.867019113460756, + "grad_norm": 4.392141643280206, + "learning_rate": 9.572058710494144e-06, + "loss": 0.1192, + "step": 16886 + }, + { + "epoch": 6.867425782838552, + "grad_norm": 6.25980305176971, + "learning_rate": 9.571046452661696e-06, + "loss": 0.0461, + "step": 16887 + }, + { + "epoch": 6.867832452216348, + "grad_norm": 0.34833330416885616, + "learning_rate": 9.570034199232676e-06, + "loss": 0.0067, + "step": 16888 + }, + { + "epoch": 6.868239121594144, + "grad_norm": 7.493651924175545, + "learning_rate": 9.56902195021747e-06, + "loss": 0.0999, + "step": 16889 + }, + { + "epoch": 6.868645790971939, + "grad_norm": 2.4324541841766125, + "learning_rate": 9.568009705626467e-06, + "loss": 0.0437, + "step": 16890 + }, + { + "epoch": 6.869052460349736, + "grad_norm": 0.8851875512347863, + "learning_rate": 9.566997465470065e-06, + "loss": 0.0104, + "step": 16891 + }, + { + "epoch": 6.869459129727532, + "grad_norm": 0.15952007408202865, + "learning_rate": 9.565985229758653e-06, + "loss": 0.0018, + "step": 16892 + }, + { + "epoch": 6.869865799105328, + "grad_norm": 0.03681044196926102, + "learning_rate": 9.564972998502617e-06, + "loss": 0.0006, + "step": 16893 + }, + { + "epoch": 6.870272468483123, + "grad_norm": 2.43879899271545, + "learning_rate": 9.563960771712354e-06, + "loss": 0.0332, + "step": 16894 + }, + { + "epoch": 6.870679137860919, + "grad_norm": 7.084581556208862, + "learning_rate": 9.562948549398256e-06, + "loss": 0.1538, + "step": 16895 + }, + { + "epoch": 6.871085807238715, + "grad_norm": 1.3568621483027181, + "learning_rate": 9.561936331570708e-06, + "loss": 0.0224, + "step": 16896 + }, + { + "epoch": 6.871492476616511, + "grad_norm": 1.4878510804565628, + "learning_rate": 9.560924118240105e-06, + "loss": 0.022, + "step": 16897 + }, + { + "epoch": 6.871899145994306, + "grad_norm": 8.081694995025714, + "learning_rate": 9.559911909416837e-06, + "loss": 0.2387, + "step": 16898 + }, + { + "epoch": 6.872305815372102, + "grad_norm": 3.1924157706899368, + "learning_rate": 
9.558899705111296e-06, + "loss": 0.0565, + "step": 16899 + }, + { + "epoch": 6.872712484749899, + "grad_norm": 4.200678443927705, + "learning_rate": 9.557887505333868e-06, + "loss": 0.0971, + "step": 16900 + }, + { + "epoch": 6.873119154127695, + "grad_norm": 1.668138067222451, + "learning_rate": 9.55687531009495e-06, + "loss": 0.0242, + "step": 16901 + }, + { + "epoch": 6.87352582350549, + "grad_norm": 0.11946752373764899, + "learning_rate": 9.55586311940493e-06, + "loss": 0.0023, + "step": 16902 + }, + { + "epoch": 6.873932492883286, + "grad_norm": 2.6681116226696666, + "learning_rate": 9.554850933274197e-06, + "loss": 0.0417, + "step": 16903 + }, + { + "epoch": 6.874339162261082, + "grad_norm": 0.824888172528366, + "learning_rate": 9.553838751713145e-06, + "loss": 0.0163, + "step": 16904 + }, + { + "epoch": 6.874745831638878, + "grad_norm": 0.13915314852800872, + "learning_rate": 9.552826574732162e-06, + "loss": 0.0024, + "step": 16905 + }, + { + "epoch": 6.875152501016673, + "grad_norm": 7.68613900590129, + "learning_rate": 9.551814402341636e-06, + "loss": 0.2016, + "step": 16906 + }, + { + "epoch": 6.875559170394469, + "grad_norm": 13.662195450823152, + "learning_rate": 9.550802234551965e-06, + "loss": 0.888, + "step": 16907 + }, + { + "epoch": 6.875965839772265, + "grad_norm": 0.309420509084351, + "learning_rate": 9.549790071373536e-06, + "loss": 0.0053, + "step": 16908 + }, + { + "epoch": 6.876372509150061, + "grad_norm": 2.598563394751387, + "learning_rate": 9.548777912816734e-06, + "loss": 0.0912, + "step": 16909 + }, + { + "epoch": 6.8767791785278565, + "grad_norm": 0.8019641941724484, + "learning_rate": 9.547765758891959e-06, + "loss": 0.009, + "step": 16910 + }, + { + "epoch": 6.877185847905652, + "grad_norm": 2.845525640887677, + "learning_rate": 9.546753609609593e-06, + "loss": 0.1062, + "step": 16911 + }, + { + "epoch": 6.877592517283449, + "grad_norm": 6.783026707948608, + "learning_rate": 9.54574146498003e-06, + "loss": 0.2184, + "step": 16912 + }, + { + "epoch": 6.877999186661245, + "grad_norm": 0.053938822401102884, + "learning_rate": 9.544729325013658e-06, + "loss": 0.0007, + "step": 16913 + }, + { + "epoch": 6.87840585603904, + "grad_norm": 9.985516498388002, + "learning_rate": 9.54371718972087e-06, + "loss": 0.4604, + "step": 16914 + }, + { + "epoch": 6.878812525416836, + "grad_norm": 4.881540330719039, + "learning_rate": 9.542705059112056e-06, + "loss": 0.0812, + "step": 16915 + }, + { + "epoch": 6.879219194794632, + "grad_norm": 1.9235228515060911, + "learning_rate": 9.541692933197601e-06, + "loss": 0.0307, + "step": 16916 + }, + { + "epoch": 6.879625864172428, + "grad_norm": 5.676872409511725, + "learning_rate": 9.540680811987903e-06, + "loss": 0.1746, + "step": 16917 + }, + { + "epoch": 6.8800325335502235, + "grad_norm": 3.3324087033689485, + "learning_rate": 9.539668695493344e-06, + "loss": 0.0329, + "step": 16918 + }, + { + "epoch": 6.880439202928019, + "grad_norm": 2.664288365941, + "learning_rate": 9.538656583724318e-06, + "loss": 0.0559, + "step": 16919 + }, + { + "epoch": 6.880845872305815, + "grad_norm": 6.687528334149828, + "learning_rate": 9.537644476691217e-06, + "loss": 0.2621, + "step": 16920 + }, + { + "epoch": 6.881252541683612, + "grad_norm": 0.9847797471926842, + "learning_rate": 9.536632374404428e-06, + "loss": 0.0126, + "step": 16921 + }, + { + "epoch": 6.881659211061407, + "grad_norm": 0.6767399560610521, + "learning_rate": 9.535620276874337e-06, + "loss": 0.0137, + "step": 16922 + }, + { + "epoch": 6.882065880439203, + "grad_norm": 
2.804748780433769, + "learning_rate": 9.534608184111341e-06, + "loss": 0.0325, + "step": 16923 + }, + { + "epoch": 6.882472549816999, + "grad_norm": 3.0658110969268497, + "learning_rate": 9.533596096125826e-06, + "loss": 0.0495, + "step": 16924 + }, + { + "epoch": 6.882879219194795, + "grad_norm": 9.886941160099067, + "learning_rate": 9.532584012928183e-06, + "loss": 0.5522, + "step": 16925 + }, + { + "epoch": 6.8832858885725905, + "grad_norm": 0.3878255396889592, + "learning_rate": 9.531571934528795e-06, + "loss": 0.0041, + "step": 16926 + }, + { + "epoch": 6.883692557950386, + "grad_norm": 4.996072553536739, + "learning_rate": 9.530559860938061e-06, + "loss": 0.1311, + "step": 16927 + }, + { + "epoch": 6.884099227328182, + "grad_norm": 1.7120813259748886, + "learning_rate": 9.529547792166367e-06, + "loss": 0.0302, + "step": 16928 + }, + { + "epoch": 6.884505896705978, + "grad_norm": 9.911654039574644, + "learning_rate": 9.528535728224099e-06, + "loss": 0.6623, + "step": 16929 + }, + { + "epoch": 6.8849125660837736, + "grad_norm": 4.847086211166657, + "learning_rate": 9.527523669121652e-06, + "loss": 0.1942, + "step": 16930 + }, + { + "epoch": 6.885319235461569, + "grad_norm": 3.5165991791741185, + "learning_rate": 9.52651161486941e-06, + "loss": 0.0635, + "step": 16931 + }, + { + "epoch": 6.885725904839366, + "grad_norm": 5.8044112902376135, + "learning_rate": 9.525499565477764e-06, + "loss": 0.2854, + "step": 16932 + }, + { + "epoch": 6.886132574217162, + "grad_norm": 14.417632912171491, + "learning_rate": 9.524487520957105e-06, + "loss": 0.7035, + "step": 16933 + }, + { + "epoch": 6.8865392435949575, + "grad_norm": 7.205172946592426, + "learning_rate": 9.523475481317822e-06, + "loss": 0.1732, + "step": 16934 + }, + { + "epoch": 6.886945912972753, + "grad_norm": 2.6923558879290552, + "learning_rate": 9.522463446570303e-06, + "loss": 0.0539, + "step": 16935 + }, + { + "epoch": 6.887352582350549, + "grad_norm": 2.4993218961436, + "learning_rate": 9.521451416724934e-06, + "loss": 0.0342, + "step": 16936 + }, + { + "epoch": 6.887759251728345, + "grad_norm": 0.6828794726928197, + "learning_rate": 9.52043939179211e-06, + "loss": 0.0111, + "step": 16937 + }, + { + "epoch": 6.888165921106141, + "grad_norm": 1.506263467204244, + "learning_rate": 9.519427371782216e-06, + "loss": 0.0284, + "step": 16938 + }, + { + "epoch": 6.888572590483936, + "grad_norm": 4.062053735228903, + "learning_rate": 9.51841535670564e-06, + "loss": 0.1329, + "step": 16939 + }, + { + "epoch": 6.888979259861732, + "grad_norm": 0.25672116321809735, + "learning_rate": 9.517403346572776e-06, + "loss": 0.0045, + "step": 16940 + }, + { + "epoch": 6.889385929239529, + "grad_norm": 3.1555893789862437, + "learning_rate": 9.516391341394008e-06, + "loss": 0.0674, + "step": 16941 + }, + { + "epoch": 6.8897925986173245, + "grad_norm": 2.4072344715518517, + "learning_rate": 9.515379341179726e-06, + "loss": 0.0403, + "step": 16942 + }, + { + "epoch": 6.89019926799512, + "grad_norm": 0.6370882242149196, + "learning_rate": 9.51436734594032e-06, + "loss": 0.0076, + "step": 16943 + }, + { + "epoch": 6.890605937372916, + "grad_norm": 0.040205574167758884, + "learning_rate": 9.513355355686178e-06, + "loss": 0.0006, + "step": 16944 + }, + { + "epoch": 6.891012606750712, + "grad_norm": 8.260795535084325, + "learning_rate": 9.512343370427686e-06, + "loss": 0.1537, + "step": 16945 + }, + { + "epoch": 6.891419276128508, + "grad_norm": 0.7097030403095227, + "learning_rate": 9.511331390175235e-06, + "loss": 0.0178, + "step": 16946 + }, + { + 
"epoch": 6.891825945506303, + "grad_norm": 3.623285137231121, + "learning_rate": 9.510319414939216e-06, + "loss": 0.152, + "step": 16947 + }, + { + "epoch": 6.892232614884099, + "grad_norm": 2.8411525345683795, + "learning_rate": 9.509307444730014e-06, + "loss": 0.0439, + "step": 16948 + }, + { + "epoch": 6.892639284261895, + "grad_norm": 1.8948438567081745, + "learning_rate": 9.508295479558015e-06, + "loss": 0.0282, + "step": 16949 + }, + { + "epoch": 6.893045953639691, + "grad_norm": 2.771023453703953, + "learning_rate": 9.507283519433614e-06, + "loss": 0.0444, + "step": 16950 + }, + { + "epoch": 6.893452623017486, + "grad_norm": 4.152410763079123, + "learning_rate": 9.506271564367194e-06, + "loss": 0.0874, + "step": 16951 + }, + { + "epoch": 6.893859292395282, + "grad_norm": 7.768158190401576, + "learning_rate": 9.505259614369143e-06, + "loss": 0.2014, + "step": 16952 + }, + { + "epoch": 6.894265961773079, + "grad_norm": 3.272592494526393, + "learning_rate": 9.504247669449854e-06, + "loss": 0.0523, + "step": 16953 + }, + { + "epoch": 6.894672631150875, + "grad_norm": 6.267093154080107, + "learning_rate": 9.503235729619712e-06, + "loss": 0.071, + "step": 16954 + }, + { + "epoch": 6.89507930052867, + "grad_norm": 8.143241101247465, + "learning_rate": 9.5022237948891e-06, + "loss": 0.4273, + "step": 16955 + }, + { + "epoch": 6.895485969906466, + "grad_norm": 2.151365629499497, + "learning_rate": 9.501211865268418e-06, + "loss": 0.0474, + "step": 16956 + }, + { + "epoch": 6.895892639284262, + "grad_norm": 1.223581170859372, + "learning_rate": 9.500199940768046e-06, + "loss": 0.045, + "step": 16957 + }, + { + "epoch": 6.896299308662058, + "grad_norm": 0.9045403106007164, + "learning_rate": 9.49918802139837e-06, + "loss": 0.0117, + "step": 16958 + }, + { + "epoch": 6.896705978039853, + "grad_norm": 7.760867840679243, + "learning_rate": 9.498176107169782e-06, + "loss": 0.3155, + "step": 16959 + }, + { + "epoch": 6.897112647417649, + "grad_norm": 2.2304290508655575, + "learning_rate": 9.497164198092672e-06, + "loss": 0.0251, + "step": 16960 + }, + { + "epoch": 6.897519316795445, + "grad_norm": 1.2537601505738685, + "learning_rate": 9.496152294177424e-06, + "loss": 0.018, + "step": 16961 + }, + { + "epoch": 6.897925986173242, + "grad_norm": 11.965474208632333, + "learning_rate": 9.495140395434422e-06, + "loss": 0.4573, + "step": 16962 + }, + { + "epoch": 6.898332655551037, + "grad_norm": 2.3077229887422424, + "learning_rate": 9.494128501874063e-06, + "loss": 0.025, + "step": 16963 + }, + { + "epoch": 6.898739324928833, + "grad_norm": 2.9205598857969006, + "learning_rate": 9.493116613506729e-06, + "loss": 0.0643, + "step": 16964 + }, + { + "epoch": 6.899145994306629, + "grad_norm": 7.816583454357691, + "learning_rate": 9.492104730342806e-06, + "loss": 0.2346, + "step": 16965 + }, + { + "epoch": 6.899552663684425, + "grad_norm": 0.32131476783740254, + "learning_rate": 9.491092852392685e-06, + "loss": 0.0047, + "step": 16966 + }, + { + "epoch": 6.89995933306222, + "grad_norm": 1.3753288483744024, + "learning_rate": 9.49008097966675e-06, + "loss": 0.0231, + "step": 16967 + }, + { + "epoch": 6.900366002440016, + "grad_norm": 6.810553342588049, + "learning_rate": 9.489069112175391e-06, + "loss": 0.1362, + "step": 16968 + }, + { + "epoch": 6.900772671817812, + "grad_norm": 12.747677748309496, + "learning_rate": 9.488057249928994e-06, + "loss": 0.8237, + "step": 16969 + }, + { + "epoch": 6.901179341195608, + "grad_norm": 1.471767131916673, + "learning_rate": 9.487045392937948e-06, + "loss": 0.0265, + 
"step": 16970 + }, + { + "epoch": 6.9015860105734035, + "grad_norm": 13.25960951423983, + "learning_rate": 9.486033541212639e-06, + "loss": 0.5334, + "step": 16971 + }, + { + "epoch": 6.901992679951199, + "grad_norm": 7.869198690627683, + "learning_rate": 9.485021694763456e-06, + "loss": 0.2099, + "step": 16972 + }, + { + "epoch": 6.902399349328996, + "grad_norm": 1.149100299793396, + "learning_rate": 9.48400985360078e-06, + "loss": 0.0145, + "step": 16973 + }, + { + "epoch": 6.902806018706792, + "grad_norm": 1.078834764935737, + "learning_rate": 9.482998017735006e-06, + "loss": 0.0338, + "step": 16974 + }, + { + "epoch": 6.903212688084587, + "grad_norm": 0.440762036020389, + "learning_rate": 9.481986187176519e-06, + "loss": 0.0057, + "step": 16975 + }, + { + "epoch": 6.903619357462383, + "grad_norm": 6.7107423591885755, + "learning_rate": 9.480974361935703e-06, + "loss": 0.1253, + "step": 16976 + }, + { + "epoch": 6.904026026840179, + "grad_norm": 3.323084348830737, + "learning_rate": 9.479962542022944e-06, + "loss": 0.1211, + "step": 16977 + }, + { + "epoch": 6.904432696217975, + "grad_norm": 0.07942493600659903, + "learning_rate": 9.478950727448634e-06, + "loss": 0.001, + "step": 16978 + }, + { + "epoch": 6.9048393655957705, + "grad_norm": 0.007744392678090186, + "learning_rate": 9.477938918223156e-06, + "loss": 0.0001, + "step": 16979 + }, + { + "epoch": 6.905246034973566, + "grad_norm": 9.134796794620407, + "learning_rate": 9.476927114356896e-06, + "loss": 0.1635, + "step": 16980 + }, + { + "epoch": 6.905652704351362, + "grad_norm": 4.771531101494565, + "learning_rate": 9.475915315860246e-06, + "loss": 0.0828, + "step": 16981 + }, + { + "epoch": 6.906059373729159, + "grad_norm": 0.28164445630461427, + "learning_rate": 9.474903522743588e-06, + "loss": 0.0046, + "step": 16982 + }, + { + "epoch": 6.9064660431069544, + "grad_norm": 4.246527829152736, + "learning_rate": 9.473891735017307e-06, + "loss": 0.0875, + "step": 16983 + }, + { + "epoch": 6.90687271248475, + "grad_norm": 2.7798489810880604, + "learning_rate": 9.472879952691794e-06, + "loss": 0.032, + "step": 16984 + }, + { + "epoch": 6.907279381862546, + "grad_norm": 0.07142496741071315, + "learning_rate": 9.471868175777435e-06, + "loss": 0.0006, + "step": 16985 + }, + { + "epoch": 6.907686051240342, + "grad_norm": 4.9799779608542805, + "learning_rate": 9.470856404284613e-06, + "loss": 0.0911, + "step": 16986 + }, + { + "epoch": 6.9080927206181375, + "grad_norm": 5.566393127425938, + "learning_rate": 9.469844638223717e-06, + "loss": 0.1496, + "step": 16987 + }, + { + "epoch": 6.908499389995933, + "grad_norm": 0.3898185408110264, + "learning_rate": 9.468832877605133e-06, + "loss": 0.0061, + "step": 16988 + }, + { + "epoch": 6.908906059373729, + "grad_norm": 6.52148825138933, + "learning_rate": 9.467821122439248e-06, + "loss": 0.1958, + "step": 16989 + }, + { + "epoch": 6.909312728751525, + "grad_norm": 2.123712706582574, + "learning_rate": 9.466809372736444e-06, + "loss": 0.0396, + "step": 16990 + }, + { + "epoch": 6.909719398129321, + "grad_norm": 0.6507337466272457, + "learning_rate": 9.465797628507112e-06, + "loss": 0.006, + "step": 16991 + }, + { + "epoch": 6.910126067507116, + "grad_norm": 1.3319472496805373, + "learning_rate": 9.464785889761636e-06, + "loss": 0.0175, + "step": 16992 + }, + { + "epoch": 6.910532736884912, + "grad_norm": 1.9200030821250405, + "learning_rate": 9.4637741565104e-06, + "loss": 0.0345, + "step": 16993 + }, + { + "epoch": 6.910939406262709, + "grad_norm": 5.049106332909232, + "learning_rate": 
9.462762428763795e-06, + "loss": 0.112, + "step": 16994 + }, + { + "epoch": 6.9113460756405045, + "grad_norm": 6.42947109919688, + "learning_rate": 9.461750706532204e-06, + "loss": 0.266, + "step": 16995 + }, + { + "epoch": 6.9117527450183, + "grad_norm": 0.8014528836528887, + "learning_rate": 9.46073898982601e-06, + "loss": 0.0085, + "step": 16996 + }, + { + "epoch": 6.912159414396096, + "grad_norm": 0.17819078530622087, + "learning_rate": 9.459727278655606e-06, + "loss": 0.0024, + "step": 16997 + }, + { + "epoch": 6.912566083773892, + "grad_norm": 1.8830757713545445, + "learning_rate": 9.458715573031371e-06, + "loss": 0.0493, + "step": 16998 + }, + { + "epoch": 6.912972753151688, + "grad_norm": 5.500212069237623, + "learning_rate": 9.457703872963691e-06, + "loss": 0.1081, + "step": 16999 + }, + { + "epoch": 6.913379422529483, + "grad_norm": 0.5966470292421101, + "learning_rate": 9.456692178462958e-06, + "loss": 0.0097, + "step": 17000 + }, + { + "epoch": 6.913786091907279, + "grad_norm": 2.613009614988264, + "learning_rate": 9.455680489539551e-06, + "loss": 0.0421, + "step": 17001 + }, + { + "epoch": 6.914192761285075, + "grad_norm": 10.949353136618583, + "learning_rate": 9.454668806203859e-06, + "loss": 0.2767, + "step": 17002 + }, + { + "epoch": 6.9145994306628715, + "grad_norm": 3.940383864305323, + "learning_rate": 9.453657128466263e-06, + "loss": 0.0557, + "step": 17003 + }, + { + "epoch": 6.915006100040667, + "grad_norm": 4.505220421696075, + "learning_rate": 9.452645456337155e-06, + "loss": 0.0925, + "step": 17004 + }, + { + "epoch": 6.915412769418463, + "grad_norm": 0.6126798217543162, + "learning_rate": 9.451633789826917e-06, + "loss": 0.0086, + "step": 17005 + }, + { + "epoch": 6.915819438796259, + "grad_norm": 2.5338007757592016, + "learning_rate": 9.45062212894593e-06, + "loss": 0.0464, + "step": 17006 + }, + { + "epoch": 6.916226108174055, + "grad_norm": 6.629823556456254, + "learning_rate": 9.449610473704586e-06, + "loss": 0.1083, + "step": 17007 + }, + { + "epoch": 6.91663277755185, + "grad_norm": 0.10271561108918724, + "learning_rate": 9.448598824113268e-06, + "loss": 0.0017, + "step": 17008 + }, + { + "epoch": 6.917039446929646, + "grad_norm": 4.113334376471158, + "learning_rate": 9.44758718018236e-06, + "loss": 0.0685, + "step": 17009 + }, + { + "epoch": 6.917446116307442, + "grad_norm": 3.2187070709873753, + "learning_rate": 9.446575541922247e-06, + "loss": 0.0657, + "step": 17010 + }, + { + "epoch": 6.917852785685238, + "grad_norm": 0.7111341839586225, + "learning_rate": 9.445563909343315e-06, + "loss": 0.0107, + "step": 17011 + }, + { + "epoch": 6.918259455063033, + "grad_norm": 0.03673340297857035, + "learning_rate": 9.444552282455948e-06, + "loss": 0.0006, + "step": 17012 + }, + { + "epoch": 6.918666124440829, + "grad_norm": 0.8139997113226061, + "learning_rate": 9.443540661270534e-06, + "loss": 0.011, + "step": 17013 + }, + { + "epoch": 6.919072793818626, + "grad_norm": 0.9025326059543521, + "learning_rate": 9.442529045797453e-06, + "loss": 0.0112, + "step": 17014 + }, + { + "epoch": 6.919479463196422, + "grad_norm": 1.3673609279580767, + "learning_rate": 9.441517436047094e-06, + "loss": 0.0219, + "step": 17015 + }, + { + "epoch": 6.919886132574217, + "grad_norm": 1.1650704580367097, + "learning_rate": 9.440505832029836e-06, + "loss": 0.0229, + "step": 17016 + }, + { + "epoch": 6.920292801952013, + "grad_norm": 4.893045217749192, + "learning_rate": 9.439494233756069e-06, + "loss": 0.1318, + "step": 17017 + }, + { + "epoch": 6.920699471329809, + "grad_norm": 
3.8432070161544134, + "learning_rate": 9.438482641236176e-06, + "loss": 0.0597, + "step": 17018 + }, + { + "epoch": 6.921106140707605, + "grad_norm": 5.162858612759049, + "learning_rate": 9.43747105448054e-06, + "loss": 0.0745, + "step": 17019 + }, + { + "epoch": 6.9215128100854, + "grad_norm": 7.36809021095593, + "learning_rate": 9.436459473499549e-06, + "loss": 0.1481, + "step": 17020 + }, + { + "epoch": 6.921919479463196, + "grad_norm": 8.94763645350039, + "learning_rate": 9.435447898303585e-06, + "loss": 0.2247, + "step": 17021 + }, + { + "epoch": 6.922326148840992, + "grad_norm": 3.7955181053595055, + "learning_rate": 9.43443632890303e-06, + "loss": 0.0662, + "step": 17022 + }, + { + "epoch": 6.922732818218789, + "grad_norm": 0.29343757179577745, + "learning_rate": 9.433424765308272e-06, + "loss": 0.0049, + "step": 17023 + }, + { + "epoch": 6.923139487596584, + "grad_norm": 8.091752972254708, + "learning_rate": 9.432413207529694e-06, + "loss": 0.2227, + "step": 17024 + }, + { + "epoch": 6.92354615697438, + "grad_norm": 6.147202886182908, + "learning_rate": 9.431401655577679e-06, + "loss": 0.1468, + "step": 17025 + }, + { + "epoch": 6.923952826352176, + "grad_norm": 8.773316607784574, + "learning_rate": 9.430390109462615e-06, + "loss": 0.2949, + "step": 17026 + }, + { + "epoch": 6.924359495729972, + "grad_norm": 5.028684411301362, + "learning_rate": 9.429378569194881e-06, + "loss": 0.0801, + "step": 17027 + }, + { + "epoch": 6.924766165107767, + "grad_norm": 4.83890768196838, + "learning_rate": 9.428367034784866e-06, + "loss": 0.0882, + "step": 17028 + }, + { + "epoch": 6.925172834485563, + "grad_norm": 3.3728560838766923, + "learning_rate": 9.427355506242947e-06, + "loss": 0.053, + "step": 17029 + }, + { + "epoch": 6.925579503863359, + "grad_norm": 4.1627726116077985, + "learning_rate": 9.426343983579513e-06, + "loss": 0.073, + "step": 17030 + }, + { + "epoch": 6.925986173241155, + "grad_norm": 1.180977124119777, + "learning_rate": 9.425332466804948e-06, + "loss": 0.0164, + "step": 17031 + }, + { + "epoch": 6.9263928426189505, + "grad_norm": 3.232865904364173, + "learning_rate": 9.424320955929633e-06, + "loss": 0.0565, + "step": 17032 + }, + { + "epoch": 6.926799511996746, + "grad_norm": 2.049159381440454, + "learning_rate": 9.423309450963954e-06, + "loss": 0.0323, + "step": 17033 + }, + { + "epoch": 6.927206181374542, + "grad_norm": 0.07674334453644016, + "learning_rate": 9.422297951918296e-06, + "loss": 0.0013, + "step": 17034 + }, + { + "epoch": 6.927612850752339, + "grad_norm": 0.21706514868021975, + "learning_rate": 9.421286458803037e-06, + "loss": 0.0023, + "step": 17035 + }, + { + "epoch": 6.9280195201301344, + "grad_norm": 0.44792119864168145, + "learning_rate": 9.420274971628564e-06, + "loss": 0.007, + "step": 17036 + }, + { + "epoch": 6.92842618950793, + "grad_norm": 13.940029946657033, + "learning_rate": 9.419263490405261e-06, + "loss": 0.3669, + "step": 17037 + }, + { + "epoch": 6.928832858885726, + "grad_norm": 1.9461808483805576, + "learning_rate": 9.418252015143512e-06, + "loss": 0.0388, + "step": 17038 + }, + { + "epoch": 6.929239528263522, + "grad_norm": 2.8924104263142296, + "learning_rate": 9.417240545853696e-06, + "loss": 0.0292, + "step": 17039 + }, + { + "epoch": 6.9296461976413175, + "grad_norm": 3.206720894724205, + "learning_rate": 9.4162290825462e-06, + "loss": 0.091, + "step": 17040 + }, + { + "epoch": 6.930052867019113, + "grad_norm": 0.10988344210755621, + "learning_rate": 9.41521762523141e-06, + "loss": 0.0015, + "step": 17041 + }, + { + "epoch": 
6.930459536396909, + "grad_norm": 0.927273223102425, + "learning_rate": 9.414206173919698e-06, + "loss": 0.0142, + "step": 17042 + }, + { + "epoch": 6.930866205774705, + "grad_norm": 3.4404178239896397, + "learning_rate": 9.413194728621462e-06, + "loss": 0.1052, + "step": 17043 + }, + { + "epoch": 6.9312728751525015, + "grad_norm": 6.4915153950821445, + "learning_rate": 9.412183289347073e-06, + "loss": 0.3404, + "step": 17044 + }, + { + "epoch": 6.931679544530297, + "grad_norm": 3.827166905131051, + "learning_rate": 9.411171856106918e-06, + "loss": 0.0821, + "step": 17045 + }, + { + "epoch": 6.932086213908093, + "grad_norm": 0.3697470188440685, + "learning_rate": 9.410160428911383e-06, + "loss": 0.0035, + "step": 17046 + }, + { + "epoch": 6.932492883285889, + "grad_norm": 8.326805780759493, + "learning_rate": 9.409149007770846e-06, + "loss": 0.163, + "step": 17047 + }, + { + "epoch": 6.9328995526636845, + "grad_norm": 0.17308750628869637, + "learning_rate": 9.408137592695692e-06, + "loss": 0.0035, + "step": 17048 + }, + { + "epoch": 6.93330622204148, + "grad_norm": 3.181911081124629, + "learning_rate": 9.407126183696304e-06, + "loss": 0.0651, + "step": 17049 + }, + { + "epoch": 6.933712891419276, + "grad_norm": 0.9026938843859224, + "learning_rate": 9.406114780783066e-06, + "loss": 0.0116, + "step": 17050 + }, + { + "epoch": 6.934119560797072, + "grad_norm": 0.2203482286509067, + "learning_rate": 9.405103383966356e-06, + "loss": 0.0029, + "step": 17051 + }, + { + "epoch": 6.934526230174868, + "grad_norm": 4.933844939004145, + "learning_rate": 9.404091993256559e-06, + "loss": 0.0926, + "step": 17052 + }, + { + "epoch": 6.934932899552663, + "grad_norm": 1.834747359058706, + "learning_rate": 9.40308060866406e-06, + "loss": 0.0172, + "step": 17053 + }, + { + "epoch": 6.935339568930459, + "grad_norm": 8.837402007995934, + "learning_rate": 9.402069230199238e-06, + "loss": 0.2812, + "step": 17054 + }, + { + "epoch": 6.935746238308256, + "grad_norm": 0.7086818164384175, + "learning_rate": 9.401057857872475e-06, + "loss": 0.0105, + "step": 17055 + }, + { + "epoch": 6.9361529076860515, + "grad_norm": 2.7963164295654144, + "learning_rate": 9.400046491694154e-06, + "loss": 0.0355, + "step": 17056 + }, + { + "epoch": 6.936559577063847, + "grad_norm": 10.363342920515985, + "learning_rate": 9.399035131674662e-06, + "loss": 0.1891, + "step": 17057 + }, + { + "epoch": 6.936966246441643, + "grad_norm": 0.07449096317760183, + "learning_rate": 9.39802377782437e-06, + "loss": 0.001, + "step": 17058 + }, + { + "epoch": 6.937372915819439, + "grad_norm": 0.29773655565734686, + "learning_rate": 9.397012430153672e-06, + "loss": 0.0032, + "step": 17059 + }, + { + "epoch": 6.937779585197235, + "grad_norm": 1.005003491857913, + "learning_rate": 9.396001088672944e-06, + "loss": 0.0128, + "step": 17060 + }, + { + "epoch": 6.93818625457503, + "grad_norm": 2.061806710359144, + "learning_rate": 9.394989753392568e-06, + "loss": 0.0553, + "step": 17061 + }, + { + "epoch": 6.938592923952826, + "grad_norm": 5.447757919553144, + "learning_rate": 9.393978424322932e-06, + "loss": 0.103, + "step": 17062 + }, + { + "epoch": 6.938999593330622, + "grad_norm": 3.210087256387607, + "learning_rate": 9.392967101474406e-06, + "loss": 0.0561, + "step": 17063 + }, + { + "epoch": 6.9394062627084185, + "grad_norm": 0.26950822805561975, + "learning_rate": 9.391955784857381e-06, + "loss": 0.0049, + "step": 17064 + }, + { + "epoch": 6.939812932086214, + "grad_norm": 0.04561551870083863, + "learning_rate": 9.390944474482235e-06, + "loss": 0.0006, 
+ "step": 17065 + }, + { + "epoch": 6.94021960146401, + "grad_norm": 1.9403961315046947, + "learning_rate": 9.389933170359349e-06, + "loss": 0.0431, + "step": 17066 + }, + { + "epoch": 6.940626270841806, + "grad_norm": 13.5730723274285, + "learning_rate": 9.388921872499108e-06, + "loss": 0.5421, + "step": 17067 + }, + { + "epoch": 6.941032940219602, + "grad_norm": 5.9369743854871455, + "learning_rate": 9.387910580911891e-06, + "loss": 0.2029, + "step": 17068 + }, + { + "epoch": 6.941439609597397, + "grad_norm": 0.6144718190176113, + "learning_rate": 9.38689929560808e-06, + "loss": 0.008, + "step": 17069 + }, + { + "epoch": 6.941846278975193, + "grad_norm": 8.852180844027469, + "learning_rate": 9.385888016598056e-06, + "loss": 0.5899, + "step": 17070 + }, + { + "epoch": 6.942252948352989, + "grad_norm": 2.7801383452524004, + "learning_rate": 9.3848767438922e-06, + "loss": 0.051, + "step": 17071 + }, + { + "epoch": 6.942659617730785, + "grad_norm": 11.947943732583813, + "learning_rate": 9.383865477500896e-06, + "loss": 0.5459, + "step": 17072 + }, + { + "epoch": 6.94306628710858, + "grad_norm": 0.8390833291607185, + "learning_rate": 9.38285421743452e-06, + "loss": 0.0158, + "step": 17073 + }, + { + "epoch": 6.943472956486376, + "grad_norm": 0.10774738157321749, + "learning_rate": 9.381842963703457e-06, + "loss": 0.0017, + "step": 17074 + }, + { + "epoch": 6.943879625864172, + "grad_norm": 0.17282338170180717, + "learning_rate": 9.380831716318089e-06, + "loss": 0.003, + "step": 17075 + }, + { + "epoch": 6.944286295241969, + "grad_norm": 9.568151907804198, + "learning_rate": 9.379820475288792e-06, + "loss": 0.2959, + "step": 17076 + }, + { + "epoch": 6.944692964619764, + "grad_norm": 4.207648972756583, + "learning_rate": 9.378809240625952e-06, + "loss": 0.0618, + "step": 17077 + }, + { + "epoch": 6.94509963399756, + "grad_norm": 0.03268892290555688, + "learning_rate": 9.377798012339948e-06, + "loss": 0.0003, + "step": 17078 + }, + { + "epoch": 6.945506303375356, + "grad_norm": 2.72211862788794, + "learning_rate": 9.376786790441161e-06, + "loss": 0.0417, + "step": 17079 + }, + { + "epoch": 6.945912972753152, + "grad_norm": 0.9175355638656734, + "learning_rate": 9.37577557493997e-06, + "loss": 0.014, + "step": 17080 + }, + { + "epoch": 6.946319642130947, + "grad_norm": 2.2812672215948258, + "learning_rate": 9.374764365846757e-06, + "loss": 0.0192, + "step": 17081 + }, + { + "epoch": 6.946726311508743, + "grad_norm": 6.772734730074178, + "learning_rate": 9.373753163171903e-06, + "loss": 0.1477, + "step": 17082 + }, + { + "epoch": 6.947132980886539, + "grad_norm": 5.961236791702821, + "learning_rate": 9.372741966925785e-06, + "loss": 0.1738, + "step": 17083 + }, + { + "epoch": 6.947539650264335, + "grad_norm": 0.4455110032978684, + "learning_rate": 9.371730777118791e-06, + "loss": 0.0071, + "step": 17084 + }, + { + "epoch": 6.947946319642131, + "grad_norm": 0.35592822628300985, + "learning_rate": 9.370719593761296e-06, + "loss": 0.0036, + "step": 17085 + }, + { + "epoch": 6.948352989019927, + "grad_norm": 3.5813142039042862, + "learning_rate": 9.369708416863677e-06, + "loss": 0.028, + "step": 17086 + }, + { + "epoch": 6.948759658397723, + "grad_norm": 9.357384925869658, + "learning_rate": 9.368697246436322e-06, + "loss": 0.1943, + "step": 17087 + }, + { + "epoch": 6.949166327775519, + "grad_norm": 2.092871331319733, + "learning_rate": 9.367686082489606e-06, + "loss": 0.0321, + "step": 17088 + }, + { + "epoch": 6.9495729971533144, + "grad_norm": 2.979587877225463, + "learning_rate": 
9.366674925033911e-06, + "loss": 0.0268, + "step": 17089 + }, + { + "epoch": 6.94997966653111, + "grad_norm": 4.6426144100194735, + "learning_rate": 9.365663774079617e-06, + "loss": 0.0728, + "step": 17090 + }, + { + "epoch": 6.950386335908906, + "grad_norm": 0.019626512547604075, + "learning_rate": 9.364652629637104e-06, + "loss": 0.0003, + "step": 17091 + }, + { + "epoch": 6.950793005286702, + "grad_norm": 11.807962995580699, + "learning_rate": 9.363641491716751e-06, + "loss": 0.4895, + "step": 17092 + }, + { + "epoch": 6.9511996746644975, + "grad_norm": 1.119601843698592, + "learning_rate": 9.362630360328937e-06, + "loss": 0.0131, + "step": 17093 + }, + { + "epoch": 6.951606344042293, + "grad_norm": 11.03220706352786, + "learning_rate": 9.361619235484045e-06, + "loss": 0.3524, + "step": 17094 + }, + { + "epoch": 6.952013013420089, + "grad_norm": 0.6711137439484113, + "learning_rate": 9.360608117192453e-06, + "loss": 0.0056, + "step": 17095 + }, + { + "epoch": 6.952419682797886, + "grad_norm": 13.770689435907746, + "learning_rate": 9.35959700546454e-06, + "loss": 0.723, + "step": 17096 + }, + { + "epoch": 6.9528263521756815, + "grad_norm": 0.004864280452105943, + "learning_rate": 9.358585900310685e-06, + "loss": 0.0001, + "step": 17097 + }, + { + "epoch": 6.953233021553477, + "grad_norm": 1.2244412109319445, + "learning_rate": 9.357574801741271e-06, + "loss": 0.0384, + "step": 17098 + }, + { + "epoch": 6.953639690931273, + "grad_norm": 0.44695204715575043, + "learning_rate": 9.35656370976667e-06, + "loss": 0.0048, + "step": 17099 + }, + { + "epoch": 6.954046360309069, + "grad_norm": 1.1489965006811536, + "learning_rate": 9.355552624397272e-06, + "loss": 0.0146, + "step": 17100 + }, + { + "epoch": 6.9544530296868645, + "grad_norm": 12.935910182518102, + "learning_rate": 9.354541545643448e-06, + "loss": 0.2744, + "step": 17101 + }, + { + "epoch": 6.95485969906466, + "grad_norm": 0.1974579415091818, + "learning_rate": 9.353530473515578e-06, + "loss": 0.0029, + "step": 17102 + }, + { + "epoch": 6.955266368442456, + "grad_norm": 18.629431171824635, + "learning_rate": 9.352519408024047e-06, + "loss": 0.8718, + "step": 17103 + }, + { + "epoch": 6.955673037820252, + "grad_norm": 5.024656541643442, + "learning_rate": 9.35150834917923e-06, + "loss": 0.0899, + "step": 17104 + }, + { + "epoch": 6.9560797071980485, + "grad_norm": 5.835994765499896, + "learning_rate": 9.350497296991505e-06, + "loss": 0.1182, + "step": 17105 + }, + { + "epoch": 6.956486376575844, + "grad_norm": 0.9833968469366338, + "learning_rate": 9.349486251471249e-06, + "loss": 0.0167, + "step": 17106 + }, + { + "epoch": 6.95689304595364, + "grad_norm": 0.23173062029260008, + "learning_rate": 9.348475212628848e-06, + "loss": 0.0034, + "step": 17107 + }, + { + "epoch": 6.957299715331436, + "grad_norm": 3.560754613785087, + "learning_rate": 9.347464180474676e-06, + "loss": 0.0401, + "step": 17108 + }, + { + "epoch": 6.9577063847092315, + "grad_norm": 3.773064897165207, + "learning_rate": 9.34645315501911e-06, + "loss": 0.0874, + "step": 17109 + }, + { + "epoch": 6.958113054087027, + "grad_norm": 1.915727862909941, + "learning_rate": 9.345442136272534e-06, + "loss": 0.0136, + "step": 17110 + }, + { + "epoch": 6.958519723464823, + "grad_norm": 0.7079133827956643, + "learning_rate": 9.344431124245325e-06, + "loss": 0.0089, + "step": 17111 + }, + { + "epoch": 6.958926392842619, + "grad_norm": 4.54891677264771, + "learning_rate": 9.343420118947857e-06, + "loss": 0.1349, + "step": 17112 + }, + { + "epoch": 6.959333062220415, + "grad_norm": 
2.947988311236163, + "learning_rate": 9.342409120390514e-06, + "loss": 0.0684, + "step": 17113 + }, + { + "epoch": 6.95973973159821, + "grad_norm": 0.7293449876857523, + "learning_rate": 9.341398128583673e-06, + "loss": 0.0145, + "step": 17114 + }, + { + "epoch": 6.960146400976006, + "grad_norm": 6.497043016543517, + "learning_rate": 9.340387143537711e-06, + "loss": 0.1259, + "step": 17115 + }, + { + "epoch": 6.960553070353802, + "grad_norm": 2.3573687945650144, + "learning_rate": 9.339376165263006e-06, + "loss": 0.0284, + "step": 17116 + }, + { + "epoch": 6.9609597397315985, + "grad_norm": 3.8697569023889096, + "learning_rate": 9.338365193769938e-06, + "loss": 0.1173, + "step": 17117 + }, + { + "epoch": 6.961366409109394, + "grad_norm": 1.2444396030521092, + "learning_rate": 9.337354229068885e-06, + "loss": 0.0193, + "step": 17118 + }, + { + "epoch": 6.96177307848719, + "grad_norm": 4.507815715266565, + "learning_rate": 9.336343271170222e-06, + "loss": 0.1052, + "step": 17119 + }, + { + "epoch": 6.962179747864986, + "grad_norm": 7.414106319270864, + "learning_rate": 9.335332320084331e-06, + "loss": 0.1787, + "step": 17120 + }, + { + "epoch": 6.962586417242782, + "grad_norm": 0.39040224615837366, + "learning_rate": 9.33432137582159e-06, + "loss": 0.0028, + "step": 17121 + }, + { + "epoch": 6.962993086620577, + "grad_norm": 4.185390942395094, + "learning_rate": 9.333310438392371e-06, + "loss": 0.0781, + "step": 17122 + }, + { + "epoch": 6.963399755998373, + "grad_norm": 6.7820535443930945, + "learning_rate": 9.33229950780706e-06, + "loss": 0.242, + "step": 17123 + }, + { + "epoch": 6.963806425376169, + "grad_norm": 0.6443001877074219, + "learning_rate": 9.331288584076028e-06, + "loss": 0.0111, + "step": 17124 + }, + { + "epoch": 6.964213094753965, + "grad_norm": 4.549090394664831, + "learning_rate": 9.330277667209655e-06, + "loss": 0.0702, + "step": 17125 + }, + { + "epoch": 6.964619764131761, + "grad_norm": 2.0330790159249026, + "learning_rate": 9.329266757218321e-06, + "loss": 0.0595, + "step": 17126 + }, + { + "epoch": 6.965026433509557, + "grad_norm": 8.454548621345683, + "learning_rate": 9.3282558541124e-06, + "loss": 0.3606, + "step": 17127 + }, + { + "epoch": 6.965433102887353, + "grad_norm": 9.631540015055991, + "learning_rate": 9.32724495790227e-06, + "loss": 0.304, + "step": 17128 + }, + { + "epoch": 6.965839772265149, + "grad_norm": 1.5048117356283461, + "learning_rate": 9.32623406859831e-06, + "loss": 0.0231, + "step": 17129 + }, + { + "epoch": 6.966246441642944, + "grad_norm": 4.569452034612071, + "learning_rate": 9.325223186210896e-06, + "loss": 0.0621, + "step": 17130 + }, + { + "epoch": 6.96665311102074, + "grad_norm": 0.024050916275347542, + "learning_rate": 9.324212310750407e-06, + "loss": 0.0002, + "step": 17131 + }, + { + "epoch": 6.967059780398536, + "grad_norm": 5.184858224781634, + "learning_rate": 9.323201442227216e-06, + "loss": 0.0977, + "step": 17132 + }, + { + "epoch": 6.967466449776332, + "grad_norm": 9.660718617457658, + "learning_rate": 9.322190580651705e-06, + "loss": 0.4106, + "step": 17133 + }, + { + "epoch": 6.967873119154127, + "grad_norm": 2.531604360498171, + "learning_rate": 9.321179726034249e-06, + "loss": 0.0599, + "step": 17134 + }, + { + "epoch": 6.968279788531923, + "grad_norm": 0.25921412121194776, + "learning_rate": 9.320168878385223e-06, + "loss": 0.0042, + "step": 17135 + }, + { + "epoch": 6.968686457909719, + "grad_norm": 5.919730921941484, + "learning_rate": 9.319158037715007e-06, + "loss": 0.1796, + "step": 17136 + }, + { + "epoch": 
6.969093127287516, + "grad_norm": 9.761714175396108, + "learning_rate": 9.318147204033977e-06, + "loss": 0.3104, + "step": 17137 + }, + { + "epoch": 6.969499796665311, + "grad_norm": 0.28730928982909826, + "learning_rate": 9.317136377352507e-06, + "loss": 0.0035, + "step": 17138 + }, + { + "epoch": 6.969906466043107, + "grad_norm": 0.7300572341804205, + "learning_rate": 9.316125557680977e-06, + "loss": 0.0123, + "step": 17139 + }, + { + "epoch": 6.970313135420903, + "grad_norm": 0.37622727385600513, + "learning_rate": 9.315114745029763e-06, + "loss": 0.0052, + "step": 17140 + }, + { + "epoch": 6.970719804798699, + "grad_norm": 0.02150006773010746, + "learning_rate": 9.314103939409238e-06, + "loss": 0.0003, + "step": 17141 + }, + { + "epoch": 6.9711264741764944, + "grad_norm": 7.739478056508467, + "learning_rate": 9.313093140829785e-06, + "loss": 0.162, + "step": 17142 + }, + { + "epoch": 6.97153314355429, + "grad_norm": 1.1001001020748042, + "learning_rate": 9.312082349301775e-06, + "loss": 0.0165, + "step": 17143 + }, + { + "epoch": 6.971939812932086, + "grad_norm": 5.004306810402813, + "learning_rate": 9.311071564835586e-06, + "loss": 0.2295, + "step": 17144 + }, + { + "epoch": 6.972346482309882, + "grad_norm": 8.840948267636168, + "learning_rate": 9.310060787441592e-06, + "loss": 0.2603, + "step": 17145 + }, + { + "epoch": 6.972753151687678, + "grad_norm": 8.451840484888654, + "learning_rate": 9.309050017130176e-06, + "loss": 0.1595, + "step": 17146 + }, + { + "epoch": 6.973159821065474, + "grad_norm": 1.0857246089455637, + "learning_rate": 9.308039253911707e-06, + "loss": 0.0138, + "step": 17147 + }, + { + "epoch": 6.97356649044327, + "grad_norm": 3.2242750263687, + "learning_rate": 9.307028497796561e-06, + "loss": 0.0995, + "step": 17148 + }, + { + "epoch": 6.973973159821066, + "grad_norm": 3.4381359769650737, + "learning_rate": 9.306017748795118e-06, + "loss": 0.1506, + "step": 17149 + }, + { + "epoch": 6.9743798291988615, + "grad_norm": 3.820683961779545, + "learning_rate": 9.305007006917753e-06, + "loss": 0.0693, + "step": 17150 + }, + { + "epoch": 6.974786498576657, + "grad_norm": 0.11075806287607093, + "learning_rate": 9.303996272174839e-06, + "loss": 0.0018, + "step": 17151 + }, + { + "epoch": 6.975193167954453, + "grad_norm": 9.050460487258093, + "learning_rate": 9.302985544576755e-06, + "loss": 0.321, + "step": 17152 + }, + { + "epoch": 6.975599837332249, + "grad_norm": 3.407817640132888, + "learning_rate": 9.301974824133876e-06, + "loss": 0.0908, + "step": 17153 + }, + { + "epoch": 6.9760065067100445, + "grad_norm": 2.132586762589616, + "learning_rate": 9.300964110856576e-06, + "loss": 0.0308, + "step": 17154 + }, + { + "epoch": 6.97641317608784, + "grad_norm": 5.031269568703636, + "learning_rate": 9.299953404755228e-06, + "loss": 0.0782, + "step": 17155 + }, + { + "epoch": 6.976819845465636, + "grad_norm": 0.2749716927701595, + "learning_rate": 9.298942705840215e-06, + "loss": 0.0034, + "step": 17156 + }, + { + "epoch": 6.977226514843432, + "grad_norm": 1.100198960764319, + "learning_rate": 9.297932014121908e-06, + "loss": 0.0169, + "step": 17157 + }, + { + "epoch": 6.9776331842212285, + "grad_norm": 0.7759285104986529, + "learning_rate": 9.296921329610679e-06, + "loss": 0.0105, + "step": 17158 + }, + { + "epoch": 6.978039853599024, + "grad_norm": 1.856555121921403, + "learning_rate": 9.29591065231691e-06, + "loss": 0.0243, + "step": 17159 + }, + { + "epoch": 6.97844652297682, + "grad_norm": 4.767750899179098, + "learning_rate": 9.294899982250968e-06, + "loss": 0.1291, + 
"step": 17160 + }, + { + "epoch": 6.978853192354616, + "grad_norm": 0.029160400395579184, + "learning_rate": 9.293889319423235e-06, + "loss": 0.0004, + "step": 17161 + }, + { + "epoch": 6.9792598617324115, + "grad_norm": 0.034646989580294024, + "learning_rate": 9.292878663844088e-06, + "loss": 0.0004, + "step": 17162 + }, + { + "epoch": 6.979666531110207, + "grad_norm": 2.283484782964325, + "learning_rate": 9.291868015523892e-06, + "loss": 0.0549, + "step": 17163 + }, + { + "epoch": 6.980073200488003, + "grad_norm": 0.5488280397292915, + "learning_rate": 9.290857374473029e-06, + "loss": 0.0103, + "step": 17164 + }, + { + "epoch": 6.980479869865799, + "grad_norm": 5.452454103670932, + "learning_rate": 9.28984674070187e-06, + "loss": 0.117, + "step": 17165 + }, + { + "epoch": 6.980886539243595, + "grad_norm": 0.41945771377773367, + "learning_rate": 9.288836114220792e-06, + "loss": 0.0045, + "step": 17166 + }, + { + "epoch": 6.981293208621391, + "grad_norm": 0.8630059651753923, + "learning_rate": 9.28782549504017e-06, + "loss": 0.0139, + "step": 17167 + }, + { + "epoch": 6.981699877999187, + "grad_norm": 8.451045540089503, + "learning_rate": 9.28681488317038e-06, + "loss": 0.1544, + "step": 17168 + }, + { + "epoch": 6.982106547376983, + "grad_norm": 11.092822979422587, + "learning_rate": 9.28580427862179e-06, + "loss": 1.0307, + "step": 17169 + }, + { + "epoch": 6.9825132167547785, + "grad_norm": 12.047603277322862, + "learning_rate": 9.28479368140478e-06, + "loss": 0.373, + "step": 17170 + }, + { + "epoch": 6.982919886132574, + "grad_norm": 1.8019241059846685, + "learning_rate": 9.283783091529724e-06, + "loss": 0.0286, + "step": 17171 + }, + { + "epoch": 6.98332655551037, + "grad_norm": 3.0846944913823795, + "learning_rate": 9.282772509006996e-06, + "loss": 0.047, + "step": 17172 + }, + { + "epoch": 6.983733224888166, + "grad_norm": 1.8129867706336198, + "learning_rate": 9.281761933846965e-06, + "loss": 0.0254, + "step": 17173 + }, + { + "epoch": 6.984139894265962, + "grad_norm": 7.939974722672996, + "learning_rate": 9.280751366060012e-06, + "loss": 0.3122, + "step": 17174 + }, + { + "epoch": 6.984546563643757, + "grad_norm": 4.375650392397832, + "learning_rate": 9.279740805656509e-06, + "loss": 0.1284, + "step": 17175 + }, + { + "epoch": 6.984953233021553, + "grad_norm": 5.084396758644586, + "learning_rate": 9.278730252646826e-06, + "loss": 0.1921, + "step": 17176 + }, + { + "epoch": 6.985359902399349, + "grad_norm": 3.2398969331279472, + "learning_rate": 9.277719707041343e-06, + "loss": 0.0415, + "step": 17177 + }, + { + "epoch": 6.9857665717771456, + "grad_norm": 3.3839262666519008, + "learning_rate": 9.276709168850431e-06, + "loss": 0.057, + "step": 17178 + }, + { + "epoch": 6.986173241154941, + "grad_norm": 0.20858732394974516, + "learning_rate": 9.27569863808446e-06, + "loss": 0.0032, + "step": 17179 + }, + { + "epoch": 6.986579910532737, + "grad_norm": 5.963563119931793, + "learning_rate": 9.27468811475381e-06, + "loss": 0.1035, + "step": 17180 + }, + { + "epoch": 6.986986579910533, + "grad_norm": 0.10532567842843958, + "learning_rate": 9.273677598868851e-06, + "loss": 0.0018, + "step": 17181 + }, + { + "epoch": 6.987393249288329, + "grad_norm": 4.384983488927723, + "learning_rate": 9.272667090439956e-06, + "loss": 0.065, + "step": 17182 + }, + { + "epoch": 6.987799918666124, + "grad_norm": 0.9281428083962938, + "learning_rate": 9.2716565894775e-06, + "loss": 0.017, + "step": 17183 + }, + { + "epoch": 6.98820658804392, + "grad_norm": 7.715614880867074, + "learning_rate": 
9.270646095991857e-06, + "loss": 0.434, + "step": 17184 + }, + { + "epoch": 6.988613257421716, + "grad_norm": 0.018891045251184943, + "learning_rate": 9.269635609993399e-06, + "loss": 0.0002, + "step": 17185 + }, + { + "epoch": 6.989019926799512, + "grad_norm": 7.879299662275929, + "learning_rate": 9.268625131492495e-06, + "loss": 0.1695, + "step": 17186 + }, + { + "epoch": 6.989426596177308, + "grad_norm": 9.864591165466566, + "learning_rate": 9.267614660499525e-06, + "loss": 0.3036, + "step": 17187 + }, + { + "epoch": 6.989833265555104, + "grad_norm": 9.594766753017902, + "learning_rate": 9.26660419702486e-06, + "loss": 0.2896, + "step": 17188 + }, + { + "epoch": 6.9902399349329, + "grad_norm": 2.0717998025404194, + "learning_rate": 9.265593741078869e-06, + "loss": 0.0303, + "step": 17189 + }, + { + "epoch": 6.990646604310696, + "grad_norm": 0.29220057686047946, + "learning_rate": 9.264583292671931e-06, + "loss": 0.0039, + "step": 17190 + }, + { + "epoch": 6.991053273688491, + "grad_norm": 1.1257437556456746, + "learning_rate": 9.263572851814413e-06, + "loss": 0.0169, + "step": 17191 + }, + { + "epoch": 6.991459943066287, + "grad_norm": 7.374333851388123, + "learning_rate": 9.262562418516692e-06, + "loss": 0.1937, + "step": 17192 + }, + { + "epoch": 6.991866612444083, + "grad_norm": 4.242445104194234, + "learning_rate": 9.261551992789139e-06, + "loss": 0.0849, + "step": 17193 + }, + { + "epoch": 6.992273281821879, + "grad_norm": 0.031309749936490175, + "learning_rate": 9.260541574642126e-06, + "loss": 0.0005, + "step": 17194 + }, + { + "epoch": 6.9926799511996744, + "grad_norm": 0.29968850020531057, + "learning_rate": 9.259531164086028e-06, + "loss": 0.0051, + "step": 17195 + }, + { + "epoch": 6.99308662057747, + "grad_norm": 3.6548566814105223, + "learning_rate": 9.258520761131212e-06, + "loss": 0.0827, + "step": 17196 + }, + { + "epoch": 6.993493289955266, + "grad_norm": 1.375639694104705, + "learning_rate": 9.257510365788054e-06, + "loss": 0.0238, + "step": 17197 + }, + { + "epoch": 6.993899959333062, + "grad_norm": 2.9331134755073087, + "learning_rate": 9.256499978066927e-06, + "loss": 0.0867, + "step": 17198 + }, + { + "epoch": 6.994306628710858, + "grad_norm": 2.0059367946580653, + "learning_rate": 9.2554895979782e-06, + "loss": 0.0304, + "step": 17199 + }, + { + "epoch": 6.994713298088654, + "grad_norm": 0.14683207282141053, + "learning_rate": 9.25447922553225e-06, + "loss": 0.0022, + "step": 17200 + }, + { + "epoch": 6.99511996746645, + "grad_norm": 2.553765970479559, + "learning_rate": 9.253468860739446e-06, + "loss": 0.049, + "step": 17201 + }, + { + "epoch": 6.995526636844246, + "grad_norm": 0.32894074876141277, + "learning_rate": 9.252458503610156e-06, + "loss": 0.0039, + "step": 17202 + }, + { + "epoch": 6.9959333062220415, + "grad_norm": 3.7408668441790804, + "learning_rate": 9.251448154154759e-06, + "loss": 0.1029, + "step": 17203 + }, + { + "epoch": 6.996339975599837, + "grad_norm": 6.693260916516429, + "learning_rate": 9.250437812383623e-06, + "loss": 0.1211, + "step": 17204 + }, + { + "epoch": 6.996746644977633, + "grad_norm": 8.523380903972642, + "learning_rate": 9.249427478307117e-06, + "loss": 0.2328, + "step": 17205 + }, + { + "epoch": 6.997153314355429, + "grad_norm": 2.3586571529672744, + "learning_rate": 9.24841715193562e-06, + "loss": 0.0574, + "step": 17206 + }, + { + "epoch": 6.9975599837332245, + "grad_norm": 0.5061486186322941, + "learning_rate": 9.247406833279497e-06, + "loss": 0.0063, + "step": 17207 + }, + { + "epoch": 6.997966653111021, + "grad_norm": 
1.6369118663337066, + "learning_rate": 9.246396522349122e-06, + "loss": 0.0217, + "step": 17208 + }, + { + "epoch": 6.998373322488817, + "grad_norm": 0.04233119784526262, + "learning_rate": 9.245386219154865e-06, + "loss": 0.0006, + "step": 17209 + }, + { + "epoch": 6.998779991866613, + "grad_norm": 6.980400644406673, + "learning_rate": 9.2443759237071e-06, + "loss": 0.1164, + "step": 17210 + }, + { + "epoch": 6.9991866612444085, + "grad_norm": 9.05365809693777, + "learning_rate": 9.243365636016198e-06, + "loss": 0.2537, + "step": 17211 + }, + { + "epoch": 6.999593330622204, + "grad_norm": 6.132341843437032, + "learning_rate": 9.242355356092524e-06, + "loss": 0.1674, + "step": 17212 + }, + { + "epoch": 7.0, + "grad_norm": 14.034319203167533, + "learning_rate": 9.241345083946456e-06, + "loss": 0.8464, + "step": 17213 + }, + { + "epoch": 7.000406669377796, + "grad_norm": 0.06054170271389447, + "learning_rate": 9.240334819588363e-06, + "loss": 0.0009, + "step": 17214 + }, + { + "epoch": 7.0008133387555915, + "grad_norm": 0.7257124544156636, + "learning_rate": 9.239324563028613e-06, + "loss": 0.0104, + "step": 17215 + }, + { + "epoch": 7.001220008133387, + "grad_norm": 4.453743067914153, + "learning_rate": 9.238314314277581e-06, + "loss": 0.073, + "step": 17216 + }, + { + "epoch": 7.001626677511183, + "grad_norm": 0.13005156728541475, + "learning_rate": 9.237304073345638e-06, + "loss": 0.0031, + "step": 17217 + }, + { + "epoch": 7.002033346888979, + "grad_norm": 0.1325498728345356, + "learning_rate": 9.236293840243149e-06, + "loss": 0.0016, + "step": 17218 + }, + { + "epoch": 7.0024400162667755, + "grad_norm": 0.4494947480005574, + "learning_rate": 9.23528361498049e-06, + "loss": 0.005, + "step": 17219 + }, + { + "epoch": 7.002846685644571, + "grad_norm": 0.044458292735587525, + "learning_rate": 9.23427339756803e-06, + "loss": 0.0008, + "step": 17220 + }, + { + "epoch": 7.003253355022367, + "grad_norm": 5.504019028954592, + "learning_rate": 9.233263188016138e-06, + "loss": 0.1219, + "step": 17221 + }, + { + "epoch": 7.003660024400163, + "grad_norm": 4.948067719667471, + "learning_rate": 9.232252986335185e-06, + "loss": 0.0887, + "step": 17222 + }, + { + "epoch": 7.0040666937779585, + "grad_norm": 6.824139645341266, + "learning_rate": 9.231242792535543e-06, + "loss": 0.0992, + "step": 17223 + }, + { + "epoch": 7.004473363155754, + "grad_norm": 5.514505151476684, + "learning_rate": 9.230232606627578e-06, + "loss": 0.0914, + "step": 17224 + }, + { + "epoch": 7.00488003253355, + "grad_norm": 0.6182562412725957, + "learning_rate": 9.229222428621663e-06, + "loss": 0.006, + "step": 17225 + }, + { + "epoch": 7.005286701911346, + "grad_norm": 0.05407252998192, + "learning_rate": 9.22821225852817e-06, + "loss": 0.0009, + "step": 17226 + }, + { + "epoch": 7.005693371289142, + "grad_norm": 0.3611999669100632, + "learning_rate": 9.227202096357466e-06, + "loss": 0.0042, + "step": 17227 + }, + { + "epoch": 7.006100040666937, + "grad_norm": 0.7642075823631498, + "learning_rate": 9.22619194211992e-06, + "loss": 0.0116, + "step": 17228 + }, + { + "epoch": 7.006506710044734, + "grad_norm": 0.4564977677484747, + "learning_rate": 9.225181795825903e-06, + "loss": 0.0068, + "step": 17229 + }, + { + "epoch": 7.00691337942253, + "grad_norm": 0.1580281390684062, + "learning_rate": 9.224171657485787e-06, + "loss": 0.0023, + "step": 17230 + }, + { + "epoch": 7.0073200488003256, + "grad_norm": 0.3414048099472107, + "learning_rate": 9.223161527109938e-06, + "loss": 0.0048, + "step": 17231 + }, + { + "epoch": 
7.007726718178121, + "grad_norm": 1.326285487013392, + "learning_rate": 9.222151404708728e-06, + "loss": 0.0208, + "step": 17232 + }, + { + "epoch": 7.008133387555917, + "grad_norm": 0.2239395140509874, + "learning_rate": 9.221141290292524e-06, + "loss": 0.0018, + "step": 17233 + }, + { + "epoch": 7.008540056933713, + "grad_norm": 2.2162494731379088, + "learning_rate": 9.220131183871698e-06, + "loss": 0.0194, + "step": 17234 + }, + { + "epoch": 7.008946726311509, + "grad_norm": 8.91189918337079, + "learning_rate": 9.219121085456616e-06, + "loss": 0.2562, + "step": 17235 + }, + { + "epoch": 7.009353395689304, + "grad_norm": 0.07368244346117238, + "learning_rate": 9.218110995057652e-06, + "loss": 0.0009, + "step": 17236 + }, + { + "epoch": 7.0097600650671, + "grad_norm": 0.33314638585746, + "learning_rate": 9.21710091268517e-06, + "loss": 0.007, + "step": 17237 + }, + { + "epoch": 7.010166734444896, + "grad_norm": 0.021740346813436076, + "learning_rate": 9.216090838349539e-06, + "loss": 0.0004, + "step": 17238 + }, + { + "epoch": 7.010573403822693, + "grad_norm": 2.438459630234442, + "learning_rate": 9.215080772061134e-06, + "loss": 0.0416, + "step": 17239 + }, + { + "epoch": 7.010980073200488, + "grad_norm": 2.5327694382123602, + "learning_rate": 9.214070713830318e-06, + "loss": 0.0338, + "step": 17240 + }, + { + "epoch": 7.011386742578284, + "grad_norm": 5.127053642740805, + "learning_rate": 9.213060663667462e-06, + "loss": 0.0765, + "step": 17241 + }, + { + "epoch": 7.01179341195608, + "grad_norm": 7.0661749327786785, + "learning_rate": 9.212050621582934e-06, + "loss": 0.2414, + "step": 17242 + }, + { + "epoch": 7.012200081333876, + "grad_norm": 1.7079364875952718, + "learning_rate": 9.211040587587104e-06, + "loss": 0.013, + "step": 17243 + }, + { + "epoch": 7.012606750711671, + "grad_norm": 0.15659974792293765, + "learning_rate": 9.210030561690338e-06, + "loss": 0.0025, + "step": 17244 + }, + { + "epoch": 7.013013420089467, + "grad_norm": 0.32464187184131643, + "learning_rate": 9.209020543903007e-06, + "loss": 0.0041, + "step": 17245 + }, + { + "epoch": 7.013420089467263, + "grad_norm": 1.5053337350168223, + "learning_rate": 9.208010534235479e-06, + "loss": 0.0223, + "step": 17246 + }, + { + "epoch": 7.013826758845059, + "grad_norm": 2.5512502832234203, + "learning_rate": 9.207000532698121e-06, + "loss": 0.0458, + "step": 17247 + }, + { + "epoch": 7.0142334282228544, + "grad_norm": 0.16825408488421492, + "learning_rate": 9.205990539301299e-06, + "loss": 0.0023, + "step": 17248 + }, + { + "epoch": 7.014640097600651, + "grad_norm": 0.15988582390956058, + "learning_rate": 9.204980554055385e-06, + "loss": 0.0021, + "step": 17249 + }, + { + "epoch": 7.015046766978447, + "grad_norm": 2.319302302166836, + "learning_rate": 9.20397057697075e-06, + "loss": 0.0273, + "step": 17250 + }, + { + "epoch": 7.015453436356243, + "grad_norm": 0.023639891053863413, + "learning_rate": 9.202960608057752e-06, + "loss": 0.0003, + "step": 17251 + }, + { + "epoch": 7.015860105734038, + "grad_norm": 0.6892360438457941, + "learning_rate": 9.201950647326765e-06, + "loss": 0.0078, + "step": 17252 + }, + { + "epoch": 7.016266775111834, + "grad_norm": 0.7646272057029555, + "learning_rate": 9.20094069478816e-06, + "loss": 0.0102, + "step": 17253 + }, + { + "epoch": 7.01667344448963, + "grad_norm": 0.8861573300118668, + "learning_rate": 9.199930750452298e-06, + "loss": 0.0116, + "step": 17254 + }, + { + "epoch": 7.017080113867426, + "grad_norm": 0.9100896283312054, + "learning_rate": 9.198920814329551e-06, + "loss": 
0.0125, + "step": 17255 + }, + { + "epoch": 7.0174867832452215, + "grad_norm": 14.356872981931842, + "learning_rate": 9.197910886430284e-06, + "loss": 0.4203, + "step": 17256 + }, + { + "epoch": 7.017893452623017, + "grad_norm": 0.5241829594810691, + "learning_rate": 9.196900966764864e-06, + "loss": 0.0066, + "step": 17257 + }, + { + "epoch": 7.018300122000813, + "grad_norm": 3.7511081471984022, + "learning_rate": 9.195891055343662e-06, + "loss": 0.1716, + "step": 17258 + }, + { + "epoch": 7.018706791378609, + "grad_norm": 2.20488614813792, + "learning_rate": 9.194881152177044e-06, + "loss": 0.0464, + "step": 17259 + }, + { + "epoch": 7.019113460756405, + "grad_norm": 9.255208553817967, + "learning_rate": 9.193871257275376e-06, + "loss": 0.2735, + "step": 17260 + }, + { + "epoch": 7.019520130134201, + "grad_norm": 0.7577507952550049, + "learning_rate": 9.192861370649022e-06, + "loss": 0.012, + "step": 17261 + }, + { + "epoch": 7.019926799511997, + "grad_norm": 0.23106126285367964, + "learning_rate": 9.19185149230836e-06, + "loss": 0.0029, + "step": 17262 + }, + { + "epoch": 7.020333468889793, + "grad_norm": 0.34978324606918465, + "learning_rate": 9.190841622263741e-06, + "loss": 0.0043, + "step": 17263 + }, + { + "epoch": 7.0207401382675885, + "grad_norm": 0.9706611422141898, + "learning_rate": 9.189831760525543e-06, + "loss": 0.0124, + "step": 17264 + }, + { + "epoch": 7.021146807645384, + "grad_norm": 4.558390078881623, + "learning_rate": 9.188821907104129e-06, + "loss": 0.0607, + "step": 17265 + }, + { + "epoch": 7.02155347702318, + "grad_norm": 7.546708007988739, + "learning_rate": 9.187812062009866e-06, + "loss": 0.2416, + "step": 17266 + }, + { + "epoch": 7.021960146400976, + "grad_norm": 8.319314864618127, + "learning_rate": 9.186802225253122e-06, + "loss": 0.1966, + "step": 17267 + }, + { + "epoch": 7.0223668157787715, + "grad_norm": 1.3151183336898618, + "learning_rate": 9.185792396844262e-06, + "loss": 0.0165, + "step": 17268 + }, + { + "epoch": 7.022773485156567, + "grad_norm": 2.4751541379029236, + "learning_rate": 9.184782576793651e-06, + "loss": 0.045, + "step": 17269 + }, + { + "epoch": 7.023180154534364, + "grad_norm": 1.641849947466558, + "learning_rate": 9.183772765111661e-06, + "loss": 0.0307, + "step": 17270 + }, + { + "epoch": 7.02358682391216, + "grad_norm": 0.7174204824643483, + "learning_rate": 9.182762961808652e-06, + "loss": 0.0172, + "step": 17271 + }, + { + "epoch": 7.0239934932899555, + "grad_norm": 0.01410564065792769, + "learning_rate": 9.181753166894992e-06, + "loss": 0.0002, + "step": 17272 + }, + { + "epoch": 7.024400162667751, + "grad_norm": 4.4918948841159105, + "learning_rate": 9.180743380381047e-06, + "loss": 0.0879, + "step": 17273 + }, + { + "epoch": 7.024806832045547, + "grad_norm": 4.1419555570794335, + "learning_rate": 9.179733602277187e-06, + "loss": 0.0703, + "step": 17274 + }, + { + "epoch": 7.025213501423343, + "grad_norm": 0.00583909373577879, + "learning_rate": 9.178723832593772e-06, + "loss": 0.0001, + "step": 17275 + }, + { + "epoch": 7.0256201708011385, + "grad_norm": 0.015179878571788541, + "learning_rate": 9.177714071341169e-06, + "loss": 0.0003, + "step": 17276 + }, + { + "epoch": 7.026026840178934, + "grad_norm": 0.6699999328571296, + "learning_rate": 9.176704318529747e-06, + "loss": 0.0082, + "step": 17277 + }, + { + "epoch": 7.02643350955673, + "grad_norm": 0.22623031521474068, + "learning_rate": 9.17569457416987e-06, + "loss": 0.002, + "step": 17278 + }, + { + "epoch": 7.026840178934526, + "grad_norm": 2.3060497816008914, + 
"learning_rate": 9.1746848382719e-06, + "loss": 0.0276, + "step": 17279 + }, + { + "epoch": 7.0272468483123225, + "grad_norm": 5.998668328228737, + "learning_rate": 9.17367511084621e-06, + "loss": 0.1096, + "step": 17280 + }, + { + "epoch": 7.027653517690118, + "grad_norm": 4.364850870029855, + "learning_rate": 9.172665391903158e-06, + "loss": 0.0733, + "step": 17281 + }, + { + "epoch": 7.028060187067914, + "grad_norm": 1.9997219591878983, + "learning_rate": 9.17165568145311e-06, + "loss": 0.0428, + "step": 17282 + }, + { + "epoch": 7.02846685644571, + "grad_norm": 0.13217129774179365, + "learning_rate": 9.170645979506437e-06, + "loss": 0.0011, + "step": 17283 + }, + { + "epoch": 7.0288735258235056, + "grad_norm": 2.5762574496934336, + "learning_rate": 9.169636286073499e-06, + "loss": 0.048, + "step": 17284 + }, + { + "epoch": 7.029280195201301, + "grad_norm": 3.9753407258142013, + "learning_rate": 9.168626601164662e-06, + "loss": 0.0749, + "step": 17285 + }, + { + "epoch": 7.029686864579097, + "grad_norm": 8.818192455068015, + "learning_rate": 9.167616924790293e-06, + "loss": 0.108, + "step": 17286 + }, + { + "epoch": 7.030093533956893, + "grad_norm": 0.18821145682233983, + "learning_rate": 9.166607256960754e-06, + "loss": 0.0024, + "step": 17287 + }, + { + "epoch": 7.030500203334689, + "grad_norm": 1.5891128540498758, + "learning_rate": 9.165597597686411e-06, + "loss": 0.0227, + "step": 17288 + }, + { + "epoch": 7.030906872712484, + "grad_norm": 2.953114423980457, + "learning_rate": 9.164587946977627e-06, + "loss": 0.0839, + "step": 17289 + }, + { + "epoch": 7.031313542090281, + "grad_norm": 0.1808152638196847, + "learning_rate": 9.163578304844772e-06, + "loss": 0.0018, + "step": 17290 + }, + { + "epoch": 7.031720211468077, + "grad_norm": 0.04271414146312771, + "learning_rate": 9.162568671298204e-06, + "loss": 0.0005, + "step": 17291 + }, + { + "epoch": 7.032126880845873, + "grad_norm": 3.17338797246191, + "learning_rate": 9.16155904634829e-06, + "loss": 0.0485, + "step": 17292 + }, + { + "epoch": 7.032533550223668, + "grad_norm": 0.6581507938442347, + "learning_rate": 9.160549430005395e-06, + "loss": 0.0052, + "step": 17293 + }, + { + "epoch": 7.032940219601464, + "grad_norm": 4.840020756346101, + "learning_rate": 9.159539822279884e-06, + "loss": 0.0722, + "step": 17294 + }, + { + "epoch": 7.03334688897926, + "grad_norm": 4.202417242055028, + "learning_rate": 9.158530223182117e-06, + "loss": 0.2426, + "step": 17295 + }, + { + "epoch": 7.033753558357056, + "grad_norm": 5.434821576995931, + "learning_rate": 9.157520632722463e-06, + "loss": 0.2287, + "step": 17296 + }, + { + "epoch": 7.034160227734851, + "grad_norm": 1.01079095156145, + "learning_rate": 9.156511050911282e-06, + "loss": 0.0154, + "step": 17297 + }, + { + "epoch": 7.034566897112647, + "grad_norm": 0.57134045617818, + "learning_rate": 9.15550147775894e-06, + "loss": 0.0066, + "step": 17298 + }, + { + "epoch": 7.034973566490443, + "grad_norm": 0.13219842453086497, + "learning_rate": 9.1544919132758e-06, + "loss": 0.0016, + "step": 17299 + }, + { + "epoch": 7.035380235868239, + "grad_norm": 0.09548974678617583, + "learning_rate": 9.153482357472228e-06, + "loss": 0.0017, + "step": 17300 + }, + { + "epoch": 7.035786905246035, + "grad_norm": 0.27054833023487174, + "learning_rate": 9.152472810358585e-06, + "loss": 0.0035, + "step": 17301 + }, + { + "epoch": 7.036193574623831, + "grad_norm": 0.6177967257038133, + "learning_rate": 9.151463271945233e-06, + "loss": 0.0182, + "step": 17302 + }, + { + "epoch": 7.036600244001627, + 
"grad_norm": 5.290845421584303, + "learning_rate": 9.15045374224254e-06, + "loss": 0.1522, + "step": 17303 + }, + { + "epoch": 7.037006913379423, + "grad_norm": 4.63768548817818, + "learning_rate": 9.149444221260865e-06, + "loss": 0.0743, + "step": 17304 + }, + { + "epoch": 7.037413582757218, + "grad_norm": 0.7948118830270468, + "learning_rate": 9.148434709010573e-06, + "loss": 0.0164, + "step": 17305 + }, + { + "epoch": 7.037820252135014, + "grad_norm": 6.534658868682574, + "learning_rate": 9.14742520550203e-06, + "loss": 0.1312, + "step": 17306 + }, + { + "epoch": 7.03822692151281, + "grad_norm": 0.06420990808409652, + "learning_rate": 9.146415710745595e-06, + "loss": 0.0012, + "step": 17307 + }, + { + "epoch": 7.038633590890606, + "grad_norm": 1.1871507801075583, + "learning_rate": 9.145406224751631e-06, + "loss": 0.0167, + "step": 17308 + }, + { + "epoch": 7.0390402602684015, + "grad_norm": 0.15783044690708192, + "learning_rate": 9.144396747530504e-06, + "loss": 0.001, + "step": 17309 + }, + { + "epoch": 7.039446929646197, + "grad_norm": 0.5171091666350781, + "learning_rate": 9.143387279092574e-06, + "loss": 0.0054, + "step": 17310 + }, + { + "epoch": 7.039853599023994, + "grad_norm": 29.344632722498343, + "learning_rate": 9.142377819448208e-06, + "loss": 0.4642, + "step": 17311 + }, + { + "epoch": 7.04026026840179, + "grad_norm": 0.2803766647680677, + "learning_rate": 9.14136836860776e-06, + "loss": 0.003, + "step": 17312 + }, + { + "epoch": 7.040666937779585, + "grad_norm": 2.038303648644479, + "learning_rate": 9.1403589265816e-06, + "loss": 0.019, + "step": 17313 + }, + { + "epoch": 7.041073607157381, + "grad_norm": 0.40335551558840643, + "learning_rate": 9.13934949338009e-06, + "loss": 0.0066, + "step": 17314 + }, + { + "epoch": 7.041480276535177, + "grad_norm": 2.3627889005046963, + "learning_rate": 9.138340069013588e-06, + "loss": 0.0152, + "step": 17315 + }, + { + "epoch": 7.041886945912973, + "grad_norm": 1.6673519100322973, + "learning_rate": 9.13733065349246e-06, + "loss": 0.0203, + "step": 17316 + }, + { + "epoch": 7.0422936152907685, + "grad_norm": 5.461966779820373, + "learning_rate": 9.136321246827068e-06, + "loss": 0.0917, + "step": 17317 + }, + { + "epoch": 7.042700284668564, + "grad_norm": 0.7358002414690757, + "learning_rate": 9.13531184902777e-06, + "loss": 0.0089, + "step": 17318 + }, + { + "epoch": 7.04310695404636, + "grad_norm": 1.614700243266921, + "learning_rate": 9.134302460104933e-06, + "loss": 0.0208, + "step": 17319 + }, + { + "epoch": 7.043513623424156, + "grad_norm": 1.83784601813352, + "learning_rate": 9.133293080068919e-06, + "loss": 0.0281, + "step": 17320 + }, + { + "epoch": 7.043920292801952, + "grad_norm": 1.1796493018029446, + "learning_rate": 9.132283708930084e-06, + "loss": 0.0201, + "step": 17321 + }, + { + "epoch": 7.044326962179748, + "grad_norm": 0.04385408984552653, + "learning_rate": 9.131274346698797e-06, + "loss": 0.0006, + "step": 17322 + }, + { + "epoch": 7.044733631557544, + "grad_norm": 5.088282944532406, + "learning_rate": 9.130264993385414e-06, + "loss": 0.0843, + "step": 17323 + }, + { + "epoch": 7.04514030093534, + "grad_norm": 0.07280611888862287, + "learning_rate": 9.1292556490003e-06, + "loss": 0.0007, + "step": 17324 + }, + { + "epoch": 7.0455469703131355, + "grad_norm": 0.6873603203593772, + "learning_rate": 9.128246313553812e-06, + "loss": 0.0115, + "step": 17325 + }, + { + "epoch": 7.045953639690931, + "grad_norm": 0.08889102748809091, + "learning_rate": 9.127236987056316e-06, + "loss": 0.0012, + "step": 17326 + }, + { + 
"epoch": 7.046360309068727, + "grad_norm": 0.0983957185315368, + "learning_rate": 9.126227669518174e-06, + "loss": 0.0016, + "step": 17327 + }, + { + "epoch": 7.046766978446523, + "grad_norm": 0.0325715419334067, + "learning_rate": 9.12521836094974e-06, + "loss": 0.0005, + "step": 17328 + }, + { + "epoch": 7.0471736478243185, + "grad_norm": 2.537311559756203, + "learning_rate": 9.124209061361384e-06, + "loss": 0.0397, + "step": 17329 + }, + { + "epoch": 7.047580317202114, + "grad_norm": 4.1891632631146605, + "learning_rate": 9.12319977076346e-06, + "loss": 0.1166, + "step": 17330 + }, + { + "epoch": 7.047986986579911, + "grad_norm": 0.01072826114213534, + "learning_rate": 9.122190489166331e-06, + "loss": 0.0001, + "step": 17331 + }, + { + "epoch": 7.048393655957707, + "grad_norm": 0.3907515915248901, + "learning_rate": 9.121181216580361e-06, + "loss": 0.0073, + "step": 17332 + }, + { + "epoch": 7.0488003253355025, + "grad_norm": 1.1650365472861954, + "learning_rate": 9.120171953015909e-06, + "loss": 0.0289, + "step": 17333 + }, + { + "epoch": 7.049206994713298, + "grad_norm": 0.03594887307676262, + "learning_rate": 9.11916269848333e-06, + "loss": 0.0005, + "step": 17334 + }, + { + "epoch": 7.049613664091094, + "grad_norm": 6.572053189124827, + "learning_rate": 9.118153452992993e-06, + "loss": 0.2992, + "step": 17335 + }, + { + "epoch": 7.05002033346889, + "grad_norm": 3.2819049442689052, + "learning_rate": 9.117144216555255e-06, + "loss": 0.0355, + "step": 17336 + }, + { + "epoch": 7.0504270028466856, + "grad_norm": 5.206402179009721, + "learning_rate": 9.116134989180475e-06, + "loss": 0.1952, + "step": 17337 + }, + { + "epoch": 7.050833672224481, + "grad_norm": 3.7652363879606736, + "learning_rate": 9.115125770879014e-06, + "loss": 0.1086, + "step": 17338 + }, + { + "epoch": 7.051240341602277, + "grad_norm": 0.2975110293801855, + "learning_rate": 9.114116561661232e-06, + "loss": 0.0034, + "step": 17339 + }, + { + "epoch": 7.051647010980073, + "grad_norm": 0.08717842347416925, + "learning_rate": 9.11310736153749e-06, + "loss": 0.0016, + "step": 17340 + }, + { + "epoch": 7.052053680357869, + "grad_norm": 0.31424392220527125, + "learning_rate": 9.112098170518146e-06, + "loss": 0.0047, + "step": 17341 + }, + { + "epoch": 7.052460349735665, + "grad_norm": 1.2775912597580117, + "learning_rate": 9.111088988613563e-06, + "loss": 0.016, + "step": 17342 + }, + { + "epoch": 7.052867019113461, + "grad_norm": 0.025069931839243727, + "learning_rate": 9.110079815834098e-06, + "loss": 0.0003, + "step": 17343 + }, + { + "epoch": 7.053273688491257, + "grad_norm": 2.980692591551872, + "learning_rate": 9.10907065219011e-06, + "loss": 0.0412, + "step": 17344 + }, + { + "epoch": 7.053680357869053, + "grad_norm": 1.7195028397001757, + "learning_rate": 9.108061497691962e-06, + "loss": 0.0265, + "step": 17345 + }, + { + "epoch": 7.054087027246848, + "grad_norm": 0.20972913411847283, + "learning_rate": 9.107052352350012e-06, + "loss": 0.0027, + "step": 17346 + }, + { + "epoch": 7.054493696624644, + "grad_norm": 0.006888562733088425, + "learning_rate": 9.106043216174616e-06, + "loss": 0.0001, + "step": 17347 + }, + { + "epoch": 7.05490036600244, + "grad_norm": 1.2215608561985964, + "learning_rate": 9.10503408917614e-06, + "loss": 0.0206, + "step": 17348 + }, + { + "epoch": 7.055307035380236, + "grad_norm": 1.0386702898481197, + "learning_rate": 9.104024971364938e-06, + "loss": 0.0137, + "step": 17349 + }, + { + "epoch": 7.055713704758031, + "grad_norm": 0.009211952831348898, + "learning_rate": 
9.103015862751372e-06, + "loss": 0.0002, + "step": 17350 + }, + { + "epoch": 7.056120374135827, + "grad_norm": 1.6405216276719936, + "learning_rate": 9.102006763345797e-06, + "loss": 0.027, + "step": 17351 + }, + { + "epoch": 7.056527043513624, + "grad_norm": 0.07875524612451384, + "learning_rate": 9.100997673158576e-06, + "loss": 0.001, + "step": 17352 + }, + { + "epoch": 7.05693371289142, + "grad_norm": 1.1601310340763218, + "learning_rate": 9.099988592200068e-06, + "loss": 0.0122, + "step": 17353 + }, + { + "epoch": 7.057340382269215, + "grad_norm": 0.3974299329603707, + "learning_rate": 9.098979520480626e-06, + "loss": 0.0044, + "step": 17354 + }, + { + "epoch": 7.057747051647011, + "grad_norm": 2.290302981615644, + "learning_rate": 9.097970458010616e-06, + "loss": 0.0356, + "step": 17355 + }, + { + "epoch": 7.058153721024807, + "grad_norm": 1.9818982684245396, + "learning_rate": 9.096961404800392e-06, + "loss": 0.0287, + "step": 17356 + }, + { + "epoch": 7.058560390402603, + "grad_norm": 6.625831117874989, + "learning_rate": 9.095952360860313e-06, + "loss": 0.1758, + "step": 17357 + }, + { + "epoch": 7.058967059780398, + "grad_norm": 1.5973739667568754, + "learning_rate": 9.094943326200739e-06, + "loss": 0.0151, + "step": 17358 + }, + { + "epoch": 7.059373729158194, + "grad_norm": 2.4338494452769415, + "learning_rate": 9.093934300832028e-06, + "loss": 0.0382, + "step": 17359 + }, + { + "epoch": 7.05978039853599, + "grad_norm": 1.554019927486903, + "learning_rate": 9.092925284764536e-06, + "loss": 0.0201, + "step": 17360 + }, + { + "epoch": 7.060187067913786, + "grad_norm": 0.6481153984901586, + "learning_rate": 9.091916278008626e-06, + "loss": 0.0091, + "step": 17361 + }, + { + "epoch": 7.060593737291582, + "grad_norm": 1.771275089218054, + "learning_rate": 9.09090728057465e-06, + "loss": 0.0166, + "step": 17362 + }, + { + "epoch": 7.061000406669378, + "grad_norm": 0.011823346713791838, + "learning_rate": 9.089898292472967e-06, + "loss": 0.0002, + "step": 17363 + }, + { + "epoch": 7.061407076047174, + "grad_norm": 0.0029183109335396296, + "learning_rate": 9.088889313713938e-06, + "loss": 0.0, + "step": 17364 + }, + { + "epoch": 7.06181374542497, + "grad_norm": 8.216882647888857, + "learning_rate": 9.087880344307917e-06, + "loss": 0.539, + "step": 17365 + }, + { + "epoch": 7.062220414802765, + "grad_norm": 0.12361081943367512, + "learning_rate": 9.086871384265263e-06, + "loss": 0.0015, + "step": 17366 + }, + { + "epoch": 7.062627084180561, + "grad_norm": 0.10223607827937255, + "learning_rate": 9.085862433596335e-06, + "loss": 0.0017, + "step": 17367 + }, + { + "epoch": 7.063033753558357, + "grad_norm": 2.679097385706367, + "learning_rate": 9.08485349231149e-06, + "loss": 0.0173, + "step": 17368 + }, + { + "epoch": 7.063440422936153, + "grad_norm": 0.6090219440711312, + "learning_rate": 9.083844560421082e-06, + "loss": 0.0075, + "step": 17369 + }, + { + "epoch": 7.0638470923139485, + "grad_norm": 0.016865360570175236, + "learning_rate": 9.082835637935473e-06, + "loss": 0.0002, + "step": 17370 + }, + { + "epoch": 7.064253761691744, + "grad_norm": 0.01649472144882892, + "learning_rate": 9.081826724865017e-06, + "loss": 0.0003, + "step": 17371 + }, + { + "epoch": 7.064660431069541, + "grad_norm": 13.877941901988303, + "learning_rate": 9.08081782122007e-06, + "loss": 0.38, + "step": 17372 + }, + { + "epoch": 7.065067100447337, + "grad_norm": 0.0020049888150659744, + "learning_rate": 9.079808927010993e-06, + "loss": 0.0, + "step": 17373 + }, + { + "epoch": 7.065473769825132, + "grad_norm": 
1.9245343828969428, + "learning_rate": 9.078800042248142e-06, + "loss": 0.0442, + "step": 17374 + }, + { + "epoch": 7.065880439202928, + "grad_norm": 10.68234983661176, + "learning_rate": 9.077791166941868e-06, + "loss": 0.1079, + "step": 17375 + }, + { + "epoch": 7.066287108580724, + "grad_norm": 0.016729598851453677, + "learning_rate": 9.076782301102533e-06, + "loss": 0.0003, + "step": 17376 + }, + { + "epoch": 7.06669377795852, + "grad_norm": 8.749692145747634, + "learning_rate": 9.075773444740495e-06, + "loss": 0.3464, + "step": 17377 + }, + { + "epoch": 7.0671004473363155, + "grad_norm": 0.016815195285193436, + "learning_rate": 9.074764597866107e-06, + "loss": 0.0002, + "step": 17378 + }, + { + "epoch": 7.067507116714111, + "grad_norm": 6.064507651275878, + "learning_rate": 9.073755760489722e-06, + "loss": 0.1259, + "step": 17379 + }, + { + "epoch": 7.067913786091907, + "grad_norm": 3.906304465033389, + "learning_rate": 9.072746932621704e-06, + "loss": 0.0401, + "step": 17380 + }, + { + "epoch": 7.068320455469703, + "grad_norm": 0.5749156106694918, + "learning_rate": 9.071738114272406e-06, + "loss": 0.0092, + "step": 17381 + }, + { + "epoch": 7.0687271248474985, + "grad_norm": 3.74218975377662, + "learning_rate": 9.07072930545218e-06, + "loss": 0.0348, + "step": 17382 + }, + { + "epoch": 7.069133794225295, + "grad_norm": 5.356878558348962, + "learning_rate": 9.069720506171389e-06, + "loss": 0.0939, + "step": 17383 + }, + { + "epoch": 7.069540463603091, + "grad_norm": 2.4998956913719654, + "learning_rate": 9.068711716440384e-06, + "loss": 0.037, + "step": 17384 + }, + { + "epoch": 7.069947132980887, + "grad_norm": 8.008850522684048, + "learning_rate": 9.06770293626952e-06, + "loss": 0.298, + "step": 17385 + }, + { + "epoch": 7.0703538023586825, + "grad_norm": 5.59654218115061, + "learning_rate": 9.066694165669156e-06, + "loss": 0.1029, + "step": 17386 + }, + { + "epoch": 7.070760471736478, + "grad_norm": 13.953087142388908, + "learning_rate": 9.065685404649647e-06, + "loss": 0.3884, + "step": 17387 + }, + { + "epoch": 7.071167141114274, + "grad_norm": 0.001936863854400377, + "learning_rate": 9.064676653221346e-06, + "loss": 0.0, + "step": 17388 + }, + { + "epoch": 7.07157381049207, + "grad_norm": 8.004556075521746, + "learning_rate": 9.063667911394611e-06, + "loss": 0.2525, + "step": 17389 + }, + { + "epoch": 7.0719804798698656, + "grad_norm": 1.9679506839397243, + "learning_rate": 9.062659179179796e-06, + "loss": 0.0452, + "step": 17390 + }, + { + "epoch": 7.072387149247661, + "grad_norm": 0.04034639278847962, + "learning_rate": 9.061650456587257e-06, + "loss": 0.0007, + "step": 17391 + }, + { + "epoch": 7.072793818625457, + "grad_norm": 0.04500479689943524, + "learning_rate": 9.060641743627346e-06, + "loss": 0.0006, + "step": 17392 + }, + { + "epoch": 7.073200488003254, + "grad_norm": 10.458942970315025, + "learning_rate": 9.059633040310422e-06, + "loss": 0.3013, + "step": 17393 + }, + { + "epoch": 7.0736071573810495, + "grad_norm": 0.008778611645377583, + "learning_rate": 9.05862434664684e-06, + "loss": 0.0001, + "step": 17394 + }, + { + "epoch": 7.074013826758845, + "grad_norm": 2.6983519996721856, + "learning_rate": 9.057615662646948e-06, + "loss": 0.0307, + "step": 17395 + }, + { + "epoch": 7.074420496136641, + "grad_norm": 0.3303455584795442, + "learning_rate": 9.05660698832111e-06, + "loss": 0.0052, + "step": 17396 + }, + { + "epoch": 7.074827165514437, + "grad_norm": 1.1939753056563442, + "learning_rate": 9.055598323679675e-06, + "loss": 0.0213, + "step": 17397 + }, + { + 
"epoch": 7.075233834892233, + "grad_norm": 6.26034273352792, + "learning_rate": 9.054589668732996e-06, + "loss": 0.1003, + "step": 17398 + }, + { + "epoch": 7.075640504270028, + "grad_norm": 8.705230567606012, + "learning_rate": 9.053581023491431e-06, + "loss": 0.1745, + "step": 17399 + }, + { + "epoch": 7.076047173647824, + "grad_norm": 0.2410321004827738, + "learning_rate": 9.052572387965334e-06, + "loss": 0.003, + "step": 17400 + }, + { + "epoch": 7.07645384302562, + "grad_norm": 3.025374114275947, + "learning_rate": 9.051563762165057e-06, + "loss": 0.043, + "step": 17401 + }, + { + "epoch": 7.076860512403416, + "grad_norm": 8.6768118491178, + "learning_rate": 9.050555146100957e-06, + "loss": 0.2163, + "step": 17402 + }, + { + "epoch": 7.077267181781212, + "grad_norm": 0.9029568256687377, + "learning_rate": 9.049546539783387e-06, + "loss": 0.0113, + "step": 17403 + }, + { + "epoch": 7.077673851159008, + "grad_norm": 1.7913733138705406, + "learning_rate": 9.0485379432227e-06, + "loss": 0.02, + "step": 17404 + }, + { + "epoch": 7.078080520536804, + "grad_norm": 3.483236462757372, + "learning_rate": 9.047529356429247e-06, + "loss": 0.055, + "step": 17405 + }, + { + "epoch": 7.0784871899146, + "grad_norm": 7.861775409934424, + "learning_rate": 9.046520779413387e-06, + "loss": 0.2001, + "step": 17406 + }, + { + "epoch": 7.078893859292395, + "grad_norm": 0.11786712747694998, + "learning_rate": 9.04551221218547e-06, + "loss": 0.0013, + "step": 17407 + }, + { + "epoch": 7.079300528670191, + "grad_norm": 0.029035390565154497, + "learning_rate": 9.04450365475585e-06, + "loss": 0.0003, + "step": 17408 + }, + { + "epoch": 7.079707198047987, + "grad_norm": 0.14567727883497306, + "learning_rate": 9.043495107134882e-06, + "loss": 0.0018, + "step": 17409 + }, + { + "epoch": 7.080113867425783, + "grad_norm": 3.9177648083211305, + "learning_rate": 9.042486569332918e-06, + "loss": 0.0658, + "step": 17410 + }, + { + "epoch": 7.080520536803578, + "grad_norm": 0.9067536631139945, + "learning_rate": 9.04147804136031e-06, + "loss": 0.0186, + "step": 17411 + }, + { + "epoch": 7.080927206181374, + "grad_norm": 0.8012778462182526, + "learning_rate": 9.040469523227415e-06, + "loss": 0.0115, + "step": 17412 + }, + { + "epoch": 7.081333875559171, + "grad_norm": 0.03327944135383992, + "learning_rate": 9.039461014944582e-06, + "loss": 0.0005, + "step": 17413 + }, + { + "epoch": 7.081740544936967, + "grad_norm": 5.05263040859209, + "learning_rate": 9.038452516522165e-06, + "loss": 0.0862, + "step": 17414 + }, + { + "epoch": 7.082147214314762, + "grad_norm": 5.428606866835572, + "learning_rate": 9.037444027970515e-06, + "loss": 0.117, + "step": 17415 + }, + { + "epoch": 7.082553883692558, + "grad_norm": 2.3698569638883122, + "learning_rate": 9.036435549299988e-06, + "loss": 0.0329, + "step": 17416 + }, + { + "epoch": 7.082960553070354, + "grad_norm": 4.1727558109823955, + "learning_rate": 9.035427080520936e-06, + "loss": 0.0795, + "step": 17417 + }, + { + "epoch": 7.08336722244815, + "grad_norm": 10.34676036578233, + "learning_rate": 9.034418621643708e-06, + "loss": 0.359, + "step": 17418 + }, + { + "epoch": 7.083773891825945, + "grad_norm": 3.461624602038569, + "learning_rate": 9.033410172678662e-06, + "loss": 0.0339, + "step": 17419 + }, + { + "epoch": 7.084180561203741, + "grad_norm": 0.7211910374839992, + "learning_rate": 9.032401733636145e-06, + "loss": 0.0086, + "step": 17420 + }, + { + "epoch": 7.084587230581537, + "grad_norm": 0.026989274672089714, + "learning_rate": 9.03139330452651e-06, + "loss": 0.0004, + 
"step": 17421 + }, + { + "epoch": 7.084993899959333, + "grad_norm": 0.7184011189064732, + "learning_rate": 9.030384885360113e-06, + "loss": 0.0077, + "step": 17422 + }, + { + "epoch": 7.0854005693371285, + "grad_norm": 0.7769873122149747, + "learning_rate": 9.029376476147303e-06, + "loss": 0.009, + "step": 17423 + }, + { + "epoch": 7.085807238714925, + "grad_norm": 0.8880641460707699, + "learning_rate": 9.028368076898429e-06, + "loss": 0.0097, + "step": 17424 + }, + { + "epoch": 7.086213908092721, + "grad_norm": 0.5076417463811556, + "learning_rate": 9.02735968762385e-06, + "loss": 0.005, + "step": 17425 + }, + { + "epoch": 7.086620577470517, + "grad_norm": 8.053691776567842, + "learning_rate": 9.026351308333911e-06, + "loss": 0.2208, + "step": 17426 + }, + { + "epoch": 7.087027246848312, + "grad_norm": 0.7631883183616103, + "learning_rate": 9.025342939038965e-06, + "loss": 0.0078, + "step": 17427 + }, + { + "epoch": 7.087433916226108, + "grad_norm": 3.7808275248458703, + "learning_rate": 9.024334579749364e-06, + "loss": 0.0612, + "step": 17428 + }, + { + "epoch": 7.087840585603904, + "grad_norm": 0.0004070271838067283, + "learning_rate": 9.023326230475463e-06, + "loss": 0.0, + "step": 17429 + }, + { + "epoch": 7.0882472549817, + "grad_norm": 1.0514835705864956, + "learning_rate": 9.022317891227606e-06, + "loss": 0.0163, + "step": 17430 + }, + { + "epoch": 7.0886539243594955, + "grad_norm": 0.21000632869948033, + "learning_rate": 9.021309562016148e-06, + "loss": 0.0038, + "step": 17431 + }, + { + "epoch": 7.089060593737291, + "grad_norm": 0.657986833243113, + "learning_rate": 9.020301242851443e-06, + "loss": 0.0091, + "step": 17432 + }, + { + "epoch": 7.089467263115087, + "grad_norm": 5.365982109446125, + "learning_rate": 9.019292933743837e-06, + "loss": 0.1061, + "step": 17433 + }, + { + "epoch": 7.089873932492884, + "grad_norm": 0.1756214182297746, + "learning_rate": 9.018284634703682e-06, + "loss": 0.0026, + "step": 17434 + }, + { + "epoch": 7.090280601870679, + "grad_norm": 1.5169613102024448, + "learning_rate": 9.017276345741331e-06, + "loss": 0.0122, + "step": 17435 + }, + { + "epoch": 7.090687271248475, + "grad_norm": 2.3472566809553665, + "learning_rate": 9.016268066867132e-06, + "loss": 0.036, + "step": 17436 + }, + { + "epoch": 7.091093940626271, + "grad_norm": 0.0416022756308799, + "learning_rate": 9.015259798091435e-06, + "loss": 0.0007, + "step": 17437 + }, + { + "epoch": 7.091500610004067, + "grad_norm": 0.8856254688386491, + "learning_rate": 9.014251539424595e-06, + "loss": 0.0119, + "step": 17438 + }, + { + "epoch": 7.0919072793818625, + "grad_norm": 0.6303797177986458, + "learning_rate": 9.013243290876957e-06, + "loss": 0.0118, + "step": 17439 + }, + { + "epoch": 7.092313948759658, + "grad_norm": 0.0469318001857358, + "learning_rate": 9.012235052458874e-06, + "loss": 0.0005, + "step": 17440 + }, + { + "epoch": 7.092720618137454, + "grad_norm": 0.1550521937310796, + "learning_rate": 9.011226824180693e-06, + "loss": 0.0021, + "step": 17441 + }, + { + "epoch": 7.09312728751525, + "grad_norm": 0.8159626541558419, + "learning_rate": 9.010218606052768e-06, + "loss": 0.0085, + "step": 17442 + }, + { + "epoch": 7.0935339568930456, + "grad_norm": 0.09396124090708127, + "learning_rate": 9.009210398085449e-06, + "loss": 0.0018, + "step": 17443 + }, + { + "epoch": 7.093940626270842, + "grad_norm": 2.5046510447902928, + "learning_rate": 9.00820220028908e-06, + "loss": 0.0573, + "step": 17444 + }, + { + "epoch": 7.094347295648638, + "grad_norm": 1.0265948473438686, + "learning_rate": 
9.007194012674018e-06, + "loss": 0.0134, + "step": 17445 + }, + { + "epoch": 7.094753965026434, + "grad_norm": 2.540303874584746, + "learning_rate": 9.006185835250608e-06, + "loss": 0.0416, + "step": 17446 + }, + { + "epoch": 7.0951606344042295, + "grad_norm": 11.848898814457094, + "learning_rate": 9.005177668029199e-06, + "loss": 0.5236, + "step": 17447 + }, + { + "epoch": 7.095567303782025, + "grad_norm": 1.755690154613012, + "learning_rate": 9.004169511020145e-06, + "loss": 0.0189, + "step": 17448 + }, + { + "epoch": 7.095973973159821, + "grad_norm": 4.157336141527943, + "learning_rate": 9.00316136423379e-06, + "loss": 0.0606, + "step": 17449 + }, + { + "epoch": 7.096380642537617, + "grad_norm": 0.18085860128894088, + "learning_rate": 9.002153227680484e-06, + "loss": 0.0032, + "step": 17450 + }, + { + "epoch": 7.096787311915413, + "grad_norm": 0.001582386299706793, + "learning_rate": 9.001145101370581e-06, + "loss": 0.0, + "step": 17451 + }, + { + "epoch": 7.097193981293208, + "grad_norm": 7.256366396782826, + "learning_rate": 9.000136985314426e-06, + "loss": 0.2923, + "step": 17452 + }, + { + "epoch": 7.097600650671004, + "grad_norm": 0.3292518863234188, + "learning_rate": 8.999128879522365e-06, + "loss": 0.0035, + "step": 17453 + }, + { + "epoch": 7.098007320048801, + "grad_norm": 0.6032497305548752, + "learning_rate": 8.99812078400475e-06, + "loss": 0.0107, + "step": 17454 + }, + { + "epoch": 7.0984139894265965, + "grad_norm": 0.01302674860459267, + "learning_rate": 8.99711269877193e-06, + "loss": 0.0002, + "step": 17455 + }, + { + "epoch": 7.098820658804392, + "grad_norm": 0.4733260639605411, + "learning_rate": 8.996104623834256e-06, + "loss": 0.0075, + "step": 17456 + }, + { + "epoch": 7.099227328182188, + "grad_norm": 0.7771749414465878, + "learning_rate": 8.995096559202069e-06, + "loss": 0.0108, + "step": 17457 + }, + { + "epoch": 7.099633997559984, + "grad_norm": 0.023062187507148717, + "learning_rate": 8.994088504885724e-06, + "loss": 0.0002, + "step": 17458 + }, + { + "epoch": 7.10004066693778, + "grad_norm": 13.182812337817717, + "learning_rate": 8.993080460895567e-06, + "loss": 0.708, + "step": 17459 + }, + { + "epoch": 7.100447336315575, + "grad_norm": 2.9408691920922827, + "learning_rate": 8.992072427241943e-06, + "loss": 0.0323, + "step": 17460 + }, + { + "epoch": 7.100854005693371, + "grad_norm": 0.8161297959929613, + "learning_rate": 8.99106440393521e-06, + "loss": 0.0103, + "step": 17461 + }, + { + "epoch": 7.101260675071167, + "grad_norm": 1.1825299672827074, + "learning_rate": 8.990056390985703e-06, + "loss": 0.0141, + "step": 17462 + }, + { + "epoch": 7.101667344448963, + "grad_norm": 0.62155167192106, + "learning_rate": 8.989048388403774e-06, + "loss": 0.0113, + "step": 17463 + }, + { + "epoch": 7.102074013826758, + "grad_norm": 2.9408780072010203, + "learning_rate": 8.988040396199777e-06, + "loss": 0.0795, + "step": 17464 + }, + { + "epoch": 7.102480683204555, + "grad_norm": 1.6463987010635968, + "learning_rate": 8.987032414384049e-06, + "loss": 0.0328, + "step": 17465 + }, + { + "epoch": 7.102887352582351, + "grad_norm": 6.1720863743693135, + "learning_rate": 8.986024442966946e-06, + "loss": 0.1664, + "step": 17466 + }, + { + "epoch": 7.103294021960147, + "grad_norm": 0.1601683853051572, + "learning_rate": 8.985016481958813e-06, + "loss": 0.0022, + "step": 17467 + }, + { + "epoch": 7.103700691337942, + "grad_norm": 0.05790709153809029, + "learning_rate": 8.984008531369996e-06, + "loss": 0.0008, + "step": 17468 + }, + { + "epoch": 7.104107360715738, + "grad_norm": 
8.564206826856163, + "learning_rate": 8.983000591210842e-06, + "loss": 0.1932, + "step": 17469 + }, + { + "epoch": 7.104514030093534, + "grad_norm": 0.8032019126236815, + "learning_rate": 8.9819926614917e-06, + "loss": 0.0103, + "step": 17470 + }, + { + "epoch": 7.10492069947133, + "grad_norm": 0.7346476219810456, + "learning_rate": 8.980984742222915e-06, + "loss": 0.0091, + "step": 17471 + }, + { + "epoch": 7.105327368849125, + "grad_norm": 0.6112754540292539, + "learning_rate": 8.979976833414832e-06, + "loss": 0.008, + "step": 17472 + }, + { + "epoch": 7.105734038226921, + "grad_norm": 0.13531850745347074, + "learning_rate": 8.978968935077805e-06, + "loss": 0.0022, + "step": 17473 + }, + { + "epoch": 7.106140707604717, + "grad_norm": 3.814526043398567, + "learning_rate": 8.977961047222173e-06, + "loss": 0.0396, + "step": 17474 + }, + { + "epoch": 7.106547376982514, + "grad_norm": 4.7961563778137695, + "learning_rate": 8.976953169858285e-06, + "loss": 0.0846, + "step": 17475 + }, + { + "epoch": 7.106954046360309, + "grad_norm": 0.15595684083658043, + "learning_rate": 8.975945302996489e-06, + "loss": 0.0027, + "step": 17476 + }, + { + "epoch": 7.107360715738105, + "grad_norm": 1.798519592729944, + "learning_rate": 8.97493744664713e-06, + "loss": 0.0231, + "step": 17477 + }, + { + "epoch": 7.107767385115901, + "grad_norm": 2.675447251887045, + "learning_rate": 8.973929600820551e-06, + "loss": 0.0374, + "step": 17478 + }, + { + "epoch": 7.108174054493697, + "grad_norm": 0.4755787564706436, + "learning_rate": 8.972921765527105e-06, + "loss": 0.0064, + "step": 17479 + }, + { + "epoch": 7.108580723871492, + "grad_norm": 0.499706599459183, + "learning_rate": 8.971913940777131e-06, + "loss": 0.0045, + "step": 17480 + }, + { + "epoch": 7.108987393249288, + "grad_norm": 0.08331364360226719, + "learning_rate": 8.97090612658098e-06, + "loss": 0.0007, + "step": 17481 + }, + { + "epoch": 7.109394062627084, + "grad_norm": 4.644506519721406, + "learning_rate": 8.969898322948994e-06, + "loss": 0.0672, + "step": 17482 + }, + { + "epoch": 7.10980073200488, + "grad_norm": 2.7658610627080575, + "learning_rate": 8.96889052989152e-06, + "loss": 0.0357, + "step": 17483 + }, + { + "epoch": 7.1102074013826755, + "grad_norm": 0.03220364180063342, + "learning_rate": 8.967882747418906e-06, + "loss": 0.0003, + "step": 17484 + }, + { + "epoch": 7.110614070760472, + "grad_norm": 0.2993755458935586, + "learning_rate": 8.96687497554149e-06, + "loss": 0.0044, + "step": 17485 + }, + { + "epoch": 7.111020740138268, + "grad_norm": 0.04730167609686595, + "learning_rate": 8.965867214269628e-06, + "loss": 0.0004, + "step": 17486 + }, + { + "epoch": 7.111427409516064, + "grad_norm": 0.20421492518386608, + "learning_rate": 8.964859463613657e-06, + "loss": 0.0027, + "step": 17487 + }, + { + "epoch": 7.111834078893859, + "grad_norm": 6.265101039886857, + "learning_rate": 8.963851723583924e-06, + "loss": 0.1122, + "step": 17488 + }, + { + "epoch": 7.112240748271655, + "grad_norm": 0.07038469624817156, + "learning_rate": 8.962843994190776e-06, + "loss": 0.0008, + "step": 17489 + }, + { + "epoch": 7.112647417649451, + "grad_norm": 0.75980376000253, + "learning_rate": 8.961836275444557e-06, + "loss": 0.0136, + "step": 17490 + }, + { + "epoch": 7.113054087027247, + "grad_norm": 1.9915167941722696, + "learning_rate": 8.960828567355608e-06, + "loss": 0.0321, + "step": 17491 + }, + { + "epoch": 7.1134607564050425, + "grad_norm": 0.04760560926706652, + "learning_rate": 8.959820869934279e-06, + "loss": 0.0007, + "step": 17492 + }, + { + 
"epoch": 7.113867425782838, + "grad_norm": 2.1014866872508153, + "learning_rate": 8.958813183190912e-06, + "loss": 0.105, + "step": 17493 + }, + { + "epoch": 7.114274095160634, + "grad_norm": 0.2618442623539081, + "learning_rate": 8.957805507135852e-06, + "loss": 0.0021, + "step": 17494 + }, + { + "epoch": 7.114680764538431, + "grad_norm": 0.030687500760389782, + "learning_rate": 8.956797841779443e-06, + "loss": 0.0003, + "step": 17495 + }, + { + "epoch": 7.1150874339162264, + "grad_norm": 0.036357238318682, + "learning_rate": 8.955790187132029e-06, + "loss": 0.0005, + "step": 17496 + }, + { + "epoch": 7.115494103294022, + "grad_norm": 0.16035815214495336, + "learning_rate": 8.954782543203955e-06, + "loss": 0.0022, + "step": 17497 + }, + { + "epoch": 7.115900772671818, + "grad_norm": 0.17159465451532496, + "learning_rate": 8.953774910005562e-06, + "loss": 0.0023, + "step": 17498 + }, + { + "epoch": 7.116307442049614, + "grad_norm": 0.9543186518391569, + "learning_rate": 8.952767287547197e-06, + "loss": 0.0136, + "step": 17499 + }, + { + "epoch": 7.1167141114274095, + "grad_norm": 2.528298818453427, + "learning_rate": 8.951759675839204e-06, + "loss": 0.0266, + "step": 17500 + }, + { + "epoch": 7.117120780805205, + "grad_norm": 0.228413418568853, + "learning_rate": 8.950752074891923e-06, + "loss": 0.0037, + "step": 17501 + }, + { + "epoch": 7.117527450183001, + "grad_norm": 0.19022330104420282, + "learning_rate": 8.949744484715702e-06, + "loss": 0.0031, + "step": 17502 + }, + { + "epoch": 7.117934119560797, + "grad_norm": 2.1483037956313367, + "learning_rate": 8.948736905320884e-06, + "loss": 0.094, + "step": 17503 + }, + { + "epoch": 7.118340788938593, + "grad_norm": 5.372778219857943, + "learning_rate": 8.947729336717807e-06, + "loss": 0.1213, + "step": 17504 + }, + { + "epoch": 7.118747458316388, + "grad_norm": 2.1398783113707993, + "learning_rate": 8.94672177891682e-06, + "loss": 0.0113, + "step": 17505 + }, + { + "epoch": 7.119154127694185, + "grad_norm": 5.289857640411564, + "learning_rate": 8.945714231928264e-06, + "loss": 0.0871, + "step": 17506 + }, + { + "epoch": 7.119560797071981, + "grad_norm": 0.21864038345874506, + "learning_rate": 8.944706695762484e-06, + "loss": 0.0025, + "step": 17507 + }, + { + "epoch": 7.1199674664497765, + "grad_norm": 2.623927663576829, + "learning_rate": 8.943699170429816e-06, + "loss": 0.0297, + "step": 17508 + }, + { + "epoch": 7.120374135827572, + "grad_norm": 0.4054026061771622, + "learning_rate": 8.942691655940612e-06, + "loss": 0.0025, + "step": 17509 + }, + { + "epoch": 7.120780805205368, + "grad_norm": 8.531495679384989, + "learning_rate": 8.94168415230521e-06, + "loss": 0.106, + "step": 17510 + }, + { + "epoch": 7.121187474583164, + "grad_norm": 0.21170240447476024, + "learning_rate": 8.940676659533951e-06, + "loss": 0.0017, + "step": 17511 + }, + { + "epoch": 7.12159414396096, + "grad_norm": 0.15409626634153933, + "learning_rate": 8.939669177637178e-06, + "loss": 0.0014, + "step": 17512 + }, + { + "epoch": 7.122000813338755, + "grad_norm": 0.2667970155263127, + "learning_rate": 8.938661706625239e-06, + "loss": 0.0025, + "step": 17513 + }, + { + "epoch": 7.122407482716551, + "grad_norm": 0.0036046371133841345, + "learning_rate": 8.937654246508467e-06, + "loss": 0.0, + "step": 17514 + }, + { + "epoch": 7.122814152094347, + "grad_norm": 2.95501688939781, + "learning_rate": 8.936646797297212e-06, + "loss": 0.029, + "step": 17515 + }, + { + "epoch": 7.1232208214721435, + "grad_norm": 2.483054752509291, + "learning_rate": 8.935639359001812e-06, + 
"loss": 0.0491, + "step": 17516 + }, + { + "epoch": 7.123627490849939, + "grad_norm": 0.16198517572765395, + "learning_rate": 8.934631931632608e-06, + "loss": 0.0016, + "step": 17517 + }, + { + "epoch": 7.124034160227735, + "grad_norm": 1.1004586918392343, + "learning_rate": 8.933624515199946e-06, + "loss": 0.0097, + "step": 17518 + }, + { + "epoch": 7.124440829605531, + "grad_norm": 0.4185102862240558, + "learning_rate": 8.932617109714165e-06, + "loss": 0.0042, + "step": 17519 + }, + { + "epoch": 7.124847498983327, + "grad_norm": 0.6405838894281133, + "learning_rate": 8.931609715185605e-06, + "loss": 0.0096, + "step": 17520 + }, + { + "epoch": 7.125254168361122, + "grad_norm": 3.6506157067578906, + "learning_rate": 8.930602331624608e-06, + "loss": 0.099, + "step": 17521 + }, + { + "epoch": 7.125660837738918, + "grad_norm": 0.022706408846133765, + "learning_rate": 8.929594959041518e-06, + "loss": 0.0001, + "step": 17522 + }, + { + "epoch": 7.126067507116714, + "grad_norm": 4.888323155720852, + "learning_rate": 8.928587597446677e-06, + "loss": 0.0698, + "step": 17523 + }, + { + "epoch": 7.12647417649451, + "grad_norm": 0.08522129073111896, + "learning_rate": 8.927580246850418e-06, + "loss": 0.001, + "step": 17524 + }, + { + "epoch": 7.126880845872305, + "grad_norm": 0.016946231607196303, + "learning_rate": 8.926572907263092e-06, + "loss": 0.0002, + "step": 17525 + }, + { + "epoch": 7.127287515250102, + "grad_norm": 1.6576606000900689, + "learning_rate": 8.925565578695033e-06, + "loss": 0.0206, + "step": 17526 + }, + { + "epoch": 7.127694184627898, + "grad_norm": 10.72831900725662, + "learning_rate": 8.924558261156583e-06, + "loss": 0.2162, + "step": 17527 + }, + { + "epoch": 7.128100854005694, + "grad_norm": 4.692979910389392, + "learning_rate": 8.923550954658087e-06, + "loss": 0.0626, + "step": 17528 + }, + { + "epoch": 7.128507523383489, + "grad_norm": 2.900553382937181, + "learning_rate": 8.92254365920988e-06, + "loss": 0.0146, + "step": 17529 + }, + { + "epoch": 7.128914192761285, + "grad_norm": 2.0715391707460835, + "learning_rate": 8.921536374822306e-06, + "loss": 0.032, + "step": 17530 + }, + { + "epoch": 7.129320862139081, + "grad_norm": 2.460332066515183, + "learning_rate": 8.920529101505702e-06, + "loss": 0.0418, + "step": 17531 + }, + { + "epoch": 7.129727531516877, + "grad_norm": 4.374311803649643, + "learning_rate": 8.919521839270413e-06, + "loss": 0.0427, + "step": 17532 + }, + { + "epoch": 7.130134200894672, + "grad_norm": 0.5754415892786734, + "learning_rate": 8.918514588126774e-06, + "loss": 0.0037, + "step": 17533 + }, + { + "epoch": 7.130540870272468, + "grad_norm": 0.9774099554642566, + "learning_rate": 8.917507348085127e-06, + "loss": 0.014, + "step": 17534 + }, + { + "epoch": 7.130947539650264, + "grad_norm": 3.894153672650724, + "learning_rate": 8.916500119155813e-06, + "loss": 0.0535, + "step": 17535 + }, + { + "epoch": 7.131354209028061, + "grad_norm": 0.14724963315978132, + "learning_rate": 8.915492901349168e-06, + "loss": 0.0018, + "step": 17536 + }, + { + "epoch": 7.131760878405856, + "grad_norm": 0.021326187292363182, + "learning_rate": 8.914485694675535e-06, + "loss": 0.0003, + "step": 17537 + }, + { + "epoch": 7.132167547783652, + "grad_norm": 4.567905866540523, + "learning_rate": 8.913478499145255e-06, + "loss": 0.1004, + "step": 17538 + }, + { + "epoch": 7.132574217161448, + "grad_norm": 0.08141744147902519, + "learning_rate": 8.912471314768665e-06, + "loss": 0.0006, + "step": 17539 + }, + { + "epoch": 7.132980886539244, + "grad_norm": 6.50442998928133, + 
"learning_rate": 8.911464141556101e-06, + "loss": 0.217, + "step": 17540 + }, + { + "epoch": 7.133387555917039, + "grad_norm": 0.8949280687691463, + "learning_rate": 8.910456979517908e-06, + "loss": 0.0122, + "step": 17541 + }, + { + "epoch": 7.133794225294835, + "grad_norm": 0.046287805057353706, + "learning_rate": 8.909449828664423e-06, + "loss": 0.0008, + "step": 17542 + }, + { + "epoch": 7.134200894672631, + "grad_norm": 0.12011085888659762, + "learning_rate": 8.908442689005984e-06, + "loss": 0.0013, + "step": 17543 + }, + { + "epoch": 7.134607564050427, + "grad_norm": 2.679576305874385, + "learning_rate": 8.907435560552928e-06, + "loss": 0.1335, + "step": 17544 + }, + { + "epoch": 7.1350142334282225, + "grad_norm": 1.4853891229246043, + "learning_rate": 8.906428443315598e-06, + "loss": 0.0179, + "step": 17545 + }, + { + "epoch": 7.135420902806018, + "grad_norm": 0.05707304489001463, + "learning_rate": 8.905421337304329e-06, + "loss": 0.0009, + "step": 17546 + }, + { + "epoch": 7.135827572183815, + "grad_norm": 2.5136183750243517, + "learning_rate": 8.90441424252946e-06, + "loss": 0.0657, + "step": 17547 + }, + { + "epoch": 7.136234241561611, + "grad_norm": 0.24693813083181726, + "learning_rate": 8.903407159001333e-06, + "loss": 0.0033, + "step": 17548 + }, + { + "epoch": 7.1366409109394064, + "grad_norm": 0.02912066066284986, + "learning_rate": 8.902400086730283e-06, + "loss": 0.0003, + "step": 17549 + }, + { + "epoch": 7.137047580317202, + "grad_norm": 0.001305144594011194, + "learning_rate": 8.901393025726645e-06, + "loss": 0.0, + "step": 17550 + }, + { + "epoch": 7.137454249694998, + "grad_norm": 0.026128663470414124, + "learning_rate": 8.900385976000765e-06, + "loss": 0.0003, + "step": 17551 + }, + { + "epoch": 7.137860919072794, + "grad_norm": 0.3682926870128905, + "learning_rate": 8.899378937562975e-06, + "loss": 0.0028, + "step": 17552 + }, + { + "epoch": 7.1382675884505895, + "grad_norm": 0.5289910713580278, + "learning_rate": 8.898371910423612e-06, + "loss": 0.005, + "step": 17553 + }, + { + "epoch": 7.138674257828385, + "grad_norm": 2.5414815433133957, + "learning_rate": 8.89736489459302e-06, + "loss": 0.1188, + "step": 17554 + }, + { + "epoch": 7.139080927206181, + "grad_norm": 9.964608371725626, + "learning_rate": 8.896357890081529e-06, + "loss": 0.052, + "step": 17555 + }, + { + "epoch": 7.139487596583977, + "grad_norm": 0.28395897726032154, + "learning_rate": 8.895350896899482e-06, + "loss": 0.0034, + "step": 17556 + }, + { + "epoch": 7.1398942659617735, + "grad_norm": 0.17356470735059218, + "learning_rate": 8.894343915057211e-06, + "loss": 0.0023, + "step": 17557 + }, + { + "epoch": 7.140300935339569, + "grad_norm": 7.4959196725515636, + "learning_rate": 8.893336944565058e-06, + "loss": 0.2092, + "step": 17558 + }, + { + "epoch": 7.140707604717365, + "grad_norm": 1.2708198424562518, + "learning_rate": 8.892329985433358e-06, + "loss": 0.0162, + "step": 17559 + }, + { + "epoch": 7.141114274095161, + "grad_norm": 0.569156291589373, + "learning_rate": 8.891323037672446e-06, + "loss": 0.0061, + "step": 17560 + }, + { + "epoch": 7.1415209434729565, + "grad_norm": 12.460299025893212, + "learning_rate": 8.89031610129267e-06, + "loss": 0.185, + "step": 17561 + }, + { + "epoch": 7.141927612850752, + "grad_norm": 0.007464774964428498, + "learning_rate": 8.889309176304347e-06, + "loss": 0.0001, + "step": 17562 + }, + { + "epoch": 7.142334282228548, + "grad_norm": 4.085227582337556, + "learning_rate": 8.888302262717829e-06, + "loss": 0.0435, + "step": 17563 + }, + { + "epoch": 
7.142740951606344, + "grad_norm": 3.021103795756602, + "learning_rate": 8.887295360543448e-06, + "loss": 0.0598, + "step": 17564 + }, + { + "epoch": 7.14314762098414, + "grad_norm": 0.0016065904574507795, + "learning_rate": 8.886288469791539e-06, + "loss": 0.0, + "step": 17565 + }, + { + "epoch": 7.143554290361935, + "grad_norm": 11.49633410825025, + "learning_rate": 8.885281590472438e-06, + "loss": 0.5547, + "step": 17566 + }, + { + "epoch": 7.143960959739732, + "grad_norm": 0.7585004630272382, + "learning_rate": 8.884274722596485e-06, + "loss": 0.0093, + "step": 17567 + }, + { + "epoch": 7.144367629117528, + "grad_norm": 0.2839122204104091, + "learning_rate": 8.883267866174011e-06, + "loss": 0.0027, + "step": 17568 + }, + { + "epoch": 7.1447742984953235, + "grad_norm": 0.9577597182237477, + "learning_rate": 8.882261021215358e-06, + "loss": 0.0098, + "step": 17569 + }, + { + "epoch": 7.145180967873119, + "grad_norm": 4.202295654044339, + "learning_rate": 8.881254187730856e-06, + "loss": 0.0978, + "step": 17570 + }, + { + "epoch": 7.145587637250915, + "grad_norm": 0.6553956674650037, + "learning_rate": 8.880247365730843e-06, + "loss": 0.0087, + "step": 17571 + }, + { + "epoch": 7.145994306628711, + "grad_norm": 0.23315960562239435, + "learning_rate": 8.879240555225654e-06, + "loss": 0.002, + "step": 17572 + }, + { + "epoch": 7.146400976006507, + "grad_norm": 1.5017466036888285, + "learning_rate": 8.878233756225624e-06, + "loss": 0.0256, + "step": 17573 + }, + { + "epoch": 7.146807645384302, + "grad_norm": 7.613464944742343, + "learning_rate": 8.87722696874109e-06, + "loss": 0.1438, + "step": 17574 + }, + { + "epoch": 7.147214314762098, + "grad_norm": 6.6705563894094775, + "learning_rate": 8.876220192782386e-06, + "loss": 0.2018, + "step": 17575 + }, + { + "epoch": 7.147620984139894, + "grad_norm": 8.784029710304647, + "learning_rate": 8.875213428359849e-06, + "loss": 0.0646, + "step": 17576 + }, + { + "epoch": 7.1480276535176905, + "grad_norm": 14.301551982118708, + "learning_rate": 8.87420667548381e-06, + "loss": 0.268, + "step": 17577 + }, + { + "epoch": 7.148434322895486, + "grad_norm": 0.5155592785298593, + "learning_rate": 8.873199934164606e-06, + "loss": 0.0095, + "step": 17578 + }, + { + "epoch": 7.148840992273282, + "grad_norm": 0.3367789929067733, + "learning_rate": 8.872193204412574e-06, + "loss": 0.0049, + "step": 17579 + }, + { + "epoch": 7.149247661651078, + "grad_norm": 0.06142439955136569, + "learning_rate": 8.871186486238044e-06, + "loss": 0.0006, + "step": 17580 + }, + { + "epoch": 7.149654331028874, + "grad_norm": 1.37017548291615, + "learning_rate": 8.870179779651354e-06, + "loss": 0.0112, + "step": 17581 + }, + { + "epoch": 7.150061000406669, + "grad_norm": 0.23106718843432725, + "learning_rate": 8.869173084662836e-06, + "loss": 0.0036, + "step": 17582 + }, + { + "epoch": 7.150467669784465, + "grad_norm": 9.19478145640268, + "learning_rate": 8.868166401282829e-06, + "loss": 0.2647, + "step": 17583 + }, + { + "epoch": 7.150874339162261, + "grad_norm": 0.5784540514110594, + "learning_rate": 8.86715972952166e-06, + "loss": 0.0063, + "step": 17584 + }, + { + "epoch": 7.151281008540057, + "grad_norm": 6.038277271027352, + "learning_rate": 8.866153069389667e-06, + "loss": 0.0822, + "step": 17585 + }, + { + "epoch": 7.151687677917852, + "grad_norm": 0.018762347292086416, + "learning_rate": 8.865146420897184e-06, + "loss": 0.0003, + "step": 17586 + }, + { + "epoch": 7.152094347295648, + "grad_norm": 1.9132877296248079, + "learning_rate": 8.864139784054545e-06, + "loss": 0.0252, 
+ "step": 17587 + }, + { + "epoch": 7.152501016673445, + "grad_norm": 0.050466824482206525, + "learning_rate": 8.86313315887208e-06, + "loss": 0.0006, + "step": 17588 + }, + { + "epoch": 7.152907686051241, + "grad_norm": 0.474937416015748, + "learning_rate": 8.862126545360127e-06, + "loss": 0.0028, + "step": 17589 + }, + { + "epoch": 7.153314355429036, + "grad_norm": 1.2036060166671683, + "learning_rate": 8.86111994352902e-06, + "loss": 0.022, + "step": 17590 + }, + { + "epoch": 7.153721024806832, + "grad_norm": 0.23980355123553665, + "learning_rate": 8.860113353389085e-06, + "loss": 0.0012, + "step": 17591 + }, + { + "epoch": 7.154127694184628, + "grad_norm": 7.9293834496446545, + "learning_rate": 8.859106774950665e-06, + "loss": 0.127, + "step": 17592 + }, + { + "epoch": 7.154534363562424, + "grad_norm": 1.1305249985831252, + "learning_rate": 8.858100208224086e-06, + "loss": 0.0106, + "step": 17593 + }, + { + "epoch": 7.154941032940219, + "grad_norm": 0.016530704938262846, + "learning_rate": 8.857093653219682e-06, + "loss": 0.0001, + "step": 17594 + }, + { + "epoch": 7.155347702318015, + "grad_norm": 0.6732656784772675, + "learning_rate": 8.856087109947788e-06, + "loss": 0.008, + "step": 17595 + }, + { + "epoch": 7.155754371695811, + "grad_norm": 0.0036831707930794276, + "learning_rate": 8.855080578418736e-06, + "loss": 0.0, + "step": 17596 + }, + { + "epoch": 7.156161041073607, + "grad_norm": 0.03007747340378046, + "learning_rate": 8.854074058642858e-06, + "loss": 0.0005, + "step": 17597 + }, + { + "epoch": 7.156567710451403, + "grad_norm": 0.12684749364734096, + "learning_rate": 8.853067550630486e-06, + "loss": 0.0013, + "step": 17598 + }, + { + "epoch": 7.156974379829199, + "grad_norm": 26.391513698830384, + "learning_rate": 8.852061054391953e-06, + "loss": 0.6492, + "step": 17599 + }, + { + "epoch": 7.157381049206995, + "grad_norm": 0.7330593508689545, + "learning_rate": 8.851054569937592e-06, + "loss": 0.0089, + "step": 17600 + }, + { + "epoch": 7.157787718584791, + "grad_norm": 3.566721031347975, + "learning_rate": 8.850048097277732e-06, + "loss": 0.098, + "step": 17601 + }, + { + "epoch": 7.1581943879625864, + "grad_norm": 0.01948583008141646, + "learning_rate": 8.849041636422708e-06, + "loss": 0.0003, + "step": 17602 + }, + { + "epoch": 7.158601057340382, + "grad_norm": 1.015250678921285, + "learning_rate": 8.848035187382853e-06, + "loss": 0.0091, + "step": 17603 + }, + { + "epoch": 7.159007726718178, + "grad_norm": 0.327993664284124, + "learning_rate": 8.847028750168492e-06, + "loss": 0.0028, + "step": 17604 + }, + { + "epoch": 7.159414396095974, + "grad_norm": 9.218440305985485, + "learning_rate": 8.846022324789964e-06, + "loss": 0.3268, + "step": 17605 + }, + { + "epoch": 7.1598210654737695, + "grad_norm": 0.09000633982139415, + "learning_rate": 8.845015911257598e-06, + "loss": 0.0019, + "step": 17606 + }, + { + "epoch": 7.160227734851565, + "grad_norm": 2.263622029473988, + "learning_rate": 8.844009509581723e-06, + "loss": 0.0277, + "step": 17607 + }, + { + "epoch": 7.160634404229362, + "grad_norm": 3.2593509668614433, + "learning_rate": 8.843003119772674e-06, + "loss": 0.0504, + "step": 17608 + }, + { + "epoch": 7.161041073607158, + "grad_norm": 6.968675720456181, + "learning_rate": 8.841996741840781e-06, + "loss": 0.154, + "step": 17609 + }, + { + "epoch": 7.1614477429849535, + "grad_norm": 4.004522687398329, + "learning_rate": 8.840990375796374e-06, + "loss": 0.061, + "step": 17610 + }, + { + "epoch": 7.161854412362749, + "grad_norm": 0.6636904128100243, + "learning_rate": 
8.83998402164978e-06, + "loss": 0.012, + "step": 17611 + }, + { + "epoch": 7.162261081740545, + "grad_norm": 1.709084527265646, + "learning_rate": 8.838977679411339e-06, + "loss": 0.0388, + "step": 17612 + }, + { + "epoch": 7.162667751118341, + "grad_norm": 0.49506447978700135, + "learning_rate": 8.837971349091375e-06, + "loss": 0.0052, + "step": 17613 + }, + { + "epoch": 7.1630744204961365, + "grad_norm": 0.7135746857327024, + "learning_rate": 8.836965030700217e-06, + "loss": 0.0077, + "step": 17614 + }, + { + "epoch": 7.163481089873932, + "grad_norm": 1.5484391351752784, + "learning_rate": 8.835958724248201e-06, + "loss": 0.0239, + "step": 17615 + }, + { + "epoch": 7.163887759251728, + "grad_norm": 1.139119853497536, + "learning_rate": 8.834952429745655e-06, + "loss": 0.0146, + "step": 17616 + }, + { + "epoch": 7.164294428629524, + "grad_norm": 0.4560694659897201, + "learning_rate": 8.833946147202908e-06, + "loss": 0.008, + "step": 17617 + }, + { + "epoch": 7.1647010980073205, + "grad_norm": 0.2991597382315783, + "learning_rate": 8.83293987663029e-06, + "loss": 0.0036, + "step": 17618 + }, + { + "epoch": 7.165107767385116, + "grad_norm": 0.4478845730163463, + "learning_rate": 8.831933618038133e-06, + "loss": 0.0065, + "step": 17619 + }, + { + "epoch": 7.165514436762912, + "grad_norm": 2.2603194709748724, + "learning_rate": 8.830927371436762e-06, + "loss": 0.0255, + "step": 17620 + }, + { + "epoch": 7.165921106140708, + "grad_norm": 0.17154803930735377, + "learning_rate": 8.829921136836515e-06, + "loss": 0.0024, + "step": 17621 + }, + { + "epoch": 7.1663277755185035, + "grad_norm": 0.014821490948962214, + "learning_rate": 8.828914914247713e-06, + "loss": 0.0002, + "step": 17622 + }, + { + "epoch": 7.166734444896299, + "grad_norm": 1.679560115244229, + "learning_rate": 8.82790870368069e-06, + "loss": 0.0214, + "step": 17623 + }, + { + "epoch": 7.167141114274095, + "grad_norm": 3.584493693144644, + "learning_rate": 8.826902505145773e-06, + "loss": 0.0469, + "step": 17624 + }, + { + "epoch": 7.167547783651891, + "grad_norm": 0.04892080390420836, + "learning_rate": 8.825896318653294e-06, + "loss": 0.0006, + "step": 17625 + }, + { + "epoch": 7.167954453029687, + "grad_norm": 3.579520128483419, + "learning_rate": 8.824890144213578e-06, + "loss": 0.0743, + "step": 17626 + }, + { + "epoch": 7.168361122407482, + "grad_norm": 0.01449885563661882, + "learning_rate": 8.823883981836956e-06, + "loss": 0.0002, + "step": 17627 + }, + { + "epoch": 7.168767791785278, + "grad_norm": 0.2731224694518726, + "learning_rate": 8.822877831533757e-06, + "loss": 0.0037, + "step": 17628 + }, + { + "epoch": 7.169174461163075, + "grad_norm": 9.603624228630872, + "learning_rate": 8.821871693314313e-06, + "loss": 0.263, + "step": 17629 + }, + { + "epoch": 7.1695811305408705, + "grad_norm": 0.4575195915921449, + "learning_rate": 8.820865567188945e-06, + "loss": 0.0048, + "step": 17630 + }, + { + "epoch": 7.169987799918666, + "grad_norm": 0.9334788340725507, + "learning_rate": 8.819859453167986e-06, + "loss": 0.0107, + "step": 17631 + }, + { + "epoch": 7.170394469296462, + "grad_norm": 0.038887760868486394, + "learning_rate": 8.818853351261767e-06, + "loss": 0.0005, + "step": 17632 + }, + { + "epoch": 7.170801138674258, + "grad_norm": 2.1688859556864446, + "learning_rate": 8.81784726148061e-06, + "loss": 0.0356, + "step": 17633 + }, + { + "epoch": 7.171207808052054, + "grad_norm": 6.551778171830066, + "learning_rate": 8.816841183834844e-06, + "loss": 0.2203, + "step": 17634 + }, + { + "epoch": 7.171614477429849, + 
"grad_norm": 2.0528186156269417, + "learning_rate": 8.815835118334803e-06, + "loss": 0.0444, + "step": 17635 + }, + { + "epoch": 7.172021146807645, + "grad_norm": 9.034805669551073, + "learning_rate": 8.814829064990809e-06, + "loss": 0.1546, + "step": 17636 + }, + { + "epoch": 7.172427816185441, + "grad_norm": 0.001675494217168132, + "learning_rate": 8.813823023813188e-06, + "loss": 0.0, + "step": 17637 + }, + { + "epoch": 7.172834485563237, + "grad_norm": 7.354985819403556, + "learning_rate": 8.812816994812275e-06, + "loss": 0.3384, + "step": 17638 + }, + { + "epoch": 7.173241154941033, + "grad_norm": 8.122551409228318, + "learning_rate": 8.811810977998391e-06, + "loss": 0.1286, + "step": 17639 + }, + { + "epoch": 7.173647824318829, + "grad_norm": 3.6924880877779382, + "learning_rate": 8.810804973381863e-06, + "loss": 0.0691, + "step": 17640 + }, + { + "epoch": 7.174054493696625, + "grad_norm": 0.01638778375399224, + "learning_rate": 8.809798980973025e-06, + "loss": 0.0001, + "step": 17641 + }, + { + "epoch": 7.174461163074421, + "grad_norm": 0.047553887186863666, + "learning_rate": 8.808793000782198e-06, + "loss": 0.0007, + "step": 17642 + }, + { + "epoch": 7.174867832452216, + "grad_norm": 0.39871205645321595, + "learning_rate": 8.807787032819708e-06, + "loss": 0.005, + "step": 17643 + }, + { + "epoch": 7.175274501830012, + "grad_norm": 4.421848516924109, + "learning_rate": 8.806781077095887e-06, + "loss": 0.0573, + "step": 17644 + }, + { + "epoch": 7.175681171207808, + "grad_norm": 0.043738327917984196, + "learning_rate": 8.805775133621058e-06, + "loss": 0.0005, + "step": 17645 + }, + { + "epoch": 7.176087840585604, + "grad_norm": 10.01923266666785, + "learning_rate": 8.804769202405547e-06, + "loss": 0.1077, + "step": 17646 + }, + { + "epoch": 7.176494509963399, + "grad_norm": 1.7014117820392585, + "learning_rate": 8.803763283459682e-06, + "loss": 0.0284, + "step": 17647 + }, + { + "epoch": 7.176901179341195, + "grad_norm": 7.763576012385428, + "learning_rate": 8.80275737679379e-06, + "loss": 0.2998, + "step": 17648 + }, + { + "epoch": 7.177307848718992, + "grad_norm": 1.4582501179633927, + "learning_rate": 8.801751482418196e-06, + "loss": 0.0335, + "step": 17649 + }, + { + "epoch": 7.177714518096788, + "grad_norm": 0.3209284073746679, + "learning_rate": 8.800745600343224e-06, + "loss": 0.0049, + "step": 17650 + }, + { + "epoch": 7.178121187474583, + "grad_norm": 1.2893721869575994, + "learning_rate": 8.799739730579205e-06, + "loss": 0.0175, + "step": 17651 + }, + { + "epoch": 7.178527856852379, + "grad_norm": 1.957001612020304, + "learning_rate": 8.798733873136461e-06, + "loss": 0.0337, + "step": 17652 + }, + { + "epoch": 7.178934526230175, + "grad_norm": 9.90102545373015, + "learning_rate": 8.797728028025317e-06, + "loss": 0.3298, + "step": 17653 + }, + { + "epoch": 7.179341195607971, + "grad_norm": 5.2814083747493275, + "learning_rate": 8.7967221952561e-06, + "loss": 0.1035, + "step": 17654 + }, + { + "epoch": 7.1797478649857664, + "grad_norm": 0.5435307532992556, + "learning_rate": 8.795716374839138e-06, + "loss": 0.0057, + "step": 17655 + }, + { + "epoch": 7.180154534363562, + "grad_norm": 0.8403022667746549, + "learning_rate": 8.79471056678475e-06, + "loss": 0.0081, + "step": 17656 + }, + { + "epoch": 7.180561203741358, + "grad_norm": 0.8634178210156813, + "learning_rate": 8.793704771103267e-06, + "loss": 0.005, + "step": 17657 + }, + { + "epoch": 7.180967873119154, + "grad_norm": 3.590387190632809, + "learning_rate": 8.792698987805011e-06, + "loss": 0.0628, + "step": 17658 + }, 
+ { + "epoch": 7.18137454249695, + "grad_norm": 1.3693338452665496, + "learning_rate": 8.79169321690031e-06, + "loss": 0.0162, + "step": 17659 + }, + { + "epoch": 7.181781211874746, + "grad_norm": 1.028349018351456, + "learning_rate": 8.790687458399486e-06, + "loss": 0.0129, + "step": 17660 + }, + { + "epoch": 7.182187881252542, + "grad_norm": 3.7871201412619406, + "learning_rate": 8.789681712312858e-06, + "loss": 0.1248, + "step": 17661 + }, + { + "epoch": 7.182594550630338, + "grad_norm": 4.1221459168177335, + "learning_rate": 8.788675978650762e-06, + "loss": 0.0689, + "step": 17662 + }, + { + "epoch": 7.1830012200081335, + "grad_norm": 0.02335203852585959, + "learning_rate": 8.787670257423515e-06, + "loss": 0.0004, + "step": 17663 + }, + { + "epoch": 7.183407889385929, + "grad_norm": 0.05793865640259881, + "learning_rate": 8.786664548641443e-06, + "loss": 0.0011, + "step": 17664 + }, + { + "epoch": 7.183814558763725, + "grad_norm": 0.2073186732595837, + "learning_rate": 8.785658852314868e-06, + "loss": 0.0024, + "step": 17665 + }, + { + "epoch": 7.184221228141521, + "grad_norm": 0.4896438716642552, + "learning_rate": 8.784653168454118e-06, + "loss": 0.0079, + "step": 17666 + }, + { + "epoch": 7.1846278975193165, + "grad_norm": 1.142161184510164, + "learning_rate": 8.783647497069515e-06, + "loss": 0.0177, + "step": 17667 + }, + { + "epoch": 7.185034566897112, + "grad_norm": 0.5382697302972549, + "learning_rate": 8.78264183817138e-06, + "loss": 0.0038, + "step": 17668 + }, + { + "epoch": 7.185441236274908, + "grad_norm": 0.01614514623501254, + "learning_rate": 8.781636191770044e-06, + "loss": 0.0003, + "step": 17669 + }, + { + "epoch": 7.185847905652705, + "grad_norm": 1.0918923026941083, + "learning_rate": 8.780630557875822e-06, + "loss": 0.0137, + "step": 17670 + }, + { + "epoch": 7.1862545750305005, + "grad_norm": 1.4810237960657435, + "learning_rate": 8.779624936499041e-06, + "loss": 0.0157, + "step": 17671 + }, + { + "epoch": 7.186661244408296, + "grad_norm": 0.9876072857462828, + "learning_rate": 8.778619327650026e-06, + "loss": 0.0184, + "step": 17672 + }, + { + "epoch": 7.187067913786092, + "grad_norm": 1.3159893120497799, + "learning_rate": 8.777613731339098e-06, + "loss": 0.0167, + "step": 17673 + }, + { + "epoch": 7.187474583163888, + "grad_norm": 3.2788904837041706, + "learning_rate": 8.77660814757658e-06, + "loss": 0.0528, + "step": 17674 + }, + { + "epoch": 7.1878812525416835, + "grad_norm": 0.04705081106040838, + "learning_rate": 8.775602576372795e-06, + "loss": 0.0003, + "step": 17675 + }, + { + "epoch": 7.188287921919479, + "grad_norm": 0.7028411144062314, + "learning_rate": 8.774597017738065e-06, + "loss": 0.0093, + "step": 17676 + }, + { + "epoch": 7.188694591297275, + "grad_norm": 0.9493702012152478, + "learning_rate": 8.773591471682715e-06, + "loss": 0.0122, + "step": 17677 + }, + { + "epoch": 7.189101260675071, + "grad_norm": 2.8089966751486997, + "learning_rate": 8.772585938217064e-06, + "loss": 0.0516, + "step": 17678 + }, + { + "epoch": 7.189507930052867, + "grad_norm": 1.2266349742120841, + "learning_rate": 8.771580417351437e-06, + "loss": 0.0133, + "step": 17679 + }, + { + "epoch": 7.189914599430663, + "grad_norm": 2.5853476848423362, + "learning_rate": 8.770574909096157e-06, + "loss": 0.0227, + "step": 17680 + }, + { + "epoch": 7.190321268808459, + "grad_norm": 6.701490613116449, + "learning_rate": 8.76956941346154e-06, + "loss": 0.4467, + "step": 17681 + }, + { + "epoch": 7.190727938186255, + "grad_norm": 0.011561311816147352, + "learning_rate": 
8.768563930457917e-06, + "loss": 0.0001, + "step": 17682 + }, + { + "epoch": 7.1911346075640505, + "grad_norm": 1.790312431433823, + "learning_rate": 8.767558460095602e-06, + "loss": 0.0221, + "step": 17683 + }, + { + "epoch": 7.191541276941846, + "grad_norm": 0.8522445689754926, + "learning_rate": 8.76655300238492e-06, + "loss": 0.0124, + "step": 17684 + }, + { + "epoch": 7.191947946319642, + "grad_norm": 2.516028415574573, + "learning_rate": 8.765547557336193e-06, + "loss": 0.0272, + "step": 17685 + }, + { + "epoch": 7.192354615697438, + "grad_norm": 1.2011429305630739, + "learning_rate": 8.764542124959742e-06, + "loss": 0.0206, + "step": 17686 + }, + { + "epoch": 7.192761285075234, + "grad_norm": 1.1706068601633421, + "learning_rate": 8.763536705265887e-06, + "loss": 0.0084, + "step": 17687 + }, + { + "epoch": 7.193167954453029, + "grad_norm": 1.2428120710265564, + "learning_rate": 8.762531298264949e-06, + "loss": 0.0195, + "step": 17688 + }, + { + "epoch": 7.193574623830825, + "grad_norm": 1.773997360734956, + "learning_rate": 8.761525903967252e-06, + "loss": 0.0205, + "step": 17689 + }, + { + "epoch": 7.193981293208622, + "grad_norm": 3.9396108554134766, + "learning_rate": 8.760520522383115e-06, + "loss": 0.0718, + "step": 17690 + }, + { + "epoch": 7.1943879625864176, + "grad_norm": 0.4471863252149106, + "learning_rate": 8.759515153522856e-06, + "loss": 0.0047, + "step": 17691 + }, + { + "epoch": 7.194794631964213, + "grad_norm": 0.2994575118780418, + "learning_rate": 8.758509797396802e-06, + "loss": 0.0033, + "step": 17692 + }, + { + "epoch": 7.195201301342009, + "grad_norm": 0.035631322315149684, + "learning_rate": 8.757504454015267e-06, + "loss": 0.0005, + "step": 17693 + }, + { + "epoch": 7.195607970719805, + "grad_norm": 0.13049106253579057, + "learning_rate": 8.756499123388572e-06, + "loss": 0.0015, + "step": 17694 + }, + { + "epoch": 7.196014640097601, + "grad_norm": 10.820503352647622, + "learning_rate": 8.755493805527044e-06, + "loss": 0.4815, + "step": 17695 + }, + { + "epoch": 7.196421309475396, + "grad_norm": 2.024627031431096, + "learning_rate": 8.754488500440998e-06, + "loss": 0.03, + "step": 17696 + }, + { + "epoch": 7.196827978853192, + "grad_norm": 0.690841768107837, + "learning_rate": 8.753483208140753e-06, + "loss": 0.0103, + "step": 17697 + }, + { + "epoch": 7.197234648230988, + "grad_norm": 0.0517246315190959, + "learning_rate": 8.752477928636628e-06, + "loss": 0.0006, + "step": 17698 + }, + { + "epoch": 7.197641317608784, + "grad_norm": 8.731931829854966, + "learning_rate": 8.75147266193895e-06, + "loss": 0.2248, + "step": 17699 + }, + { + "epoch": 7.19804798698658, + "grad_norm": 4.537921380855537, + "learning_rate": 8.750467408058031e-06, + "loss": 0.1355, + "step": 17700 + }, + { + "epoch": 7.198454656364376, + "grad_norm": 0.443678755687933, + "learning_rate": 8.749462167004191e-06, + "loss": 0.0053, + "step": 17701 + }, + { + "epoch": 7.198861325742172, + "grad_norm": 0.05307358530462171, + "learning_rate": 8.748456938787755e-06, + "loss": 0.0007, + "step": 17702 + }, + { + "epoch": 7.199267995119968, + "grad_norm": 3.2017887045893545, + "learning_rate": 8.747451723419036e-06, + "loss": 0.0349, + "step": 17703 + }, + { + "epoch": 7.199674664497763, + "grad_norm": 0.49510103908045877, + "learning_rate": 8.746446520908353e-06, + "loss": 0.0051, + "step": 17704 + }, + { + "epoch": 7.200081333875559, + "grad_norm": 4.973957206935447, + "learning_rate": 8.745441331266032e-06, + "loss": 0.1313, + "step": 17705 + }, + { + "epoch": 7.200488003253355, + "grad_norm": 
14.792044934844661, + "learning_rate": 8.744436154502386e-06, + "loss": 0.404, + "step": 17706 + }, + { + "epoch": 7.200894672631151, + "grad_norm": 0.03602547774343324, + "learning_rate": 8.743430990627733e-06, + "loss": 0.0004, + "step": 17707 + }, + { + "epoch": 7.2013013420089464, + "grad_norm": 7.158773755205568, + "learning_rate": 8.742425839652395e-06, + "loss": 0.1932, + "step": 17708 + }, + { + "epoch": 7.201708011386742, + "grad_norm": 0.1501083398804428, + "learning_rate": 8.741420701586688e-06, + "loss": 0.001, + "step": 17709 + }, + { + "epoch": 7.202114680764538, + "grad_norm": 0.21006062801588302, + "learning_rate": 8.74041557644093e-06, + "loss": 0.0028, + "step": 17710 + }, + { + "epoch": 7.202521350142335, + "grad_norm": 4.0711570511550415, + "learning_rate": 8.73941046422544e-06, + "loss": 0.0457, + "step": 17711 + }, + { + "epoch": 7.20292801952013, + "grad_norm": 0.13176435352392896, + "learning_rate": 8.738405364950538e-06, + "loss": 0.0019, + "step": 17712 + }, + { + "epoch": 7.203334688897926, + "grad_norm": 0.06798232505582812, + "learning_rate": 8.73740027862654e-06, + "loss": 0.001, + "step": 17713 + }, + { + "epoch": 7.203741358275722, + "grad_norm": 0.9943673792093397, + "learning_rate": 8.736395205263762e-06, + "loss": 0.0105, + "step": 17714 + }, + { + "epoch": 7.204148027653518, + "grad_norm": 1.041515962749559, + "learning_rate": 8.735390144872524e-06, + "loss": 0.0143, + "step": 17715 + }, + { + "epoch": 7.2045546970313135, + "grad_norm": 3.552353148534996, + "learning_rate": 8.734385097463143e-06, + "loss": 0.0862, + "step": 17716 + }, + { + "epoch": 7.204961366409109, + "grad_norm": 0.7877078589601806, + "learning_rate": 8.733380063045933e-06, + "loss": 0.0104, + "step": 17717 + }, + { + "epoch": 7.205368035786905, + "grad_norm": 6.812137520840944, + "learning_rate": 8.732375041631218e-06, + "loss": 0.1927, + "step": 17718 + }, + { + "epoch": 7.205774705164701, + "grad_norm": 0.5674823866198019, + "learning_rate": 8.73137003322931e-06, + "loss": 0.0052, + "step": 17719 + }, + { + "epoch": 7.2061813745424965, + "grad_norm": 1.9913497882999578, + "learning_rate": 8.730365037850524e-06, + "loss": 0.0254, + "step": 17720 + }, + { + "epoch": 7.206588043920293, + "grad_norm": 3.8345971524459186, + "learning_rate": 8.729360055505183e-06, + "loss": 0.0951, + "step": 17721 + }, + { + "epoch": 7.206994713298089, + "grad_norm": 1.4023203292156219, + "learning_rate": 8.728355086203601e-06, + "loss": 0.0156, + "step": 17722 + }, + { + "epoch": 7.207401382675885, + "grad_norm": 1.4752306511089368, + "learning_rate": 8.727350129956093e-06, + "loss": 0.0195, + "step": 17723 + }, + { + "epoch": 7.2078080520536805, + "grad_norm": 3.4901738459083385, + "learning_rate": 8.726345186772977e-06, + "loss": 0.0475, + "step": 17724 + }, + { + "epoch": 7.208214721431476, + "grad_norm": 0.12494716495590998, + "learning_rate": 8.725340256664568e-06, + "loss": 0.0016, + "step": 17725 + }, + { + "epoch": 7.208621390809272, + "grad_norm": 4.013071829895013, + "learning_rate": 8.724335339641185e-06, + "loss": 0.0673, + "step": 17726 + }, + { + "epoch": 7.209028060187068, + "grad_norm": 3.5006633373690392, + "learning_rate": 8.723330435713137e-06, + "loss": 0.0531, + "step": 17727 + }, + { + "epoch": 7.2094347295648635, + "grad_norm": 3.2177642885810327, + "learning_rate": 8.722325544890748e-06, + "loss": 0.0371, + "step": 17728 + }, + { + "epoch": 7.209841398942659, + "grad_norm": 0.0006577175993811693, + "learning_rate": 8.72132066718433e-06, + "loss": 0.0, + "step": 17729 + }, + { + 
"epoch": 7.210248068320455, + "grad_norm": 0.1249004208484058, + "learning_rate": 8.720315802604198e-06, + "loss": 0.0013, + "step": 17730 + }, + { + "epoch": 7.210654737698252, + "grad_norm": 0.5703651165328174, + "learning_rate": 8.71931095116067e-06, + "loss": 0.007, + "step": 17731 + }, + { + "epoch": 7.2110614070760475, + "grad_norm": 2.1034655378790803, + "learning_rate": 8.71830611286406e-06, + "loss": 0.0241, + "step": 17732 + }, + { + "epoch": 7.211468076453843, + "grad_norm": 4.406521951538091, + "learning_rate": 8.71730128772468e-06, + "loss": 0.0538, + "step": 17733 + }, + { + "epoch": 7.211874745831639, + "grad_norm": 1.2760945132754542, + "learning_rate": 8.716296475752849e-06, + "loss": 0.0138, + "step": 17734 + }, + { + "epoch": 7.212281415209435, + "grad_norm": 8.812774900788579, + "learning_rate": 8.715291676958882e-06, + "loss": 0.1943, + "step": 17735 + }, + { + "epoch": 7.2126880845872305, + "grad_norm": 0.13313801106772885, + "learning_rate": 8.71428689135309e-06, + "loss": 0.0017, + "step": 17736 + }, + { + "epoch": 7.213094753965026, + "grad_norm": 0.00716303547575652, + "learning_rate": 8.713282118945792e-06, + "loss": 0.0001, + "step": 17737 + }, + { + "epoch": 7.213501423342822, + "grad_norm": 6.856626999923599, + "learning_rate": 8.712277359747302e-06, + "loss": 0.2319, + "step": 17738 + }, + { + "epoch": 7.213908092720618, + "grad_norm": 0.08724655386735161, + "learning_rate": 8.71127261376793e-06, + "loss": 0.0006, + "step": 17739 + }, + { + "epoch": 7.214314762098414, + "grad_norm": 1.1114672405128903, + "learning_rate": 8.710267881017994e-06, + "loss": 0.0175, + "step": 17740 + }, + { + "epoch": 7.21472143147621, + "grad_norm": 0.38180703714763203, + "learning_rate": 8.709263161507808e-06, + "loss": 0.0015, + "step": 17741 + }, + { + "epoch": 7.215128100854006, + "grad_norm": 5.995270926093023, + "learning_rate": 8.708258455247686e-06, + "loss": 0.2356, + "step": 17742 + }, + { + "epoch": 7.215534770231802, + "grad_norm": 0.02897199171197683, + "learning_rate": 8.707253762247938e-06, + "loss": 0.0003, + "step": 17743 + }, + { + "epoch": 7.2159414396095976, + "grad_norm": 0.7792468181884629, + "learning_rate": 8.706249082518885e-06, + "loss": 0.0086, + "step": 17744 + }, + { + "epoch": 7.216348108987393, + "grad_norm": 5.558771777971385, + "learning_rate": 8.705244416070835e-06, + "loss": 0.1309, + "step": 17745 + }, + { + "epoch": 7.216754778365189, + "grad_norm": 1.9898831962671495, + "learning_rate": 8.704239762914099e-06, + "loss": 0.0193, + "step": 17746 + }, + { + "epoch": 7.217161447742985, + "grad_norm": 1.6234936619955824, + "learning_rate": 8.703235123058999e-06, + "loss": 0.0158, + "step": 17747 + }, + { + "epoch": 7.217568117120781, + "grad_norm": 13.729525816162827, + "learning_rate": 8.702230496515841e-06, + "loss": 0.5287, + "step": 17748 + }, + { + "epoch": 7.217974786498576, + "grad_norm": 0.13940698753292088, + "learning_rate": 8.701225883294942e-06, + "loss": 0.0015, + "step": 17749 + }, + { + "epoch": 7.218381455876372, + "grad_norm": 1.7614039634593879, + "learning_rate": 8.700221283406608e-06, + "loss": 0.0244, + "step": 17750 + }, + { + "epoch": 7.218788125254168, + "grad_norm": 0.010794172018292423, + "learning_rate": 8.699216696861162e-06, + "loss": 0.0001, + "step": 17751 + }, + { + "epoch": 7.219194794631965, + "grad_norm": 0.13063803000653765, + "learning_rate": 8.69821212366891e-06, + "loss": 0.0014, + "step": 17752 + }, + { + "epoch": 7.21960146400976, + "grad_norm": 6.541832290280666, + "learning_rate": 8.697207563840163e-06, + 
"loss": 0.3482, + "step": 17753 + }, + { + "epoch": 7.220008133387556, + "grad_norm": 0.2706847342152539, + "learning_rate": 8.696203017385237e-06, + "loss": 0.0025, + "step": 17754 + }, + { + "epoch": 7.220414802765352, + "grad_norm": 1.453469242382616, + "learning_rate": 8.695198484314446e-06, + "loss": 0.0147, + "step": 17755 + }, + { + "epoch": 7.220821472143148, + "grad_norm": 1.1552119585158813, + "learning_rate": 8.694193964638095e-06, + "loss": 0.0188, + "step": 17756 + }, + { + "epoch": 7.221228141520943, + "grad_norm": 1.1030803613746534, + "learning_rate": 8.693189458366504e-06, + "loss": 0.0135, + "step": 17757 + }, + { + "epoch": 7.221634810898739, + "grad_norm": 0.6343265063849325, + "learning_rate": 8.692184965509979e-06, + "loss": 0.005, + "step": 17758 + }, + { + "epoch": 7.222041480276535, + "grad_norm": 4.5145666290163815, + "learning_rate": 8.691180486078831e-06, + "loss": 0.067, + "step": 17759 + }, + { + "epoch": 7.222448149654331, + "grad_norm": 13.762544724654198, + "learning_rate": 8.690176020083383e-06, + "loss": 0.2979, + "step": 17760 + }, + { + "epoch": 7.222854819032127, + "grad_norm": 12.189715950590616, + "learning_rate": 8.68917156753393e-06, + "loss": 0.1441, + "step": 17761 + }, + { + "epoch": 7.223261488409923, + "grad_norm": 4.326195725618496, + "learning_rate": 8.688167128440794e-06, + "loss": 0.1407, + "step": 17762 + }, + { + "epoch": 7.223668157787719, + "grad_norm": 0.21067360676136154, + "learning_rate": 8.68716270281428e-06, + "loss": 0.0011, + "step": 17763 + }, + { + "epoch": 7.224074827165515, + "grad_norm": 0.20186352713676145, + "learning_rate": 8.6861582906647e-06, + "loss": 0.0023, + "step": 17764 + }, + { + "epoch": 7.22448149654331, + "grad_norm": 0.32436775486286396, + "learning_rate": 8.68515389200237e-06, + "loss": 0.0028, + "step": 17765 + }, + { + "epoch": 7.224888165921106, + "grad_norm": 9.711148516179886, + "learning_rate": 8.684149506837596e-06, + "loss": 0.2627, + "step": 17766 + }, + { + "epoch": 7.225294835298902, + "grad_norm": 0.15785105858359277, + "learning_rate": 8.68314513518069e-06, + "loss": 0.0014, + "step": 17767 + }, + { + "epoch": 7.225701504676698, + "grad_norm": 0.6474845249792053, + "learning_rate": 8.68214077704196e-06, + "loss": 0.009, + "step": 17768 + }, + { + "epoch": 7.2261081740544935, + "grad_norm": 0.1555055632604752, + "learning_rate": 8.68113643243172e-06, + "loss": 0.0016, + "step": 17769 + }, + { + "epoch": 7.226514843432289, + "grad_norm": 0.02121433829245445, + "learning_rate": 8.680132101360278e-06, + "loss": 0.0003, + "step": 17770 + }, + { + "epoch": 7.226921512810085, + "grad_norm": 3.5869049799285104, + "learning_rate": 8.679127783837942e-06, + "loss": 0.0764, + "step": 17771 + }, + { + "epoch": 7.227328182187882, + "grad_norm": 9.89326354398459, + "learning_rate": 8.678123479875028e-06, + "loss": 0.3443, + "step": 17772 + }, + { + "epoch": 7.227734851565677, + "grad_norm": 0.714021814443836, + "learning_rate": 8.677119189481838e-06, + "loss": 0.011, + "step": 17773 + }, + { + "epoch": 7.228141520943473, + "grad_norm": 0.5094739599208484, + "learning_rate": 8.676114912668686e-06, + "loss": 0.0049, + "step": 17774 + }, + { + "epoch": 7.228548190321269, + "grad_norm": 0.0015601107251126116, + "learning_rate": 8.675110649445883e-06, + "loss": 0.0, + "step": 17775 + }, + { + "epoch": 7.228954859699065, + "grad_norm": 0.681468212291076, + "learning_rate": 8.674106399823736e-06, + "loss": 0.0075, + "step": 17776 + }, + { + "epoch": 7.2293615290768605, + "grad_norm": 4.474367035256437, + 
"learning_rate": 8.67310216381255e-06, + "loss": 0.058, + "step": 17777 + }, + { + "epoch": 7.229768198454656, + "grad_norm": 12.530650668248033, + "learning_rate": 8.672097941422641e-06, + "loss": 0.7639, + "step": 17778 + }, + { + "epoch": 7.230174867832452, + "grad_norm": 0.7273793131141869, + "learning_rate": 8.671093732664316e-06, + "loss": 0.0151, + "step": 17779 + }, + { + "epoch": 7.230581537210248, + "grad_norm": 0.03745896928510365, + "learning_rate": 8.670089537547883e-06, + "loss": 0.0006, + "step": 17780 + }, + { + "epoch": 7.2309882065880435, + "grad_norm": 1.5618751341848616, + "learning_rate": 8.669085356083646e-06, + "loss": 0.0055, + "step": 17781 + }, + { + "epoch": 7.23139487596584, + "grad_norm": 2.4932501320942118, + "learning_rate": 8.668081188281921e-06, + "loss": 0.0426, + "step": 17782 + }, + { + "epoch": 7.231801545343636, + "grad_norm": 1.766595618499996, + "learning_rate": 8.667077034153015e-06, + "loss": 0.0252, + "step": 17783 + }, + { + "epoch": 7.232208214721432, + "grad_norm": 8.998933869701123, + "learning_rate": 8.666072893707229e-06, + "loss": 0.2089, + "step": 17784 + }, + { + "epoch": 7.2326148840992275, + "grad_norm": 0.41789900649439415, + "learning_rate": 8.66506876695488e-06, + "loss": 0.0064, + "step": 17785 + }, + { + "epoch": 7.233021553477023, + "grad_norm": 0.05794150522517334, + "learning_rate": 8.66406465390627e-06, + "loss": 0.0007, + "step": 17786 + }, + { + "epoch": 7.233428222854819, + "grad_norm": 4.30996017672134, + "learning_rate": 8.663060554571707e-06, + "loss": 0.0524, + "step": 17787 + }, + { + "epoch": 7.233834892232615, + "grad_norm": 0.03924449856974255, + "learning_rate": 8.662056468961503e-06, + "loss": 0.0005, + "step": 17788 + }, + { + "epoch": 7.2342415616104105, + "grad_norm": 1.3499623962097422, + "learning_rate": 8.661052397085964e-06, + "loss": 0.0135, + "step": 17789 + }, + { + "epoch": 7.234648230988206, + "grad_norm": 2.174033523907284, + "learning_rate": 8.660048338955395e-06, + "loss": 0.0293, + "step": 17790 + }, + { + "epoch": 7.235054900366002, + "grad_norm": 0.21070123803105723, + "learning_rate": 8.659044294580102e-06, + "loss": 0.0027, + "step": 17791 + }, + { + "epoch": 7.235461569743798, + "grad_norm": 4.7728901930181244, + "learning_rate": 8.658040263970395e-06, + "loss": 0.0453, + "step": 17792 + }, + { + "epoch": 7.2358682391215945, + "grad_norm": 0.1697279507404979, + "learning_rate": 8.657036247136583e-06, + "loss": 0.0018, + "step": 17793 + }, + { + "epoch": 7.23627490849939, + "grad_norm": 1.721361672150284, + "learning_rate": 8.656032244088965e-06, + "loss": 0.028, + "step": 17794 + }, + { + "epoch": 7.236681577877186, + "grad_norm": 9.154416269281096, + "learning_rate": 8.655028254837856e-06, + "loss": 0.1782, + "step": 17795 + }, + { + "epoch": 7.237088247254982, + "grad_norm": 0.7832428189781834, + "learning_rate": 8.654024279393558e-06, + "loss": 0.0132, + "step": 17796 + }, + { + "epoch": 7.2374949166327776, + "grad_norm": 0.07609485221723608, + "learning_rate": 8.653020317766377e-06, + "loss": 0.0013, + "step": 17797 + }, + { + "epoch": 7.237901586010573, + "grad_norm": 8.982448014637901, + "learning_rate": 8.652016369966622e-06, + "loss": 0.195, + "step": 17798 + }, + { + "epoch": 7.238308255388369, + "grad_norm": 0.6005100924878289, + "learning_rate": 8.651012436004597e-06, + "loss": 0.0074, + "step": 17799 + }, + { + "epoch": 7.238714924766165, + "grad_norm": 9.428021202363178, + "learning_rate": 8.650008515890606e-06, + "loss": 0.1997, + "step": 17800 + }, + { + "epoch": 
7.239121594143961, + "grad_norm": 0.847091984982843, + "learning_rate": 8.64900460963496e-06, + "loss": 0.0108, + "step": 17801 + }, + { + "epoch": 7.239528263521757, + "grad_norm": 8.693392939835945, + "learning_rate": 8.648000717247961e-06, + "loss": 0.4257, + "step": 17802 + }, + { + "epoch": 7.239934932899553, + "grad_norm": 4.00068617919895, + "learning_rate": 8.646996838739916e-06, + "loss": 0.1711, + "step": 17803 + }, + { + "epoch": 7.240341602277349, + "grad_norm": 3.4673495196036224, + "learning_rate": 8.645992974121127e-06, + "loss": 0.0302, + "step": 17804 + }, + { + "epoch": 7.240748271655145, + "grad_norm": 0.015493727295736604, + "learning_rate": 8.644989123401903e-06, + "loss": 0.0002, + "step": 17805 + }, + { + "epoch": 7.24115494103294, + "grad_norm": 0.10134083862225052, + "learning_rate": 8.643985286592547e-06, + "loss": 0.0012, + "step": 17806 + }, + { + "epoch": 7.241561610410736, + "grad_norm": 7.926414733282403, + "learning_rate": 8.642981463703364e-06, + "loss": 0.1316, + "step": 17807 + }, + { + "epoch": 7.241968279788532, + "grad_norm": 0.2535892478524905, + "learning_rate": 8.64197765474466e-06, + "loss": 0.0036, + "step": 17808 + }, + { + "epoch": 7.242374949166328, + "grad_norm": 0.8931703490232943, + "learning_rate": 8.64097385972674e-06, + "loss": 0.0171, + "step": 17809 + }, + { + "epoch": 7.242781618544123, + "grad_norm": 0.04355404406139455, + "learning_rate": 8.639970078659904e-06, + "loss": 0.0005, + "step": 17810 + }, + { + "epoch": 7.243188287921919, + "grad_norm": 0.4320516629089057, + "learning_rate": 8.638966311554464e-06, + "loss": 0.0083, + "step": 17811 + }, + { + "epoch": 7.243594957299715, + "grad_norm": 2.1998123329765633, + "learning_rate": 8.637962558420718e-06, + "loss": 0.0255, + "step": 17812 + }, + { + "epoch": 7.244001626677512, + "grad_norm": 0.1330586989242475, + "learning_rate": 8.63695881926897e-06, + "loss": 0.0018, + "step": 17813 + }, + { + "epoch": 7.244408296055307, + "grad_norm": 9.651701899283431, + "learning_rate": 8.635955094109528e-06, + "loss": 0.2283, + "step": 17814 + }, + { + "epoch": 7.244814965433103, + "grad_norm": 0.06779116716824171, + "learning_rate": 8.634951382952693e-06, + "loss": 0.0008, + "step": 17815 + }, + { + "epoch": 7.245221634810899, + "grad_norm": 3.699109049643787, + "learning_rate": 8.63394768580877e-06, + "loss": 0.0513, + "step": 17816 + }, + { + "epoch": 7.245628304188695, + "grad_norm": 0.05326820178741351, + "learning_rate": 8.632944002688058e-06, + "loss": 0.0008, + "step": 17817 + }, + { + "epoch": 7.24603497356649, + "grad_norm": 0.1401392066182607, + "learning_rate": 8.631940333600867e-06, + "loss": 0.0009, + "step": 17818 + }, + { + "epoch": 7.246441642944286, + "grad_norm": 0.25290758503329047, + "learning_rate": 8.630936678557499e-06, + "loss": 0.0033, + "step": 17819 + }, + { + "epoch": 7.246848312322082, + "grad_norm": 0.17843242836915651, + "learning_rate": 8.62993303756825e-06, + "loss": 0.0016, + "step": 17820 + }, + { + "epoch": 7.247254981699878, + "grad_norm": 0.3586546245935015, + "learning_rate": 8.62892941064343e-06, + "loss": 0.005, + "step": 17821 + }, + { + "epoch": 7.2476616510776735, + "grad_norm": 0.015033489787580106, + "learning_rate": 8.627925797793339e-06, + "loss": 0.0001, + "step": 17822 + }, + { + "epoch": 7.24806832045547, + "grad_norm": 0.012681176956129506, + "learning_rate": 8.62692219902828e-06, + "loss": 0.0001, + "step": 17823 + }, + { + "epoch": 7.248474989833266, + "grad_norm": 6.0701090052195505, + "learning_rate": 8.625918614358559e-06, + "loss": 
0.1196, + "step": 17824 + }, + { + "epoch": 7.248881659211062, + "grad_norm": 12.119270598201048, + "learning_rate": 8.624915043794474e-06, + "loss": 0.1412, + "step": 17825 + }, + { + "epoch": 7.249288328588857, + "grad_norm": 2.0801329556114587, + "learning_rate": 8.623911487346324e-06, + "loss": 0.0311, + "step": 17826 + }, + { + "epoch": 7.249694997966653, + "grad_norm": 5.017247034102495, + "learning_rate": 8.622907945024418e-06, + "loss": 0.1433, + "step": 17827 + }, + { + "epoch": 7.250101667344449, + "grad_norm": 5.409733104339499, + "learning_rate": 8.621904416839056e-06, + "loss": 0.0648, + "step": 17828 + }, + { + "epoch": 7.250508336722245, + "grad_norm": 0.507295243926916, + "learning_rate": 8.620900902800537e-06, + "loss": 0.007, + "step": 17829 + }, + { + "epoch": 7.2509150061000405, + "grad_norm": 0.13094772968162463, + "learning_rate": 8.619897402919164e-06, + "loss": 0.0014, + "step": 17830 + }, + { + "epoch": 7.251321675477836, + "grad_norm": 6.237827338107207, + "learning_rate": 8.618893917205242e-06, + "loss": 0.2542, + "step": 17831 + }, + { + "epoch": 7.251728344855632, + "grad_norm": 0.0012383112920123908, + "learning_rate": 8.617890445669065e-06, + "loss": 0.0, + "step": 17832 + }, + { + "epoch": 7.252135014233428, + "grad_norm": 7.803373871391928, + "learning_rate": 8.61688698832094e-06, + "loss": 0.1331, + "step": 17833 + }, + { + "epoch": 7.252541683611224, + "grad_norm": 0.34569633518133963, + "learning_rate": 8.615883545171166e-06, + "loss": 0.0063, + "step": 17834 + }, + { + "epoch": 7.25294835298902, + "grad_norm": 0.011087314022579049, + "learning_rate": 8.614880116230045e-06, + "loss": 0.0001, + "step": 17835 + }, + { + "epoch": 7.253355022366816, + "grad_norm": 0.4598579210424511, + "learning_rate": 8.613876701507876e-06, + "loss": 0.0059, + "step": 17836 + }, + { + "epoch": 7.253761691744612, + "grad_norm": 4.963121563628686, + "learning_rate": 8.612873301014958e-06, + "loss": 0.0801, + "step": 17837 + }, + { + "epoch": 7.2541683611224075, + "grad_norm": 0.08569134812978449, + "learning_rate": 8.611869914761598e-06, + "loss": 0.0008, + "step": 17838 + }, + { + "epoch": 7.254575030500203, + "grad_norm": 0.016972736845967243, + "learning_rate": 8.610866542758089e-06, + "loss": 0.0002, + "step": 17839 + }, + { + "epoch": 7.254981699877999, + "grad_norm": 0.08209034844222501, + "learning_rate": 8.609863185014736e-06, + "loss": 0.0006, + "step": 17840 + }, + { + "epoch": 7.255388369255795, + "grad_norm": 5.047625211248451, + "learning_rate": 8.608859841541835e-06, + "loss": 0.1741, + "step": 17841 + }, + { + "epoch": 7.2557950386335905, + "grad_norm": 0.0014887790777723332, + "learning_rate": 8.60785651234969e-06, + "loss": 0.0, + "step": 17842 + }, + { + "epoch": 7.256201708011387, + "grad_norm": 0.012023434309003307, + "learning_rate": 8.606853197448596e-06, + "loss": 0.0002, + "step": 17843 + }, + { + "epoch": 7.256608377389183, + "grad_norm": 4.639896246997213, + "learning_rate": 8.605849896848856e-06, + "loss": 0.1646, + "step": 17844 + }, + { + "epoch": 7.257015046766979, + "grad_norm": 6.892133304406218, + "learning_rate": 8.604846610560771e-06, + "loss": 0.1562, + "step": 17845 + }, + { + "epoch": 7.2574217161447745, + "grad_norm": 4.52261831913358, + "learning_rate": 8.603843338594634e-06, + "loss": 0.0959, + "step": 17846 + }, + { + "epoch": 7.25782838552257, + "grad_norm": 0.19897401727465064, + "learning_rate": 8.60284008096075e-06, + "loss": 0.003, + "step": 17847 + }, + { + "epoch": 7.258235054900366, + "grad_norm": 11.352848683062557, + 
"learning_rate": 8.601836837669414e-06, + "loss": 0.2153, + "step": 17848 + }, + { + "epoch": 7.258641724278162, + "grad_norm": 5.073887159031416, + "learning_rate": 8.600833608730928e-06, + "loss": 0.2438, + "step": 17849 + }, + { + "epoch": 7.2590483936559576, + "grad_norm": 4.54187339351781, + "learning_rate": 8.59983039415559e-06, + "loss": 0.0727, + "step": 17850 + }, + { + "epoch": 7.259455063033753, + "grad_norm": 0.10436922282207305, + "learning_rate": 8.598827193953696e-06, + "loss": 0.0015, + "step": 17851 + }, + { + "epoch": 7.259861732411549, + "grad_norm": 1.7261834641488618, + "learning_rate": 8.597824008135548e-06, + "loss": 0.0215, + "step": 17852 + }, + { + "epoch": 7.260268401789345, + "grad_norm": 0.7005676599679314, + "learning_rate": 8.59682083671144e-06, + "loss": 0.0143, + "step": 17853 + }, + { + "epoch": 7.2606750711671415, + "grad_norm": 2.037838712955338, + "learning_rate": 8.595817679691673e-06, + "loss": 0.0344, + "step": 17854 + }, + { + "epoch": 7.261081740544937, + "grad_norm": 0.0034736161206672536, + "learning_rate": 8.594814537086546e-06, + "loss": 0.0001, + "step": 17855 + }, + { + "epoch": 7.261488409922733, + "grad_norm": 1.5300846053335138, + "learning_rate": 8.593811408906352e-06, + "loss": 0.0178, + "step": 17856 + }, + { + "epoch": 7.261895079300529, + "grad_norm": 13.435707903863438, + "learning_rate": 8.592808295161394e-06, + "loss": 0.243, + "step": 17857 + }, + { + "epoch": 7.262301748678325, + "grad_norm": 4.110770561598646, + "learning_rate": 8.591805195861966e-06, + "loss": 0.062, + "step": 17858 + }, + { + "epoch": 7.26270841805612, + "grad_norm": 7.6726246214343785, + "learning_rate": 8.590802111018365e-06, + "loss": 0.1778, + "step": 17859 + }, + { + "epoch": 7.263115087433916, + "grad_norm": 2.779458487185553, + "learning_rate": 8.589799040640896e-06, + "loss": 0.0658, + "step": 17860 + }, + { + "epoch": 7.263521756811712, + "grad_norm": 4.1613429354314855, + "learning_rate": 8.588795984739843e-06, + "loss": 0.0941, + "step": 17861 + }, + { + "epoch": 7.263928426189508, + "grad_norm": 2.3797995214636476, + "learning_rate": 8.587792943325512e-06, + "loss": 0.0331, + "step": 17862 + }, + { + "epoch": 7.264335095567303, + "grad_norm": 5.13082753735362, + "learning_rate": 8.586789916408197e-06, + "loss": 0.1447, + "step": 17863 + }, + { + "epoch": 7.2647417649451, + "grad_norm": 0.0186409512921829, + "learning_rate": 8.585786903998192e-06, + "loss": 0.0002, + "step": 17864 + }, + { + "epoch": 7.265148434322896, + "grad_norm": 14.583854818784024, + "learning_rate": 8.584783906105799e-06, + "loss": 0.7906, + "step": 17865 + }, + { + "epoch": 7.265555103700692, + "grad_norm": 0.004861093189080124, + "learning_rate": 8.583780922741312e-06, + "loss": 0.0001, + "step": 17866 + }, + { + "epoch": 7.265961773078487, + "grad_norm": 2.858399551151581, + "learning_rate": 8.582777953915023e-06, + "loss": 0.0553, + "step": 17867 + }, + { + "epoch": 7.266368442456283, + "grad_norm": 3.609978158994141, + "learning_rate": 8.581774999637236e-06, + "loss": 0.0425, + "step": 17868 + }, + { + "epoch": 7.266775111834079, + "grad_norm": 0.7711697057136637, + "learning_rate": 8.58077205991824e-06, + "loss": 0.0094, + "step": 17869 + }, + { + "epoch": 7.267181781211875, + "grad_norm": 1.8892030882375501, + "learning_rate": 8.579769134768335e-06, + "loss": 0.0281, + "step": 17870 + }, + { + "epoch": 7.26758845058967, + "grad_norm": 0.092271682342563, + "learning_rate": 8.57876622419781e-06, + "loss": 0.0008, + "step": 17871 + }, + { + "epoch": 7.267995119967466, + 
"grad_norm": 0.01248014096896458, + "learning_rate": 8.57776332821697e-06, + "loss": 0.0001, + "step": 17872 + }, + { + "epoch": 7.268401789345262, + "grad_norm": 0.2813862168284039, + "learning_rate": 8.576760446836103e-06, + "loss": 0.0044, + "step": 17873 + }, + { + "epoch": 7.268808458723058, + "grad_norm": 9.48303634114117, + "learning_rate": 8.575757580065505e-06, + "loss": 0.6532, + "step": 17874 + }, + { + "epoch": 7.269215128100854, + "grad_norm": 4.829178494186814, + "learning_rate": 8.574754727915475e-06, + "loss": 0.0856, + "step": 17875 + }, + { + "epoch": 7.26962179747865, + "grad_norm": 0.00442854217305078, + "learning_rate": 8.573751890396304e-06, + "loss": 0.0001, + "step": 17876 + }, + { + "epoch": 7.270028466856446, + "grad_norm": 5.249543648779602, + "learning_rate": 8.572749067518286e-06, + "loss": 0.1089, + "step": 17877 + }, + { + "epoch": 7.270435136234242, + "grad_norm": 0.7467759976826819, + "learning_rate": 8.57174625929172e-06, + "loss": 0.011, + "step": 17878 + }, + { + "epoch": 7.270841805612037, + "grad_norm": 0.01802323479747809, + "learning_rate": 8.570743465726897e-06, + "loss": 0.0002, + "step": 17879 + }, + { + "epoch": 7.271248474989833, + "grad_norm": 10.333928665021062, + "learning_rate": 8.569740686834109e-06, + "loss": 0.3839, + "step": 17880 + }, + { + "epoch": 7.271655144367629, + "grad_norm": 14.745754323930745, + "learning_rate": 8.568737922623654e-06, + "loss": 0.4148, + "step": 17881 + }, + { + "epoch": 7.272061813745425, + "grad_norm": 0.1359441189760734, + "learning_rate": 8.567735173105826e-06, + "loss": 0.002, + "step": 17882 + }, + { + "epoch": 7.2724684831232205, + "grad_norm": 0.25017582870312666, + "learning_rate": 8.566732438290917e-06, + "loss": 0.0043, + "step": 17883 + }, + { + "epoch": 7.272875152501017, + "grad_norm": 8.151284215926898, + "learning_rate": 8.56572971818922e-06, + "loss": 0.318, + "step": 17884 + }, + { + "epoch": 7.273281821878813, + "grad_norm": 6.3380665027331835, + "learning_rate": 8.56472701281103e-06, + "loss": 0.2284, + "step": 17885 + }, + { + "epoch": 7.273688491256609, + "grad_norm": 0.5833934910448079, + "learning_rate": 8.563724322166638e-06, + "loss": 0.0077, + "step": 17886 + }, + { + "epoch": 7.274095160634404, + "grad_norm": 0.011571945302878493, + "learning_rate": 8.56272164626634e-06, + "loss": 0.0002, + "step": 17887 + }, + { + "epoch": 7.2745018300122, + "grad_norm": 1.677963108087471, + "learning_rate": 8.561718985120428e-06, + "loss": 0.0265, + "step": 17888 + }, + { + "epoch": 7.274908499389996, + "grad_norm": 6.277223749829739, + "learning_rate": 8.560716338739194e-06, + "loss": 0.1231, + "step": 17889 + }, + { + "epoch": 7.275315168767792, + "grad_norm": 3.0932029098303717, + "learning_rate": 8.55971370713293e-06, + "loss": 0.035, + "step": 17890 + }, + { + "epoch": 7.2757218381455875, + "grad_norm": 3.18539187415593, + "learning_rate": 8.55871109031193e-06, + "loss": 0.0466, + "step": 17891 + }, + { + "epoch": 7.276128507523383, + "grad_norm": 4.189194337203542, + "learning_rate": 8.557708488286487e-06, + "loss": 0.0604, + "step": 17892 + }, + { + "epoch": 7.276535176901179, + "grad_norm": 0.34121141880423506, + "learning_rate": 8.556705901066894e-06, + "loss": 0.0056, + "step": 17893 + }, + { + "epoch": 7.276941846278975, + "grad_norm": 0.17483072157361465, + "learning_rate": 8.555703328663439e-06, + "loss": 0.0035, + "step": 17894 + }, + { + "epoch": 7.277348515656771, + "grad_norm": 3.162505661671986, + "learning_rate": 8.554700771086416e-06, + "loss": 0.0572, + "step": 17895 + }, + { + 
"epoch": 7.277755185034567, + "grad_norm": 0.12279706275431887, + "learning_rate": 8.553698228346117e-06, + "loss": 0.0018, + "step": 17896 + }, + { + "epoch": 7.278161854412363, + "grad_norm": 7.614412288314583, + "learning_rate": 8.552695700452833e-06, + "loss": 0.0355, + "step": 17897 + }, + { + "epoch": 7.278568523790159, + "grad_norm": 17.530683748854752, + "learning_rate": 8.551693187416859e-06, + "loss": 0.6584, + "step": 17898 + }, + { + "epoch": 7.2789751931679545, + "grad_norm": 3.415446805379355, + "learning_rate": 8.550690689248482e-06, + "loss": 0.063, + "step": 17899 + }, + { + "epoch": 7.27938186254575, + "grad_norm": 4.695561152489592, + "learning_rate": 8.549688205957992e-06, + "loss": 0.0229, + "step": 17900 + }, + { + "epoch": 7.279788531923546, + "grad_norm": 0.02434057425125486, + "learning_rate": 8.548685737555686e-06, + "loss": 0.0002, + "step": 17901 + }, + { + "epoch": 7.280195201301342, + "grad_norm": 3.252854365539241, + "learning_rate": 8.547683284051851e-06, + "loss": 0.0698, + "step": 17902 + }, + { + "epoch": 7.2806018706791376, + "grad_norm": 0.18569468013666493, + "learning_rate": 8.546680845456776e-06, + "loss": 0.0029, + "step": 17903 + }, + { + "epoch": 7.281008540056933, + "grad_norm": 0.11121148789680864, + "learning_rate": 8.545678421780754e-06, + "loss": 0.0023, + "step": 17904 + }, + { + "epoch": 7.28141520943473, + "grad_norm": 1.563274975158322, + "learning_rate": 8.544676013034076e-06, + "loss": 0.0213, + "step": 17905 + }, + { + "epoch": 7.281821878812526, + "grad_norm": 5.207884231616163, + "learning_rate": 8.543673619227031e-06, + "loss": 0.104, + "step": 17906 + }, + { + "epoch": 7.2822285481903215, + "grad_norm": 0.34944429142882594, + "learning_rate": 8.542671240369908e-06, + "loss": 0.0052, + "step": 17907 + }, + { + "epoch": 7.282635217568117, + "grad_norm": 0.560860794530912, + "learning_rate": 8.541668876473e-06, + "loss": 0.0051, + "step": 17908 + }, + { + "epoch": 7.283041886945913, + "grad_norm": 0.5564369729071909, + "learning_rate": 8.540666527546593e-06, + "loss": 0.0096, + "step": 17909 + }, + { + "epoch": 7.283448556323709, + "grad_norm": 0.7006928579535439, + "learning_rate": 8.539664193600978e-06, + "loss": 0.0042, + "step": 17910 + }, + { + "epoch": 7.283855225701505, + "grad_norm": 2.1860381773659823, + "learning_rate": 8.538661874646448e-06, + "loss": 0.0401, + "step": 17911 + }, + { + "epoch": 7.2842618950793, + "grad_norm": 1.8150963977664591, + "learning_rate": 8.537659570693289e-06, + "loss": 0.022, + "step": 17912 + }, + { + "epoch": 7.284668564457096, + "grad_norm": 0.09641803111370677, + "learning_rate": 8.536657281751788e-06, + "loss": 0.001, + "step": 17913 + }, + { + "epoch": 7.285075233834892, + "grad_norm": 7.106772789811664, + "learning_rate": 8.535655007832238e-06, + "loss": 0.1974, + "step": 17914 + }, + { + "epoch": 7.285481903212688, + "grad_norm": 0.7447035488743076, + "learning_rate": 8.534652748944928e-06, + "loss": 0.0122, + "step": 17915 + }, + { + "epoch": 7.285888572590484, + "grad_norm": 0.3895370955022405, + "learning_rate": 8.53365050510014e-06, + "loss": 0.0032, + "step": 17916 + }, + { + "epoch": 7.28629524196828, + "grad_norm": 0.27731202173719144, + "learning_rate": 8.532648276308171e-06, + "loss": 0.0057, + "step": 17917 + }, + { + "epoch": 7.286701911346076, + "grad_norm": 0.06211935287358216, + "learning_rate": 8.531646062579308e-06, + "loss": 0.0008, + "step": 17918 + }, + { + "epoch": 7.287108580723872, + "grad_norm": 5.881431033476548, + "learning_rate": 8.530643863923835e-06, + "loss": 
0.1101, + "step": 17919 + }, + { + "epoch": 7.287515250101667, + "grad_norm": 2.4280658300498787, + "learning_rate": 8.529641680352041e-06, + "loss": 0.0277, + "step": 17920 + }, + { + "epoch": 7.287921919479463, + "grad_norm": 0.9344746953121462, + "learning_rate": 8.528639511874217e-06, + "loss": 0.0149, + "step": 17921 + }, + { + "epoch": 7.288328588857259, + "grad_norm": 0.0011748306827277066, + "learning_rate": 8.52763735850065e-06, + "loss": 0.0, + "step": 17922 + }, + { + "epoch": 7.288735258235055, + "grad_norm": 2.4201852598938487, + "learning_rate": 8.526635220241622e-06, + "loss": 0.037, + "step": 17923 + }, + { + "epoch": 7.28914192761285, + "grad_norm": 0.034052417150659, + "learning_rate": 8.525633097107429e-06, + "loss": 0.0003, + "step": 17924 + }, + { + "epoch": 7.289548596990647, + "grad_norm": 7.442100362870264, + "learning_rate": 8.524630989108353e-06, + "loss": 0.1214, + "step": 17925 + }, + { + "epoch": 7.289955266368443, + "grad_norm": 3.1431153566810512, + "learning_rate": 8.523628896254681e-06, + "loss": 0.0502, + "step": 17926 + }, + { + "epoch": 7.290361935746239, + "grad_norm": 0.0005378597635152427, + "learning_rate": 8.522626818556704e-06, + "loss": 0.0, + "step": 17927 + }, + { + "epoch": 7.290768605124034, + "grad_norm": 7.045568866780859, + "learning_rate": 8.521624756024706e-06, + "loss": 0.1656, + "step": 17928 + }, + { + "epoch": 7.29117527450183, + "grad_norm": 1.7765319227906486, + "learning_rate": 8.520622708668971e-06, + "loss": 0.021, + "step": 17929 + }, + { + "epoch": 7.291581943879626, + "grad_norm": 0.019155768297393006, + "learning_rate": 8.51962067649979e-06, + "loss": 0.0002, + "step": 17930 + }, + { + "epoch": 7.291988613257422, + "grad_norm": 1.8771483604350563, + "learning_rate": 8.518618659527452e-06, + "loss": 0.0226, + "step": 17931 + }, + { + "epoch": 7.292395282635217, + "grad_norm": 2.6000551145055244, + "learning_rate": 8.517616657762234e-06, + "loss": 0.0385, + "step": 17932 + }, + { + "epoch": 7.292801952013013, + "grad_norm": 0.003946046651165821, + "learning_rate": 8.516614671214428e-06, + "loss": 0.0, + "step": 17933 + }, + { + "epoch": 7.293208621390809, + "grad_norm": 0.02053109399801102, + "learning_rate": 8.51561269989432e-06, + "loss": 0.0003, + "step": 17934 + }, + { + "epoch": 7.293615290768605, + "grad_norm": 0.7186908596564582, + "learning_rate": 8.514610743812195e-06, + "loss": 0.0079, + "step": 17935 + }, + { + "epoch": 7.294021960146401, + "grad_norm": 0.03789308584544992, + "learning_rate": 8.513608802978336e-06, + "loss": 0.0004, + "step": 17936 + }, + { + "epoch": 7.294428629524197, + "grad_norm": 3.5334821396351246, + "learning_rate": 8.512606877403033e-06, + "loss": 0.0542, + "step": 17937 + }, + { + "epoch": 7.294835298901993, + "grad_norm": 0.2615203353743625, + "learning_rate": 8.511604967096568e-06, + "loss": 0.0041, + "step": 17938 + }, + { + "epoch": 7.295241968279789, + "grad_norm": 0.15503781474899658, + "learning_rate": 8.510603072069225e-06, + "loss": 0.0016, + "step": 17939 + }, + { + "epoch": 7.295648637657584, + "grad_norm": 0.033517978359297707, + "learning_rate": 8.509601192331296e-06, + "loss": 0.0003, + "step": 17940 + }, + { + "epoch": 7.29605530703538, + "grad_norm": 17.659711538958202, + "learning_rate": 8.508599327893058e-06, + "loss": 0.1376, + "step": 17941 + }, + { + "epoch": 7.296461976413176, + "grad_norm": 0.09711803448355444, + "learning_rate": 8.507597478764797e-06, + "loss": 0.0012, + "step": 17942 + }, + { + "epoch": 7.296868645790972, + "grad_norm": 0.03653685820942597, + 
"learning_rate": 8.5065956449568e-06, + "loss": 0.0005, + "step": 17943 + }, + { + "epoch": 7.2972753151687675, + "grad_norm": 1.45093442235577, + "learning_rate": 8.505593826479354e-06, + "loss": 0.02, + "step": 17944 + }, + { + "epoch": 7.297681984546563, + "grad_norm": 0.9707116866837527, + "learning_rate": 8.504592023342737e-06, + "loss": 0.0131, + "step": 17945 + }, + { + "epoch": 7.29808865392436, + "grad_norm": 0.0576786703910429, + "learning_rate": 8.503590235557234e-06, + "loss": 0.0005, + "step": 17946 + }, + { + "epoch": 7.298495323302156, + "grad_norm": 7.515865539183133, + "learning_rate": 8.50258846313313e-06, + "loss": 0.1293, + "step": 17947 + }, + { + "epoch": 7.298901992679951, + "grad_norm": 0.08611990673768861, + "learning_rate": 8.501586706080712e-06, + "loss": 0.0008, + "step": 17948 + }, + { + "epoch": 7.299308662057747, + "grad_norm": 0.016237974800487248, + "learning_rate": 8.500584964410258e-06, + "loss": 0.0003, + "step": 17949 + }, + { + "epoch": 7.299715331435543, + "grad_norm": 0.01961320762401619, + "learning_rate": 8.499583238132055e-06, + "loss": 0.0002, + "step": 17950 + }, + { + "epoch": 7.300122000813339, + "grad_norm": 4.612757503982523, + "learning_rate": 8.498581527256387e-06, + "loss": 0.0744, + "step": 17951 + }, + { + "epoch": 7.3005286701911345, + "grad_norm": 0.013127988405557322, + "learning_rate": 8.497579831793531e-06, + "loss": 0.0002, + "step": 17952 + }, + { + "epoch": 7.30093533956893, + "grad_norm": 4.8356476147138, + "learning_rate": 8.496578151753778e-06, + "loss": 0.214, + "step": 17953 + }, + { + "epoch": 7.301342008946726, + "grad_norm": 2.331452330480225, + "learning_rate": 8.495576487147406e-06, + "loss": 0.0313, + "step": 17954 + }, + { + "epoch": 7.301748678324522, + "grad_norm": 2.6050677754108915, + "learning_rate": 8.494574837984698e-06, + "loss": 0.0412, + "step": 17955 + }, + { + "epoch": 7.302155347702318, + "grad_norm": 0.03752867007025643, + "learning_rate": 8.493573204275936e-06, + "loss": 0.0003, + "step": 17956 + }, + { + "epoch": 7.302562017080114, + "grad_norm": 11.589688779077226, + "learning_rate": 8.492571586031404e-06, + "loss": 0.3888, + "step": 17957 + }, + { + "epoch": 7.30296868645791, + "grad_norm": 0.16470358744787578, + "learning_rate": 8.491569983261385e-06, + "loss": 0.0027, + "step": 17958 + }, + { + "epoch": 7.303375355835706, + "grad_norm": 1.0974169345891251, + "learning_rate": 8.490568395976158e-06, + "loss": 0.0146, + "step": 17959 + }, + { + "epoch": 7.3037820252135015, + "grad_norm": 5.621541241044141, + "learning_rate": 8.48956682418601e-06, + "loss": 0.0876, + "step": 17960 + }, + { + "epoch": 7.304188694591297, + "grad_norm": 2.951811879715703, + "learning_rate": 8.488565267901213e-06, + "loss": 0.0407, + "step": 17961 + }, + { + "epoch": 7.304595363969093, + "grad_norm": 2.547434705053219, + "learning_rate": 8.487563727132056e-06, + "loss": 0.0378, + "step": 17962 + }, + { + "epoch": 7.305002033346889, + "grad_norm": 3.7900646747489395, + "learning_rate": 8.486562201888821e-06, + "loss": 0.061, + "step": 17963 + }, + { + "epoch": 7.305408702724685, + "grad_norm": 0.19625849985228713, + "learning_rate": 8.485560692181782e-06, + "loss": 0.0021, + "step": 17964 + }, + { + "epoch": 7.30581537210248, + "grad_norm": 2.104099526249874, + "learning_rate": 8.484559198021227e-06, + "loss": 0.0165, + "step": 17965 + }, + { + "epoch": 7.306222041480277, + "grad_norm": 3.663457208427633, + "learning_rate": 8.483557719417436e-06, + "loss": 0.0558, + "step": 17966 + }, + { + "epoch": 7.306628710858073, + 
"grad_norm": 6.321415831109326, + "learning_rate": 8.482556256380685e-06, + "loss": 0.1221, + "step": 17967 + }, + { + "epoch": 7.3070353802358685, + "grad_norm": 0.0022089971806003045, + "learning_rate": 8.48155480892126e-06, + "loss": 0.0, + "step": 17968 + }, + { + "epoch": 7.307442049613664, + "grad_norm": 0.2985794735778905, + "learning_rate": 8.480553377049438e-06, + "loss": 0.0042, + "step": 17969 + }, + { + "epoch": 7.30784871899146, + "grad_norm": 0.18421214445407014, + "learning_rate": 8.479551960775501e-06, + "loss": 0.0023, + "step": 17970 + }, + { + "epoch": 7.308255388369256, + "grad_norm": 9.152408283769718, + "learning_rate": 8.478550560109729e-06, + "loss": 0.2287, + "step": 17971 + }, + { + "epoch": 7.308662057747052, + "grad_norm": 1.9280132420183844, + "learning_rate": 8.4775491750624e-06, + "loss": 0.032, + "step": 17972 + }, + { + "epoch": 7.309068727124847, + "grad_norm": 4.868536880669836, + "learning_rate": 8.476547805643795e-06, + "loss": 0.1797, + "step": 17973 + }, + { + "epoch": 7.309475396502643, + "grad_norm": 0.13030806820392649, + "learning_rate": 8.475546451864194e-06, + "loss": 0.0016, + "step": 17974 + }, + { + "epoch": 7.309882065880439, + "grad_norm": 1.8362867096789282, + "learning_rate": 8.474545113733877e-06, + "loss": 0.0147, + "step": 17975 + }, + { + "epoch": 7.310288735258235, + "grad_norm": 0.5444567582950116, + "learning_rate": 8.473543791263121e-06, + "loss": 0.0052, + "step": 17976 + }, + { + "epoch": 7.310695404636031, + "grad_norm": 0.005217516910862776, + "learning_rate": 8.472542484462204e-06, + "loss": 0.0001, + "step": 17977 + }, + { + "epoch": 7.311102074013827, + "grad_norm": 0.06527086508797117, + "learning_rate": 8.47154119334141e-06, + "loss": 0.0007, + "step": 17978 + }, + { + "epoch": 7.311508743391623, + "grad_norm": 0.941349927479009, + "learning_rate": 8.470539917911016e-06, + "loss": 0.017, + "step": 17979 + }, + { + "epoch": 7.311915412769419, + "grad_norm": 9.976127708388375, + "learning_rate": 8.469538658181295e-06, + "loss": 0.18, + "step": 17980 + }, + { + "epoch": 7.312322082147214, + "grad_norm": 0.05870173161509692, + "learning_rate": 8.468537414162534e-06, + "loss": 0.0004, + "step": 17981 + }, + { + "epoch": 7.31272875152501, + "grad_norm": 10.191219452037599, + "learning_rate": 8.467536185865007e-06, + "loss": 0.1727, + "step": 17982 + }, + { + "epoch": 7.313135420902806, + "grad_norm": 0.09187356046933316, + "learning_rate": 8.466534973298989e-06, + "loss": 0.0011, + "step": 17983 + }, + { + "epoch": 7.313542090280602, + "grad_norm": 0.0932639638832735, + "learning_rate": 8.465533776474763e-06, + "loss": 0.001, + "step": 17984 + }, + { + "epoch": 7.313948759658397, + "grad_norm": 4.780619326654642, + "learning_rate": 8.464532595402607e-06, + "loss": 0.3072, + "step": 17985 + }, + { + "epoch": 7.314355429036193, + "grad_norm": 0.6883537274845339, + "learning_rate": 8.463531430092796e-06, + "loss": 0.0119, + "step": 17986 + }, + { + "epoch": 7.31476209841399, + "grad_norm": 0.004876173050957825, + "learning_rate": 8.462530280555606e-06, + "loss": 0.0001, + "step": 17987 + }, + { + "epoch": 7.315168767791786, + "grad_norm": 4.437307079497057, + "learning_rate": 8.461529146801318e-06, + "loss": 0.1073, + "step": 17988 + }, + { + "epoch": 7.315575437169581, + "grad_norm": 0.03615989168391906, + "learning_rate": 8.46052802884021e-06, + "loss": 0.0004, + "step": 17989 + }, + { + "epoch": 7.315982106547377, + "grad_norm": 0.014860075731510705, + "learning_rate": 8.45952692668255e-06, + "loss": 0.0002, + "step": 17990 + 
}, + { + "epoch": 7.316388775925173, + "grad_norm": 0.08166899345777133, + "learning_rate": 8.458525840338628e-06, + "loss": 0.0008, + "step": 17991 + }, + { + "epoch": 7.316795445302969, + "grad_norm": 4.387696993050846, + "learning_rate": 8.457524769818712e-06, + "loss": 0.059, + "step": 17992 + }, + { + "epoch": 7.317202114680764, + "grad_norm": 6.24374311920678, + "learning_rate": 8.456523715133077e-06, + "loss": 0.1577, + "step": 17993 + }, + { + "epoch": 7.31760878405856, + "grad_norm": 0.7303937320390067, + "learning_rate": 8.455522676292007e-06, + "loss": 0.0102, + "step": 17994 + }, + { + "epoch": 7.318015453436356, + "grad_norm": 0.08800425965617141, + "learning_rate": 8.454521653305774e-06, + "loss": 0.0012, + "step": 17995 + }, + { + "epoch": 7.318422122814152, + "grad_norm": 0.06076578243325596, + "learning_rate": 8.453520646184652e-06, + "loss": 0.0009, + "step": 17996 + }, + { + "epoch": 7.3188287921919475, + "grad_norm": 7.113441027408026, + "learning_rate": 8.45251965493892e-06, + "loss": 0.1523, + "step": 17997 + }, + { + "epoch": 7.319235461569744, + "grad_norm": 0.02167995132977648, + "learning_rate": 8.451518679578855e-06, + "loss": 0.0003, + "step": 17998 + }, + { + "epoch": 7.31964213094754, + "grad_norm": 0.934499653505126, + "learning_rate": 8.450517720114728e-06, + "loss": 0.0132, + "step": 17999 + }, + { + "epoch": 7.320048800325336, + "grad_norm": 1.6155058628729047, + "learning_rate": 8.449516776556812e-06, + "loss": 0.0225, + "step": 18000 + }, + { + "epoch": 7.320455469703131, + "grad_norm": 0.11168707979714358, + "learning_rate": 8.448515848915393e-06, + "loss": 0.0014, + "step": 18001 + }, + { + "epoch": 7.320862139080927, + "grad_norm": 8.314013605954406, + "learning_rate": 8.447514937200738e-06, + "loss": 0.2604, + "step": 18002 + }, + { + "epoch": 7.321268808458723, + "grad_norm": 0.16951203513098667, + "learning_rate": 8.446514041423123e-06, + "loss": 0.002, + "step": 18003 + }, + { + "epoch": 7.321675477836519, + "grad_norm": 0.17482949629778888, + "learning_rate": 8.445513161592822e-06, + "loss": 0.0026, + "step": 18004 + }, + { + "epoch": 7.3220821472143145, + "grad_norm": 0.7237243413573294, + "learning_rate": 8.444512297720112e-06, + "loss": 0.0087, + "step": 18005 + }, + { + "epoch": 7.32248881659211, + "grad_norm": 0.08098596415313189, + "learning_rate": 8.443511449815264e-06, + "loss": 0.0014, + "step": 18006 + }, + { + "epoch": 7.322895485969907, + "grad_norm": 6.825281179286134, + "learning_rate": 8.442510617888558e-06, + "loss": 0.1415, + "step": 18007 + }, + { + "epoch": 7.323302155347703, + "grad_norm": 4.872225248995178, + "learning_rate": 8.441509801950262e-06, + "loss": 0.084, + "step": 18008 + }, + { + "epoch": 7.3237088247254984, + "grad_norm": 0.9360357447329712, + "learning_rate": 8.440509002010656e-06, + "loss": 0.0564, + "step": 18009 + }, + { + "epoch": 7.324115494103294, + "grad_norm": 9.616313846804918, + "learning_rate": 8.439508218080004e-06, + "loss": 0.3902, + "step": 18010 + }, + { + "epoch": 7.32452216348109, + "grad_norm": 7.31286645622125, + "learning_rate": 8.438507450168589e-06, + "loss": 0.1836, + "step": 18011 + }, + { + "epoch": 7.324928832858886, + "grad_norm": 0.3113312651750859, + "learning_rate": 8.43750669828668e-06, + "loss": 0.0049, + "step": 18012 + }, + { + "epoch": 7.3253355022366815, + "grad_norm": 5.900342934856738, + "learning_rate": 8.43650596244455e-06, + "loss": 0.1774, + "step": 18013 + }, + { + "epoch": 7.325742171614477, + "grad_norm": 5.415785292450606, + "learning_rate": 8.435505242652477e-06, + 
"loss": 0.2443, + "step": 18014 + }, + { + "epoch": 7.326148840992273, + "grad_norm": 0.07971480683199184, + "learning_rate": 8.434504538920726e-06, + "loss": 0.0005, + "step": 18015 + }, + { + "epoch": 7.326555510370069, + "grad_norm": 8.197330059228188, + "learning_rate": 8.433503851259575e-06, + "loss": 0.1705, + "step": 18016 + }, + { + "epoch": 7.326962179747865, + "grad_norm": 0.9874010753557421, + "learning_rate": 8.432503179679297e-06, + "loss": 0.014, + "step": 18017 + }, + { + "epoch": 7.327368849125661, + "grad_norm": 0.08112965319083655, + "learning_rate": 8.431502524190162e-06, + "loss": 0.0012, + "step": 18018 + }, + { + "epoch": 7.327775518503457, + "grad_norm": 0.8327967045124064, + "learning_rate": 8.43050188480244e-06, + "loss": 0.0083, + "step": 18019 + }, + { + "epoch": 7.328182187881253, + "grad_norm": 3.6012979428897856, + "learning_rate": 8.42950126152641e-06, + "loss": 0.0971, + "step": 18020 + }, + { + "epoch": 7.3285888572590485, + "grad_norm": 12.151475491554987, + "learning_rate": 8.42850065437234e-06, + "loss": 0.2616, + "step": 18021 + }, + { + "epoch": 7.328995526636844, + "grad_norm": 0.007763061992784267, + "learning_rate": 8.4275000633505e-06, + "loss": 0.0001, + "step": 18022 + }, + { + "epoch": 7.32940219601464, + "grad_norm": 4.5670416059970105, + "learning_rate": 8.426499488471163e-06, + "loss": 0.1204, + "step": 18023 + }, + { + "epoch": 7.329808865392436, + "grad_norm": 4.015326376003942, + "learning_rate": 8.425498929744602e-06, + "loss": 0.1164, + "step": 18024 + }, + { + "epoch": 7.330215534770232, + "grad_norm": 16.929694703386197, + "learning_rate": 8.424498387181088e-06, + "loss": 0.197, + "step": 18025 + }, + { + "epoch": 7.330622204148027, + "grad_norm": 4.475543445624333, + "learning_rate": 8.423497860790887e-06, + "loss": 0.0899, + "step": 18026 + }, + { + "epoch": 7.331028873525823, + "grad_norm": 0.1091069524790445, + "learning_rate": 8.422497350584277e-06, + "loss": 0.0012, + "step": 18027 + }, + { + "epoch": 7.33143554290362, + "grad_norm": 0.003257440893493395, + "learning_rate": 8.421496856571526e-06, + "loss": 0.0, + "step": 18028 + }, + { + "epoch": 7.3318422122814155, + "grad_norm": 0.5186833322655555, + "learning_rate": 8.420496378762901e-06, + "loss": 0.0028, + "step": 18029 + }, + { + "epoch": 7.332248881659211, + "grad_norm": 0.3827971893256455, + "learning_rate": 8.419495917168677e-06, + "loss": 0.0065, + "step": 18030 + }, + { + "epoch": 7.332655551037007, + "grad_norm": 0.3381313025128505, + "learning_rate": 8.418495471799124e-06, + "loss": 0.0043, + "step": 18031 + }, + { + "epoch": 7.333062220414803, + "grad_norm": 0.2945510226507775, + "learning_rate": 8.417495042664509e-06, + "loss": 0.0034, + "step": 18032 + }, + { + "epoch": 7.333468889792599, + "grad_norm": 3.991496237047312, + "learning_rate": 8.416494629775107e-06, + "loss": 0.0551, + "step": 18033 + }, + { + "epoch": 7.333875559170394, + "grad_norm": 2.124346906213045, + "learning_rate": 8.415494233141183e-06, + "loss": 0.0156, + "step": 18034 + }, + { + "epoch": 7.33428222854819, + "grad_norm": 0.06769987632661302, + "learning_rate": 8.414493852773008e-06, + "loss": 0.001, + "step": 18035 + }, + { + "epoch": 7.334688897925986, + "grad_norm": 0.03475122885752915, + "learning_rate": 8.41349348868085e-06, + "loss": 0.0005, + "step": 18036 + }, + { + "epoch": 7.335095567303782, + "grad_norm": 0.0845945901592462, + "learning_rate": 8.412493140874983e-06, + "loss": 0.0012, + "step": 18037 + }, + { + "epoch": 7.335502236681577, + "grad_norm": 3.663896224431856, + 
"learning_rate": 8.41149280936567e-06, + "loss": 0.0199, + "step": 18038 + }, + { + "epoch": 7.335908906059374, + "grad_norm": 0.2759755623879829, + "learning_rate": 8.410492494163184e-06, + "loss": 0.0033, + "step": 18039 + }, + { + "epoch": 7.33631557543717, + "grad_norm": 2.2721483603619337, + "learning_rate": 8.409492195277792e-06, + "loss": 0.0411, + "step": 18040 + }, + { + "epoch": 7.336722244814966, + "grad_norm": 4.051102090763644, + "learning_rate": 8.408491912719766e-06, + "loss": 0.0662, + "step": 18041 + }, + { + "epoch": 7.337128914192761, + "grad_norm": 4.836957650110503, + "learning_rate": 8.407491646499367e-06, + "loss": 0.1029, + "step": 18042 + }, + { + "epoch": 7.337535583570557, + "grad_norm": 7.482762989383019, + "learning_rate": 8.406491396626871e-06, + "loss": 0.1389, + "step": 18043 + }, + { + "epoch": 7.337942252948353, + "grad_norm": 1.6293528616405086, + "learning_rate": 8.405491163112543e-06, + "loss": 0.0206, + "step": 18044 + }, + { + "epoch": 7.338348922326149, + "grad_norm": 0.017756725895541458, + "learning_rate": 8.404490945966646e-06, + "loss": 0.0003, + "step": 18045 + }, + { + "epoch": 7.338755591703944, + "grad_norm": 8.52390295509542, + "learning_rate": 8.403490745199457e-06, + "loss": 0.1646, + "step": 18046 + }, + { + "epoch": 7.33916226108174, + "grad_norm": 1.0996578903108023, + "learning_rate": 8.40249056082124e-06, + "loss": 0.0199, + "step": 18047 + }, + { + "epoch": 7.339568930459537, + "grad_norm": 0.20976426231837508, + "learning_rate": 8.401490392842258e-06, + "loss": 0.003, + "step": 18048 + }, + { + "epoch": 7.339975599837333, + "grad_norm": 0.09442646474044777, + "learning_rate": 8.400490241272781e-06, + "loss": 0.0012, + "step": 18049 + }, + { + "epoch": 7.340382269215128, + "grad_norm": 2.261595366538274, + "learning_rate": 8.39949010612308e-06, + "loss": 0.0276, + "step": 18050 + }, + { + "epoch": 7.340788938592924, + "grad_norm": 4.639442109659134, + "learning_rate": 8.398489987403418e-06, + "loss": 0.0713, + "step": 18051 + }, + { + "epoch": 7.34119560797072, + "grad_norm": 0.008249094725858828, + "learning_rate": 8.39748988512406e-06, + "loss": 0.0001, + "step": 18052 + }, + { + "epoch": 7.341602277348516, + "grad_norm": 0.7277429235068139, + "learning_rate": 8.396489799295278e-06, + "loss": 0.0101, + "step": 18053 + }, + { + "epoch": 7.342008946726311, + "grad_norm": 0.1562593548552194, + "learning_rate": 8.395489729927333e-06, + "loss": 0.0012, + "step": 18054 + }, + { + "epoch": 7.342415616104107, + "grad_norm": 5.74997027599385, + "learning_rate": 8.394489677030493e-06, + "loss": 0.1129, + "step": 18055 + }, + { + "epoch": 7.342822285481903, + "grad_norm": 0.19130607714543857, + "learning_rate": 8.393489640615026e-06, + "loss": 0.003, + "step": 18056 + }, + { + "epoch": 7.343228954859699, + "grad_norm": 0.009720200680853585, + "learning_rate": 8.392489620691196e-06, + "loss": 0.0002, + "step": 18057 + }, + { + "epoch": 7.3436356242374945, + "grad_norm": 6.997962960639222, + "learning_rate": 8.39148961726927e-06, + "loss": 0.1679, + "step": 18058 + }, + { + "epoch": 7.344042293615291, + "grad_norm": 3.4142848795966896, + "learning_rate": 8.390489630359516e-06, + "loss": 0.0635, + "step": 18059 + }, + { + "epoch": 7.344448962993087, + "grad_norm": 0.3457468905811955, + "learning_rate": 8.389489659972189e-06, + "loss": 0.0035, + "step": 18060 + }, + { + "epoch": 7.344855632370883, + "grad_norm": 2.7746855159209054, + "learning_rate": 8.388489706117564e-06, + "loss": 0.0468, + "step": 18061 + }, + { + "epoch": 7.3452623017486784, 
+ "grad_norm": 0.004739973345111926, + "learning_rate": 8.387489768805904e-06, + "loss": 0.0001, + "step": 18062 + }, + { + "epoch": 7.345668971126474, + "grad_norm": 0.5334619923791338, + "learning_rate": 8.386489848047472e-06, + "loss": 0.0041, + "step": 18063 + }, + { + "epoch": 7.34607564050427, + "grad_norm": 0.005072403845662321, + "learning_rate": 8.385489943852534e-06, + "loss": 0.0001, + "step": 18064 + }, + { + "epoch": 7.346482309882066, + "grad_norm": 0.01672069081374382, + "learning_rate": 8.384490056231353e-06, + "loss": 0.0002, + "step": 18065 + }, + { + "epoch": 7.3468889792598615, + "grad_norm": 0.05143809306441809, + "learning_rate": 8.383490185194197e-06, + "loss": 0.0004, + "step": 18066 + }, + { + "epoch": 7.347295648637657, + "grad_norm": 0.12679911629452317, + "learning_rate": 8.382490330751325e-06, + "loss": 0.0012, + "step": 18067 + }, + { + "epoch": 7.347702318015453, + "grad_norm": 2.9279407516518168, + "learning_rate": 8.381490492913007e-06, + "loss": 0.053, + "step": 18068 + }, + { + "epoch": 7.34810898739325, + "grad_norm": 0.03616624669344736, + "learning_rate": 8.380490671689504e-06, + "loss": 0.0004, + "step": 18069 + }, + { + "epoch": 7.3485156567710455, + "grad_norm": 0.13470486896341055, + "learning_rate": 8.379490867091075e-06, + "loss": 0.0016, + "step": 18070 + }, + { + "epoch": 7.348922326148841, + "grad_norm": 0.030462169074102024, + "learning_rate": 8.37849107912799e-06, + "loss": 0.0004, + "step": 18071 + }, + { + "epoch": 7.349328995526637, + "grad_norm": 0.2687590434045375, + "learning_rate": 8.377491307810512e-06, + "loss": 0.0033, + "step": 18072 + }, + { + "epoch": 7.349735664904433, + "grad_norm": 7.836285813988076, + "learning_rate": 8.376491553148899e-06, + "loss": 0.1788, + "step": 18073 + }, + { + "epoch": 7.3501423342822285, + "grad_norm": 2.399977086107292, + "learning_rate": 8.37549181515342e-06, + "loss": 0.0361, + "step": 18074 + }, + { + "epoch": 7.350549003660024, + "grad_norm": 0.010270876132557331, + "learning_rate": 8.374492093834334e-06, + "loss": 0.0001, + "step": 18075 + }, + { + "epoch": 7.35095567303782, + "grad_norm": 4.75812138668943, + "learning_rate": 8.373492389201905e-06, + "loss": 0.0822, + "step": 18076 + }, + { + "epoch": 7.351362342415616, + "grad_norm": 8.58876182443815, + "learning_rate": 8.372492701266395e-06, + "loss": 0.1762, + "step": 18077 + }, + { + "epoch": 7.351769011793412, + "grad_norm": 0.5127918384149429, + "learning_rate": 8.371493030038067e-06, + "loss": 0.0043, + "step": 18078 + }, + { + "epoch": 7.352175681171207, + "grad_norm": 6.833249259162335, + "learning_rate": 8.370493375527183e-06, + "loss": 0.0798, + "step": 18079 + }, + { + "epoch": 7.352582350549004, + "grad_norm": 0.00543788250808156, + "learning_rate": 8.369493737744004e-06, + "loss": 0.0001, + "step": 18080 + }, + { + "epoch": 7.3529890199268, + "grad_norm": 2.7736858894292773, + "learning_rate": 8.368494116698792e-06, + "loss": 0.0337, + "step": 18081 + }, + { + "epoch": 7.3533956893045955, + "grad_norm": 0.01710093732561559, + "learning_rate": 8.36749451240181e-06, + "loss": 0.0002, + "step": 18082 + }, + { + "epoch": 7.353802358682391, + "grad_norm": 0.38213522549936313, + "learning_rate": 8.366494924863317e-06, + "loss": 0.0052, + "step": 18083 + }, + { + "epoch": 7.354209028060187, + "grad_norm": 2.346602818349854, + "learning_rate": 8.365495354093576e-06, + "loss": 0.025, + "step": 18084 + }, + { + "epoch": 7.354615697437983, + "grad_norm": 0.0870247282933917, + "learning_rate": 8.36449580010285e-06, + "loss": 0.0011, + 
"step": 18085 + }, + { + "epoch": 7.355022366815779, + "grad_norm": 0.13782570885373452, + "learning_rate": 8.363496262901395e-06, + "loss": 0.002, + "step": 18086 + }, + { + "epoch": 7.355429036193574, + "grad_norm": 2.190640783214816, + "learning_rate": 8.362496742499475e-06, + "loss": 0.0394, + "step": 18087 + }, + { + "epoch": 7.35583570557137, + "grad_norm": 0.015768869679787217, + "learning_rate": 8.361497238907352e-06, + "loss": 0.0003, + "step": 18088 + }, + { + "epoch": 7.356242374949167, + "grad_norm": 0.014047695888876744, + "learning_rate": 8.360497752135283e-06, + "loss": 0.0002, + "step": 18089 + }, + { + "epoch": 7.3566490443269625, + "grad_norm": 0.04723147504403076, + "learning_rate": 8.359498282193529e-06, + "loss": 0.0005, + "step": 18090 + }, + { + "epoch": 7.357055713704758, + "grad_norm": 0.019812350297935034, + "learning_rate": 8.358498829092352e-06, + "loss": 0.0003, + "step": 18091 + }, + { + "epoch": 7.357462383082554, + "grad_norm": 3.1460497070929505, + "learning_rate": 8.35749939284201e-06, + "loss": 0.0481, + "step": 18092 + }, + { + "epoch": 7.35786905246035, + "grad_norm": 7.181662934771534, + "learning_rate": 8.356499973452765e-06, + "loss": 0.0442, + "step": 18093 + }, + { + "epoch": 7.358275721838146, + "grad_norm": 2.327411347816433, + "learning_rate": 8.355500570934874e-06, + "loss": 0.0404, + "step": 18094 + }, + { + "epoch": 7.358682391215941, + "grad_norm": 10.40553597951881, + "learning_rate": 8.354501185298597e-06, + "loss": 0.2933, + "step": 18095 + }, + { + "epoch": 7.359089060593737, + "grad_norm": 4.955340648007525, + "learning_rate": 8.353501816554191e-06, + "loss": 0.0397, + "step": 18096 + }, + { + "epoch": 7.359495729971533, + "grad_norm": 0.05552238078661085, + "learning_rate": 8.352502464711922e-06, + "loss": 0.0005, + "step": 18097 + }, + { + "epoch": 7.359902399349329, + "grad_norm": 3.861279424472157, + "learning_rate": 8.351503129782042e-06, + "loss": 0.1138, + "step": 18098 + }, + { + "epoch": 7.360309068727124, + "grad_norm": 0.2599269232388334, + "learning_rate": 8.350503811774812e-06, + "loss": 0.0034, + "step": 18099 + }, + { + "epoch": 7.360715738104921, + "grad_norm": 7.61110806348918, + "learning_rate": 8.349504510700492e-06, + "loss": 0.1395, + "step": 18100 + }, + { + "epoch": 7.361122407482717, + "grad_norm": 1.6271955895039654, + "learning_rate": 8.348505226569336e-06, + "loss": 0.0224, + "step": 18101 + }, + { + "epoch": 7.361529076860513, + "grad_norm": 6.126025514412398, + "learning_rate": 8.347505959391608e-06, + "loss": 0.1116, + "step": 18102 + }, + { + "epoch": 7.361935746238308, + "grad_norm": 10.482282957119972, + "learning_rate": 8.346506709177561e-06, + "loss": 0.1347, + "step": 18103 + }, + { + "epoch": 7.362342415616104, + "grad_norm": 0.10510177762078884, + "learning_rate": 8.345507475937456e-06, + "loss": 0.0017, + "step": 18104 + }, + { + "epoch": 7.3627490849939, + "grad_norm": 0.6306413991520832, + "learning_rate": 8.344508259681551e-06, + "loss": 0.0094, + "step": 18105 + }, + { + "epoch": 7.363155754371696, + "grad_norm": 0.4812217389389831, + "learning_rate": 8.343509060420099e-06, + "loss": 0.0021, + "step": 18106 + }, + { + "epoch": 7.363562423749491, + "grad_norm": 0.09520190782993024, + "learning_rate": 8.342509878163361e-06, + "loss": 0.0018, + "step": 18107 + }, + { + "epoch": 7.363969093127287, + "grad_norm": 0.030212701720141625, + "learning_rate": 8.341510712921594e-06, + "loss": 0.0003, + "step": 18108 + }, + { + "epoch": 7.364375762505083, + "grad_norm": 3.1045325899990024, + 
"learning_rate": 8.340511564705052e-06, + "loss": 0.0273, + "step": 18109 + }, + { + "epoch": 7.36478243188288, + "grad_norm": 0.005156913337754466, + "learning_rate": 8.339512433523997e-06, + "loss": 0.0, + "step": 18110 + }, + { + "epoch": 7.365189101260675, + "grad_norm": 0.1855475088031431, + "learning_rate": 8.338513319388681e-06, + "loss": 0.0026, + "step": 18111 + }, + { + "epoch": 7.365595770638471, + "grad_norm": 0.13314966149604282, + "learning_rate": 8.337514222309365e-06, + "loss": 0.0017, + "step": 18112 + }, + { + "epoch": 7.366002440016267, + "grad_norm": 2.1771897483852816, + "learning_rate": 8.336515142296297e-06, + "loss": 0.0278, + "step": 18113 + }, + { + "epoch": 7.366409109394063, + "grad_norm": 9.610085236268628, + "learning_rate": 8.33551607935974e-06, + "loss": 0.2382, + "step": 18114 + }, + { + "epoch": 7.3668157787718584, + "grad_norm": 3.7629510882987325, + "learning_rate": 8.334517033509951e-06, + "loss": 0.1745, + "step": 18115 + }, + { + "epoch": 7.367222448149654, + "grad_norm": 12.270645538996193, + "learning_rate": 8.33351800475718e-06, + "loss": 0.3984, + "step": 18116 + }, + { + "epoch": 7.36762911752745, + "grad_norm": 3.4881425458322908, + "learning_rate": 8.332518993111686e-06, + "loss": 0.0577, + "step": 18117 + }, + { + "epoch": 7.368035786905246, + "grad_norm": 11.073121942679226, + "learning_rate": 8.331519998583725e-06, + "loss": 0.5348, + "step": 18118 + }, + { + "epoch": 7.3684424562830415, + "grad_norm": 0.023482446307490704, + "learning_rate": 8.330521021183549e-06, + "loss": 0.0002, + "step": 18119 + }, + { + "epoch": 7.368849125660837, + "grad_norm": 9.002426480564722, + "learning_rate": 8.329522060921416e-06, + "loss": 0.1908, + "step": 18120 + }, + { + "epoch": 7.369255795038634, + "grad_norm": 0.07688262846236418, + "learning_rate": 8.32852311780758e-06, + "loss": 0.0012, + "step": 18121 + }, + { + "epoch": 7.36966246441643, + "grad_norm": 0.9211649749338512, + "learning_rate": 8.327524191852294e-06, + "loss": 0.0093, + "step": 18122 + }, + { + "epoch": 7.3700691337942255, + "grad_norm": 9.512219104491894, + "learning_rate": 8.326525283065815e-06, + "loss": 0.1919, + "step": 18123 + }, + { + "epoch": 7.370475803172021, + "grad_norm": 1.7303497637362977, + "learning_rate": 8.325526391458397e-06, + "loss": 0.0219, + "step": 18124 + }, + { + "epoch": 7.370882472549817, + "grad_norm": 1.3879439336256514, + "learning_rate": 8.324527517040294e-06, + "loss": 0.0165, + "step": 18125 + }, + { + "epoch": 7.371289141927613, + "grad_norm": 0.04009732416392678, + "learning_rate": 8.323528659821757e-06, + "loss": 0.0004, + "step": 18126 + }, + { + "epoch": 7.3716958113054085, + "grad_norm": 0.011346709697800881, + "learning_rate": 8.322529819813042e-06, + "loss": 0.0001, + "step": 18127 + }, + { + "epoch": 7.372102480683204, + "grad_norm": 5.035534925385973, + "learning_rate": 8.321530997024405e-06, + "loss": 0.0905, + "step": 18128 + }, + { + "epoch": 7.372509150061, + "grad_norm": 0.15259597456498986, + "learning_rate": 8.320532191466092e-06, + "loss": 0.0025, + "step": 18129 + }, + { + "epoch": 7.372915819438797, + "grad_norm": 10.241813117429459, + "learning_rate": 8.319533403148368e-06, + "loss": 0.1169, + "step": 18130 + }, + { + "epoch": 7.3733224888165925, + "grad_norm": 0.2550339133727217, + "learning_rate": 8.318534632081476e-06, + "loss": 0.0015, + "step": 18131 + }, + { + "epoch": 7.373729158194388, + "grad_norm": 2.1825823579213113, + "learning_rate": 8.317535878275671e-06, + "loss": 0.047, + "step": 18132 + }, + { + "epoch": 
7.374135827572184, + "grad_norm": 1.9129542321984938, + "learning_rate": 8.31653714174121e-06, + "loss": 0.0151, + "step": 18133 + }, + { + "epoch": 7.37454249694998, + "grad_norm": 0.11280071746663765, + "learning_rate": 8.31553842248834e-06, + "loss": 0.0009, + "step": 18134 + }, + { + "epoch": 7.3749491663277755, + "grad_norm": 5.4240168816624825, + "learning_rate": 8.314539720527318e-06, + "loss": 0.1676, + "step": 18135 + }, + { + "epoch": 7.375355835705571, + "grad_norm": 1.9715171869102948, + "learning_rate": 8.313541035868393e-06, + "loss": 0.029, + "step": 18136 + }, + { + "epoch": 7.375762505083367, + "grad_norm": 8.074897862126502, + "learning_rate": 8.312542368521818e-06, + "loss": 0.0877, + "step": 18137 + }, + { + "epoch": 7.376169174461163, + "grad_norm": 9.529543689518949, + "learning_rate": 8.311543718497845e-06, + "loss": 0.0731, + "step": 18138 + }, + { + "epoch": 7.376575843838959, + "grad_norm": 2.4383727978555143, + "learning_rate": 8.310545085806725e-06, + "loss": 0.0453, + "step": 18139 + }, + { + "epoch": 7.376982513216754, + "grad_norm": 11.618252539925066, + "learning_rate": 8.30954647045871e-06, + "loss": 0.4652, + "step": 18140 + }, + { + "epoch": 7.377389182594551, + "grad_norm": 0.0014562923228475667, + "learning_rate": 8.308547872464053e-06, + "loss": 0.0, + "step": 18141 + }, + { + "epoch": 7.377795851972347, + "grad_norm": 0.02530810034166366, + "learning_rate": 8.307549291833e-06, + "loss": 0.0002, + "step": 18142 + }, + { + "epoch": 7.3782025213501425, + "grad_norm": 0.3668790878586838, + "learning_rate": 8.306550728575809e-06, + "loss": 0.0055, + "step": 18143 + }, + { + "epoch": 7.378609190727938, + "grad_norm": 0.0003147253516769494, + "learning_rate": 8.305552182702726e-06, + "loss": 0.0, + "step": 18144 + }, + { + "epoch": 7.379015860105734, + "grad_norm": 2.504614891908371, + "learning_rate": 8.304553654224e-06, + "loss": 0.016, + "step": 18145 + }, + { + "epoch": 7.37942252948353, + "grad_norm": 0.35654470073824684, + "learning_rate": 8.303555143149886e-06, + "loss": 0.0049, + "step": 18146 + }, + { + "epoch": 7.379829198861326, + "grad_norm": 0.02817360647686083, + "learning_rate": 8.302556649490634e-06, + "loss": 0.0003, + "step": 18147 + }, + { + "epoch": 7.380235868239121, + "grad_norm": 0.5081950430779261, + "learning_rate": 8.301558173256488e-06, + "loss": 0.0062, + "step": 18148 + }, + { + "epoch": 7.380642537616917, + "grad_norm": 0.12359700726972732, + "learning_rate": 8.300559714457706e-06, + "loss": 0.0022, + "step": 18149 + }, + { + "epoch": 7.381049206994713, + "grad_norm": 0.007108705692513844, + "learning_rate": 8.299561273104535e-06, + "loss": 0.0001, + "step": 18150 + }, + { + "epoch": 7.3814558763725096, + "grad_norm": 1.8984307309974864, + "learning_rate": 8.29856284920722e-06, + "loss": 0.0338, + "step": 18151 + }, + { + "epoch": 7.381862545750305, + "grad_norm": 0.15965746331359953, + "learning_rate": 8.297564442776014e-06, + "loss": 0.0011, + "step": 18152 + }, + { + "epoch": 7.382269215128101, + "grad_norm": 0.02517361793841162, + "learning_rate": 8.296566053821166e-06, + "loss": 0.0004, + "step": 18153 + }, + { + "epoch": 7.382675884505897, + "grad_norm": 1.9199670609738222, + "learning_rate": 8.295567682352926e-06, + "loss": 0.0174, + "step": 18154 + }, + { + "epoch": 7.383082553883693, + "grad_norm": 8.821707371743395, + "learning_rate": 8.29456932838154e-06, + "loss": 0.2501, + "step": 18155 + }, + { + "epoch": 7.383489223261488, + "grad_norm": 1.0912032467497002, + "learning_rate": 8.29357099191726e-06, + "loss": 
0.0169, + "step": 18156 + }, + { + "epoch": 7.383895892639284, + "grad_norm": 0.3052378459999631, + "learning_rate": 8.292572672970332e-06, + "loss": 0.0033, + "step": 18157 + }, + { + "epoch": 7.38430256201708, + "grad_norm": 4.239894950624094, + "learning_rate": 8.291574371551002e-06, + "loss": 0.1036, + "step": 18158 + }, + { + "epoch": 7.384709231394876, + "grad_norm": 2.351102144257705, + "learning_rate": 8.290576087669529e-06, + "loss": 0.0414, + "step": 18159 + }, + { + "epoch": 7.385115900772671, + "grad_norm": 11.424529359002335, + "learning_rate": 8.289577821336146e-06, + "loss": 0.3657, + "step": 18160 + }, + { + "epoch": 7.385522570150467, + "grad_norm": 5.535680238191861, + "learning_rate": 8.288579572561109e-06, + "loss": 0.0906, + "step": 18161 + }, + { + "epoch": 7.385929239528264, + "grad_norm": 3.498449076467364, + "learning_rate": 8.287581341354665e-06, + "loss": 0.123, + "step": 18162 + }, + { + "epoch": 7.38633590890606, + "grad_norm": 3.9138593862814086, + "learning_rate": 8.286583127727057e-06, + "loss": 0.0861, + "step": 18163 + }, + { + "epoch": 7.386742578283855, + "grad_norm": 0.4918746068987849, + "learning_rate": 8.285584931688537e-06, + "loss": 0.0041, + "step": 18164 + }, + { + "epoch": 7.387149247661651, + "grad_norm": 0.01543376986542233, + "learning_rate": 8.284586753249353e-06, + "loss": 0.0002, + "step": 18165 + }, + { + "epoch": 7.387555917039447, + "grad_norm": 0.07685816916265605, + "learning_rate": 8.283588592419748e-06, + "loss": 0.0008, + "step": 18166 + }, + { + "epoch": 7.387962586417243, + "grad_norm": 2.169580598239417, + "learning_rate": 8.282590449209968e-06, + "loss": 0.0271, + "step": 18167 + }, + { + "epoch": 7.3883692557950384, + "grad_norm": 0.02515909594794676, + "learning_rate": 8.281592323630263e-06, + "loss": 0.0004, + "step": 18168 + }, + { + "epoch": 7.388775925172834, + "grad_norm": 0.3999531255798282, + "learning_rate": 8.280594215690879e-06, + "loss": 0.007, + "step": 18169 + }, + { + "epoch": 7.38918259455063, + "grad_norm": 0.01014549518316285, + "learning_rate": 8.279596125402056e-06, + "loss": 0.0001, + "step": 18170 + }, + { + "epoch": 7.389589263928427, + "grad_norm": 0.42824351257629456, + "learning_rate": 8.278598052774049e-06, + "loss": 0.0043, + "step": 18171 + }, + { + "epoch": 7.389995933306222, + "grad_norm": 0.7792844973025734, + "learning_rate": 8.277599997817098e-06, + "loss": 0.0077, + "step": 18172 + }, + { + "epoch": 7.390402602684018, + "grad_norm": 0.21097751661850406, + "learning_rate": 8.27660196054145e-06, + "loss": 0.0021, + "step": 18173 + }, + { + "epoch": 7.390809272061814, + "grad_norm": 22.647275423131735, + "learning_rate": 8.27560394095735e-06, + "loss": 0.2934, + "step": 18174 + }, + { + "epoch": 7.39121594143961, + "grad_norm": 0.015596263601699824, + "learning_rate": 8.274605939075045e-06, + "loss": 0.0002, + "step": 18175 + }, + { + "epoch": 7.3916226108174055, + "grad_norm": 1.963105485642278, + "learning_rate": 8.273607954904776e-06, + "loss": 0.0379, + "step": 18176 + }, + { + "epoch": 7.392029280195201, + "grad_norm": 3.5183574736450676, + "learning_rate": 8.272609988456791e-06, + "loss": 0.0379, + "step": 18177 + }, + { + "epoch": 7.392435949572997, + "grad_norm": 0.061505368214856235, + "learning_rate": 8.271612039741334e-06, + "loss": 0.0009, + "step": 18178 + }, + { + "epoch": 7.392842618950793, + "grad_norm": 2.6924079407237795, + "learning_rate": 8.270614108768651e-06, + "loss": 0.0565, + "step": 18179 + }, + { + "epoch": 7.3932492883285885, + "grad_norm": 0.09072176071270939, + 
"learning_rate": 8.269616195548982e-06, + "loss": 0.0014, + "step": 18180 + }, + { + "epoch": 7.393655957706384, + "grad_norm": 11.328735607396512, + "learning_rate": 8.268618300092575e-06, + "loss": 0.2404, + "step": 18181 + }, + { + "epoch": 7.394062627084181, + "grad_norm": 2.323639107399281, + "learning_rate": 8.267620422409673e-06, + "loss": 0.0387, + "step": 18182 + }, + { + "epoch": 7.394469296461977, + "grad_norm": 0.9218697031349949, + "learning_rate": 8.266622562510516e-06, + "loss": 0.0121, + "step": 18183 + }, + { + "epoch": 7.3948759658397725, + "grad_norm": 0.15798295810953045, + "learning_rate": 8.265624720405354e-06, + "loss": 0.0012, + "step": 18184 + }, + { + "epoch": 7.395282635217568, + "grad_norm": 0.0641342577098761, + "learning_rate": 8.264626896104429e-06, + "loss": 0.0006, + "step": 18185 + }, + { + "epoch": 7.395689304595364, + "grad_norm": 0.2892015456397928, + "learning_rate": 8.263629089617977e-06, + "loss": 0.0028, + "step": 18186 + }, + { + "epoch": 7.39609597397316, + "grad_norm": 1.6503036118810408, + "learning_rate": 8.26263130095625e-06, + "loss": 0.039, + "step": 18187 + }, + { + "epoch": 7.3965026433509555, + "grad_norm": 1.1818490841066138, + "learning_rate": 8.261633530129486e-06, + "loss": 0.0146, + "step": 18188 + }, + { + "epoch": 7.396909312728751, + "grad_norm": 0.1848364297860721, + "learning_rate": 8.260635777147927e-06, + "loss": 0.0018, + "step": 18189 + }, + { + "epoch": 7.397315982106547, + "grad_norm": 0.2603952441347377, + "learning_rate": 8.25963804202182e-06, + "loss": 0.0036, + "step": 18190 + }, + { + "epoch": 7.397722651484343, + "grad_norm": 7.311665648102381, + "learning_rate": 8.258640324761404e-06, + "loss": 0.1528, + "step": 18191 + }, + { + "epoch": 7.3981293208621395, + "grad_norm": 0.17802115866180226, + "learning_rate": 8.257642625376922e-06, + "loss": 0.0017, + "step": 18192 + }, + { + "epoch": 7.398535990239935, + "grad_norm": 1.8751841596226635, + "learning_rate": 8.256644943878612e-06, + "loss": 0.0263, + "step": 18193 + }, + { + "epoch": 7.398942659617731, + "grad_norm": 0.8249737506309921, + "learning_rate": 8.255647280276722e-06, + "loss": 0.0096, + "step": 18194 + }, + { + "epoch": 7.399349328995527, + "grad_norm": 1.4109503331879987, + "learning_rate": 8.254649634581491e-06, + "loss": 0.0334, + "step": 18195 + }, + { + "epoch": 7.3997559983733225, + "grad_norm": 5.79271242566736, + "learning_rate": 8.253652006803158e-06, + "loss": 0.1687, + "step": 18196 + }, + { + "epoch": 7.400162667751118, + "grad_norm": 11.584903407876503, + "learning_rate": 8.252654396951968e-06, + "loss": 0.3969, + "step": 18197 + }, + { + "epoch": 7.400569337128914, + "grad_norm": 5.223613358506145, + "learning_rate": 8.251656805038157e-06, + "loss": 0.1625, + "step": 18198 + }, + { + "epoch": 7.40097600650671, + "grad_norm": 5.605382072629249, + "learning_rate": 8.25065923107197e-06, + "loss": 0.1174, + "step": 18199 + }, + { + "epoch": 7.401382675884506, + "grad_norm": 2.8826648910525177, + "learning_rate": 8.249661675063648e-06, + "loss": 0.0285, + "step": 18200 + }, + { + "epoch": 7.401789345262301, + "grad_norm": 0.018671169440072827, + "learning_rate": 8.248664137023428e-06, + "loss": 0.0002, + "step": 18201 + }, + { + "epoch": 7.402196014640097, + "grad_norm": 10.472800255965984, + "learning_rate": 8.247666616961552e-06, + "loss": 0.3446, + "step": 18202 + }, + { + "epoch": 7.402602684017894, + "grad_norm": 0.006611162397256087, + "learning_rate": 8.246669114888261e-06, + "loss": 0.0001, + "step": 18203 + }, + { + "epoch": 
7.4030093533956896, + "grad_norm": 0.09425708273622177, + "learning_rate": 8.245671630813797e-06, + "loss": 0.0015, + "step": 18204 + }, + { + "epoch": 7.403416022773485, + "grad_norm": 5.778404057940138, + "learning_rate": 8.244674164748392e-06, + "loss": 0.0453, + "step": 18205 + }, + { + "epoch": 7.403822692151281, + "grad_norm": 7.570881009284165, + "learning_rate": 8.243676716702291e-06, + "loss": 0.128, + "step": 18206 + }, + { + "epoch": 7.404229361529077, + "grad_norm": 0.1762149209320219, + "learning_rate": 8.242679286685734e-06, + "loss": 0.0022, + "step": 18207 + }, + { + "epoch": 7.404636030906873, + "grad_norm": 4.66710225925565, + "learning_rate": 8.241681874708958e-06, + "loss": 0.08, + "step": 18208 + }, + { + "epoch": 7.405042700284668, + "grad_norm": 0.0061077836599339, + "learning_rate": 8.240684480782201e-06, + "loss": 0.0001, + "step": 18209 + }, + { + "epoch": 7.405449369662464, + "grad_norm": 0.16451903436985524, + "learning_rate": 8.239687104915706e-06, + "loss": 0.0022, + "step": 18210 + }, + { + "epoch": 7.40585603904026, + "grad_norm": 4.447926553651741, + "learning_rate": 8.238689747119708e-06, + "loss": 0.0952, + "step": 18211 + }, + { + "epoch": 7.406262708418057, + "grad_norm": 13.262492514890415, + "learning_rate": 8.237692407404444e-06, + "loss": 0.5401, + "step": 18212 + }, + { + "epoch": 7.406669377795852, + "grad_norm": 8.102524023063912, + "learning_rate": 8.236695085780156e-06, + "loss": 0.2557, + "step": 18213 + }, + { + "epoch": 7.407076047173648, + "grad_norm": 0.9122531616430307, + "learning_rate": 8.235697782257082e-06, + "loss": 0.0141, + "step": 18214 + }, + { + "epoch": 7.407482716551444, + "grad_norm": 0.7365489546836373, + "learning_rate": 8.234700496845455e-06, + "loss": 0.0095, + "step": 18215 + }, + { + "epoch": 7.40788938592924, + "grad_norm": 0.009427465656272777, + "learning_rate": 8.233703229555519e-06, + "loss": 0.0001, + "step": 18216 + }, + { + "epoch": 7.408296055307035, + "grad_norm": 0.062342981627676874, + "learning_rate": 8.232705980397506e-06, + "loss": 0.0008, + "step": 18217 + }, + { + "epoch": 7.408702724684831, + "grad_norm": 0.14745741158005526, + "learning_rate": 8.231708749381658e-06, + "loss": 0.0015, + "step": 18218 + }, + { + "epoch": 7.409109394062627, + "grad_norm": 2.924409476272285, + "learning_rate": 8.230711536518205e-06, + "loss": 0.0526, + "step": 18219 + }, + { + "epoch": 7.409516063440423, + "grad_norm": 0.004029255944908537, + "learning_rate": 8.229714341817393e-06, + "loss": 0.0001, + "step": 18220 + }, + { + "epoch": 7.4099227328182184, + "grad_norm": 1.9082520659780435, + "learning_rate": 8.228717165289453e-06, + "loss": 0.0302, + "step": 18221 + }, + { + "epoch": 7.410329402196014, + "grad_norm": 5.195663734444743, + "learning_rate": 8.227720006944622e-06, + "loss": 0.0607, + "step": 18222 + }, + { + "epoch": 7.410736071573811, + "grad_norm": 0.07466972638917868, + "learning_rate": 8.226722866793137e-06, + "loss": 0.0011, + "step": 18223 + }, + { + "epoch": 7.411142740951607, + "grad_norm": 2.2433890386417312, + "learning_rate": 8.225725744845237e-06, + "loss": 0.0148, + "step": 18224 + }, + { + "epoch": 7.411549410329402, + "grad_norm": 3.0735008663779366, + "learning_rate": 8.22472864111115e-06, + "loss": 0.0361, + "step": 18225 + }, + { + "epoch": 7.411956079707198, + "grad_norm": 0.24416367579584267, + "learning_rate": 8.223731555601121e-06, + "loss": 0.0027, + "step": 18226 + }, + { + "epoch": 7.412362749084994, + "grad_norm": 6.370285991611675, + "learning_rate": 8.222734488325382e-06, + "loss": 
0.1929, + "step": 18227 + }, + { + "epoch": 7.41276941846279, + "grad_norm": 5.83904766155396, + "learning_rate": 8.221737439294167e-06, + "loss": 0.0458, + "step": 18228 + }, + { + "epoch": 7.4131760878405855, + "grad_norm": 0.023015758518519235, + "learning_rate": 8.22074040851771e-06, + "loss": 0.0003, + "step": 18229 + }, + { + "epoch": 7.413582757218381, + "grad_norm": 0.3923124889028503, + "learning_rate": 8.21974339600625e-06, + "loss": 0.0027, + "step": 18230 + }, + { + "epoch": 7.413989426596177, + "grad_norm": 1.088289499247293, + "learning_rate": 8.218746401770021e-06, + "loss": 0.0072, + "step": 18231 + }, + { + "epoch": 7.414396095973973, + "grad_norm": 1.9207271616899733, + "learning_rate": 8.217749425819255e-06, + "loss": 0.0137, + "step": 18232 + }, + { + "epoch": 7.414802765351769, + "grad_norm": 0.06576219758314196, + "learning_rate": 8.216752468164188e-06, + "loss": 0.0012, + "step": 18233 + }, + { + "epoch": 7.415209434729565, + "grad_norm": 0.32406119667718536, + "learning_rate": 8.215755528815056e-06, + "loss": 0.0039, + "step": 18234 + }, + { + "epoch": 7.415616104107361, + "grad_norm": 0.039216416557405376, + "learning_rate": 8.21475860778209e-06, + "loss": 0.0005, + "step": 18235 + }, + { + "epoch": 7.416022773485157, + "grad_norm": 0.548227382482052, + "learning_rate": 8.213761705075527e-06, + "loss": 0.0037, + "step": 18236 + }, + { + "epoch": 7.4164294428629525, + "grad_norm": 1.8287142348847978, + "learning_rate": 8.212764820705598e-06, + "loss": 0.0214, + "step": 18237 + }, + { + "epoch": 7.416836112240748, + "grad_norm": 8.011370576447236, + "learning_rate": 8.211767954682537e-06, + "loss": 0.056, + "step": 18238 + }, + { + "epoch": 7.417242781618544, + "grad_norm": 11.603054271824552, + "learning_rate": 8.210771107016579e-06, + "loss": 0.253, + "step": 18239 + }, + { + "epoch": 7.41764945099634, + "grad_norm": 0.6554809053248263, + "learning_rate": 8.209774277717957e-06, + "loss": 0.0112, + "step": 18240 + }, + { + "epoch": 7.4180561203741355, + "grad_norm": 0.13892865253778455, + "learning_rate": 8.208777466796902e-06, + "loss": 0.0016, + "step": 18241 + }, + { + "epoch": 7.418462789751931, + "grad_norm": 2.66725300449771, + "learning_rate": 8.207780674263645e-06, + "loss": 0.03, + "step": 18242 + }, + { + "epoch": 7.418869459129727, + "grad_norm": 2.158996909871094, + "learning_rate": 8.206783900128426e-06, + "loss": 0.0229, + "step": 18243 + }, + { + "epoch": 7.419276128507524, + "grad_norm": 0.8406690625981088, + "learning_rate": 8.205787144401472e-06, + "loss": 0.0089, + "step": 18244 + }, + { + "epoch": 7.4196827978853195, + "grad_norm": 0.011320639855049589, + "learning_rate": 8.204790407093012e-06, + "loss": 0.0002, + "step": 18245 + }, + { + "epoch": 7.420089467263115, + "grad_norm": 3.0020362421192908, + "learning_rate": 8.203793688213286e-06, + "loss": 0.0174, + "step": 18246 + }, + { + "epoch": 7.420496136640911, + "grad_norm": 6.959584037131964, + "learning_rate": 8.202796987772521e-06, + "loss": 0.0703, + "step": 18247 + }, + { + "epoch": 7.420902806018707, + "grad_norm": 0.0640918790257835, + "learning_rate": 8.201800305780947e-06, + "loss": 0.001, + "step": 18248 + }, + { + "epoch": 7.4213094753965025, + "grad_norm": 4.774245157919427, + "learning_rate": 8.2008036422488e-06, + "loss": 0.1635, + "step": 18249 + }, + { + "epoch": 7.421716144774298, + "grad_norm": 3.29091583194039, + "learning_rate": 8.19980699718631e-06, + "loss": 0.0432, + "step": 18250 + }, + { + "epoch": 7.422122814152094, + "grad_norm": 2.6569964442811527, + 
"learning_rate": 8.198810370603705e-06, + "loss": 0.0412, + "step": 18251 + }, + { + "epoch": 7.42252948352989, + "grad_norm": 3.9952380576079913, + "learning_rate": 8.19781376251122e-06, + "loss": 0.1086, + "step": 18252 + }, + { + "epoch": 7.4229361529076865, + "grad_norm": 0.04692074417529235, + "learning_rate": 8.196817172919083e-06, + "loss": 0.0004, + "step": 18253 + }, + { + "epoch": 7.423342822285482, + "grad_norm": 6.224558976684254, + "learning_rate": 8.195820601837526e-06, + "loss": 0.0905, + "step": 18254 + }, + { + "epoch": 7.423749491663278, + "grad_norm": 0.07354213107639951, + "learning_rate": 8.194824049276776e-06, + "loss": 0.0009, + "step": 18255 + }, + { + "epoch": 7.424156161041074, + "grad_norm": 2.328643670046781, + "learning_rate": 8.193827515247069e-06, + "loss": 0.0255, + "step": 18256 + }, + { + "epoch": 7.4245628304188696, + "grad_norm": 3.507300482495295, + "learning_rate": 8.19283099975863e-06, + "loss": 0.0695, + "step": 18257 + }, + { + "epoch": 7.424969499796665, + "grad_norm": 0.06343316271596293, + "learning_rate": 8.19183450282169e-06, + "loss": 0.0007, + "step": 18258 + }, + { + "epoch": 7.425376169174461, + "grad_norm": 0.09059938656597531, + "learning_rate": 8.190838024446484e-06, + "loss": 0.001, + "step": 18259 + }, + { + "epoch": 7.425782838552257, + "grad_norm": 0.03150155873964659, + "learning_rate": 8.189841564643228e-06, + "loss": 0.0005, + "step": 18260 + }, + { + "epoch": 7.426189507930053, + "grad_norm": 5.027659412401948, + "learning_rate": 8.188845123422165e-06, + "loss": 0.1289, + "step": 18261 + }, + { + "epoch": 7.426596177307848, + "grad_norm": 0.2953666865615492, + "learning_rate": 8.187848700793516e-06, + "loss": 0.0035, + "step": 18262 + }, + { + "epoch": 7.427002846685644, + "grad_norm": 5.692733527982785, + "learning_rate": 8.186852296767512e-06, + "loss": 0.0767, + "step": 18263 + }, + { + "epoch": 7.427409516063441, + "grad_norm": 20.676223614657808, + "learning_rate": 8.18585591135438e-06, + "loss": 0.2493, + "step": 18264 + }, + { + "epoch": 7.427816185441237, + "grad_norm": 2.379946791368448, + "learning_rate": 8.184859544564354e-06, + "loss": 0.0539, + "step": 18265 + }, + { + "epoch": 7.428222854819032, + "grad_norm": 2.9724884966879173, + "learning_rate": 8.183863196407655e-06, + "loss": 0.0637, + "step": 18266 + }, + { + "epoch": 7.428629524196828, + "grad_norm": 0.8454650995088857, + "learning_rate": 8.182866866894516e-06, + "loss": 0.0097, + "step": 18267 + }, + { + "epoch": 7.429036193574624, + "grad_norm": 6.744472484725879, + "learning_rate": 8.181870556035164e-06, + "loss": 0.0958, + "step": 18268 + }, + { + "epoch": 7.42944286295242, + "grad_norm": 29.44243826619417, + "learning_rate": 8.180874263839825e-06, + "loss": 0.5797, + "step": 18269 + }, + { + "epoch": 7.429849532330215, + "grad_norm": 0.16285529628019502, + "learning_rate": 8.179877990318726e-06, + "loss": 0.0026, + "step": 18270 + }, + { + "epoch": 7.430256201708011, + "grad_norm": 23.319375093108427, + "learning_rate": 8.178881735482097e-06, + "loss": 0.1271, + "step": 18271 + }, + { + "epoch": 7.430662871085807, + "grad_norm": 0.4184167107296327, + "learning_rate": 8.177885499340163e-06, + "loss": 0.0045, + "step": 18272 + }, + { + "epoch": 7.431069540463603, + "grad_norm": 0.4742261280893195, + "learning_rate": 8.17688928190315e-06, + "loss": 0.0056, + "step": 18273 + }, + { + "epoch": 7.431476209841399, + "grad_norm": 2.0948149544343995, + "learning_rate": 8.175893083181288e-06, + "loss": 0.0229, + "step": 18274 + }, + { + "epoch": 7.431882879219195, 
+ "grad_norm": 0.35408657058380544, + "learning_rate": 8.174896903184802e-06, + "loss": 0.0043, + "step": 18275 + }, + { + "epoch": 7.432289548596991, + "grad_norm": 1.8332124914884718, + "learning_rate": 8.173900741923915e-06, + "loss": 0.0208, + "step": 18276 + }, + { + "epoch": 7.432696217974787, + "grad_norm": 1.7673457825476766, + "learning_rate": 8.172904599408857e-06, + "loss": 0.0245, + "step": 18277 + }, + { + "epoch": 7.433102887352582, + "grad_norm": 0.2604722777360084, + "learning_rate": 8.171908475649855e-06, + "loss": 0.0036, + "step": 18278 + }, + { + "epoch": 7.433509556730378, + "grad_norm": 5.033049587754528, + "learning_rate": 8.17091237065713e-06, + "loss": 0.0791, + "step": 18279 + }, + { + "epoch": 7.433916226108174, + "grad_norm": 2.0107539242832195, + "learning_rate": 8.169916284440913e-06, + "loss": 0.03, + "step": 18280 + }, + { + "epoch": 7.43432289548597, + "grad_norm": 0.747192681380977, + "learning_rate": 8.168920217011424e-06, + "loss": 0.0099, + "step": 18281 + }, + { + "epoch": 7.4347295648637655, + "grad_norm": 5.129001386665116, + "learning_rate": 8.167924168378892e-06, + "loss": 0.0449, + "step": 18282 + }, + { + "epoch": 7.435136234241561, + "grad_norm": 1.0032968485143472, + "learning_rate": 8.166928138553538e-06, + "loss": 0.0126, + "step": 18283 + }, + { + "epoch": 7.435542903619357, + "grad_norm": 8.078671133249832, + "learning_rate": 8.165932127545591e-06, + "loss": 0.2329, + "step": 18284 + }, + { + "epoch": 7.435949572997154, + "grad_norm": 0.1275444531486987, + "learning_rate": 8.164936135365274e-06, + "loss": 0.0016, + "step": 18285 + }, + { + "epoch": 7.436356242374949, + "grad_norm": 6.221600180701548, + "learning_rate": 8.16394016202281e-06, + "loss": 0.0725, + "step": 18286 + }, + { + "epoch": 7.436762911752745, + "grad_norm": 0.0440859044766357, + "learning_rate": 8.162944207528425e-06, + "loss": 0.0007, + "step": 18287 + }, + { + "epoch": 7.437169581130541, + "grad_norm": 0.4315180932022956, + "learning_rate": 8.161948271892343e-06, + "loss": 0.0046, + "step": 18288 + }, + { + "epoch": 7.437576250508337, + "grad_norm": 4.395573254072473, + "learning_rate": 8.160952355124784e-06, + "loss": 0.0645, + "step": 18289 + }, + { + "epoch": 7.4379829198861325, + "grad_norm": 1.3073368169446522, + "learning_rate": 8.159956457235979e-06, + "loss": 0.0199, + "step": 18290 + }, + { + "epoch": 7.438389589263928, + "grad_norm": 0.46836024423214645, + "learning_rate": 8.158960578236145e-06, + "loss": 0.0051, + "step": 18291 + }, + { + "epoch": 7.438796258641724, + "grad_norm": 0.08997730062648733, + "learning_rate": 8.157964718135506e-06, + "loss": 0.0009, + "step": 18292 + }, + { + "epoch": 7.43920292801952, + "grad_norm": 13.690775359633523, + "learning_rate": 8.156968876944289e-06, + "loss": 0.6589, + "step": 18293 + }, + { + "epoch": 7.439609597397316, + "grad_norm": 0.6786103482585469, + "learning_rate": 8.155973054672713e-06, + "loss": 0.0088, + "step": 18294 + }, + { + "epoch": 7.440016266775112, + "grad_norm": 0.1694627062928043, + "learning_rate": 8.154977251331002e-06, + "loss": 0.0019, + "step": 18295 + }, + { + "epoch": 7.440422936152908, + "grad_norm": 1.6816463530502586, + "learning_rate": 8.153981466929375e-06, + "loss": 0.02, + "step": 18296 + }, + { + "epoch": 7.440829605530704, + "grad_norm": 1.1667429647915701, + "learning_rate": 8.15298570147806e-06, + "loss": 0.0124, + "step": 18297 + }, + { + "epoch": 7.4412362749084995, + "grad_norm": 4.764299745363035, + "learning_rate": 8.151989954987277e-06, + "loss": 0.119, + "step": 18298 + }, 
+ { + "epoch": 7.441642944286295, + "grad_norm": 0.028151154136242158, + "learning_rate": 8.150994227467244e-06, + "loss": 0.0004, + "step": 18299 + }, + { + "epoch": 7.442049613664091, + "grad_norm": 12.410206276826102, + "learning_rate": 8.14999851892819e-06, + "loss": 0.2873, + "step": 18300 + }, + { + "epoch": 7.442456283041887, + "grad_norm": 0.07617145886781794, + "learning_rate": 8.149002829380332e-06, + "loss": 0.0011, + "step": 18301 + }, + { + "epoch": 7.4428629524196825, + "grad_norm": 0.8577338456880812, + "learning_rate": 8.148007158833887e-06, + "loss": 0.0097, + "step": 18302 + }, + { + "epoch": 7.443269621797478, + "grad_norm": 2.632968764945482, + "learning_rate": 8.147011507299084e-06, + "loss": 0.011, + "step": 18303 + }, + { + "epoch": 7.443676291175274, + "grad_norm": 1.0847501537131836, + "learning_rate": 8.146015874786142e-06, + "loss": 0.0105, + "step": 18304 + }, + { + "epoch": 7.444082960553071, + "grad_norm": 3.7680415087430115, + "learning_rate": 8.145020261305277e-06, + "loss": 0.108, + "step": 18305 + }, + { + "epoch": 7.4444896299308665, + "grad_norm": 1.5017516224770535, + "learning_rate": 8.144024666866713e-06, + "loss": 0.0261, + "step": 18306 + }, + { + "epoch": 7.444896299308662, + "grad_norm": 7.458896441164828, + "learning_rate": 8.143029091480672e-06, + "loss": 0.3994, + "step": 18307 + }, + { + "epoch": 7.445302968686458, + "grad_norm": 2.5524278347610405, + "learning_rate": 8.14203353515737e-06, + "loss": 0.0583, + "step": 18308 + }, + { + "epoch": 7.445709638064254, + "grad_norm": 0.028173132251728975, + "learning_rate": 8.141037997907028e-06, + "loss": 0.0004, + "step": 18309 + }, + { + "epoch": 7.4461163074420496, + "grad_norm": 0.31815814389764935, + "learning_rate": 8.14004247973987e-06, + "loss": 0.0033, + "step": 18310 + }, + { + "epoch": 7.446522976819845, + "grad_norm": 3.2919478881140343, + "learning_rate": 8.13904698066611e-06, + "loss": 0.0761, + "step": 18311 + }, + { + "epoch": 7.446929646197641, + "grad_norm": 0.026543973583216914, + "learning_rate": 8.138051500695968e-06, + "loss": 0.0002, + "step": 18312 + }, + { + "epoch": 7.447336315575437, + "grad_norm": 5.096750661370786, + "learning_rate": 8.137056039839666e-06, + "loss": 0.1114, + "step": 18313 + }, + { + "epoch": 7.447742984953233, + "grad_norm": 0.004859135528171811, + "learning_rate": 8.136060598107422e-06, + "loss": 0.0001, + "step": 18314 + }, + { + "epoch": 7.448149654331029, + "grad_norm": 9.980980420218172, + "learning_rate": 8.135065175509452e-06, + "loss": 0.1378, + "step": 18315 + }, + { + "epoch": 7.448556323708825, + "grad_norm": 0.16795432961610682, + "learning_rate": 8.134069772055979e-06, + "loss": 0.0026, + "step": 18316 + }, + { + "epoch": 7.448962993086621, + "grad_norm": 0.5901530823106017, + "learning_rate": 8.133074387757218e-06, + "loss": 0.007, + "step": 18317 + }, + { + "epoch": 7.449369662464417, + "grad_norm": 2.4858104743039307, + "learning_rate": 8.132079022623387e-06, + "loss": 0.0124, + "step": 18318 + }, + { + "epoch": 7.449776331842212, + "grad_norm": 0.45659188347032287, + "learning_rate": 8.131083676664705e-06, + "loss": 0.0048, + "step": 18319 + }, + { + "epoch": 7.450183001220008, + "grad_norm": 0.8148523012557878, + "learning_rate": 8.130088349891392e-06, + "loss": 0.0064, + "step": 18320 + }, + { + "epoch": 7.450589670597804, + "grad_norm": 0.9771606731668551, + "learning_rate": 8.129093042313662e-06, + "loss": 0.0172, + "step": 18321 + }, + { + "epoch": 7.4509963399756, + "grad_norm": 0.009133941269947444, + "learning_rate": 
8.12809775394173e-06, + "loss": 0.0001, + "step": 18322 + }, + { + "epoch": 7.451403009353395, + "grad_norm": 3.93252403370612, + "learning_rate": 8.127102484785822e-06, + "loss": 0.0962, + "step": 18323 + }, + { + "epoch": 7.451809678731191, + "grad_norm": 0.07632594743937506, + "learning_rate": 8.126107234856147e-06, + "loss": 0.001, + "step": 18324 + }, + { + "epoch": 7.452216348108988, + "grad_norm": 0.8713181872343266, + "learning_rate": 8.125112004162924e-06, + "loss": 0.0084, + "step": 18325 + }, + { + "epoch": 7.452623017486784, + "grad_norm": 2.4461877888286083, + "learning_rate": 8.124116792716371e-06, + "loss": 0.0379, + "step": 18326 + }, + { + "epoch": 7.453029686864579, + "grad_norm": 15.606499623802748, + "learning_rate": 8.123121600526703e-06, + "loss": 0.1361, + "step": 18327 + }, + { + "epoch": 7.453436356242375, + "grad_norm": 0.7691874646875672, + "learning_rate": 8.122126427604135e-06, + "loss": 0.0061, + "step": 18328 + }, + { + "epoch": 7.453843025620171, + "grad_norm": 2.4496969380968214, + "learning_rate": 8.121131273958887e-06, + "loss": 0.0413, + "step": 18329 + }, + { + "epoch": 7.454249694997967, + "grad_norm": 3.16124814135128, + "learning_rate": 8.120136139601171e-06, + "loss": 0.0423, + "step": 18330 + }, + { + "epoch": 7.454656364375762, + "grad_norm": 0.6771176840503635, + "learning_rate": 8.119141024541201e-06, + "loss": 0.0068, + "step": 18331 + }, + { + "epoch": 7.455063033753558, + "grad_norm": 6.349720431177701, + "learning_rate": 8.118145928789198e-06, + "loss": 0.1764, + "step": 18332 + }, + { + "epoch": 7.455469703131354, + "grad_norm": 6.5099867952128605, + "learning_rate": 8.117150852355373e-06, + "loss": 0.1889, + "step": 18333 + }, + { + "epoch": 7.45587637250915, + "grad_norm": 2.23818994172747, + "learning_rate": 8.116155795249944e-06, + "loss": 0.0189, + "step": 18334 + }, + { + "epoch": 7.456283041886946, + "grad_norm": 15.712286579667389, + "learning_rate": 8.11516075748312e-06, + "loss": 0.4948, + "step": 18335 + }, + { + "epoch": 7.456689711264742, + "grad_norm": 0.5380974599834832, + "learning_rate": 8.114165739065124e-06, + "loss": 0.009, + "step": 18336 + }, + { + "epoch": 7.457096380642538, + "grad_norm": 1.4105819022753254, + "learning_rate": 8.113170740006164e-06, + "loss": 0.0178, + "step": 18337 + }, + { + "epoch": 7.457503050020334, + "grad_norm": 0.3200775375857939, + "learning_rate": 8.112175760316453e-06, + "loss": 0.0034, + "step": 18338 + }, + { + "epoch": 7.457909719398129, + "grad_norm": 0.43322641119134336, + "learning_rate": 8.111180800006211e-06, + "loss": 0.0047, + "step": 18339 + }, + { + "epoch": 7.458316388775925, + "grad_norm": 0.30255952326113333, + "learning_rate": 8.11018585908565e-06, + "loss": 0.0035, + "step": 18340 + }, + { + "epoch": 7.458723058153721, + "grad_norm": 4.359705299711251, + "learning_rate": 8.109190937564978e-06, + "loss": 0.0689, + "step": 18341 + }, + { + "epoch": 7.459129727531517, + "grad_norm": 5.345714975357161, + "learning_rate": 8.108196035454416e-06, + "loss": 0.0639, + "step": 18342 + }, + { + "epoch": 7.4595363969093125, + "grad_norm": 8.908282323734877, + "learning_rate": 8.107201152764175e-06, + "loss": 0.2732, + "step": 18343 + }, + { + "epoch": 7.459943066287108, + "grad_norm": 0.022111439982011136, + "learning_rate": 8.106206289504466e-06, + "loss": 0.0004, + "step": 18344 + }, + { + "epoch": 7.460349735664904, + "grad_norm": 12.422685549565145, + "learning_rate": 8.1052114456855e-06, + "loss": 0.4357, + "step": 18345 + }, + { + "epoch": 7.460756405042701, + "grad_norm": 
0.06854505180629386, + "learning_rate": 8.104216621317493e-06, + "loss": 0.0007, + "step": 18346 + }, + { + "epoch": 7.461163074420496, + "grad_norm": 7.276386169094741, + "learning_rate": 8.103221816410658e-06, + "loss": 0.4743, + "step": 18347 + }, + { + "epoch": 7.461569743798292, + "grad_norm": 1.340106324002887, + "learning_rate": 8.102227030975203e-06, + "loss": 0.0162, + "step": 18348 + }, + { + "epoch": 7.461976413176088, + "grad_norm": 4.194355234206089, + "learning_rate": 8.101232265021344e-06, + "loss": 0.1252, + "step": 18349 + }, + { + "epoch": 7.462383082553884, + "grad_norm": 0.0008609109002758309, + "learning_rate": 8.100237518559293e-06, + "loss": 0.0, + "step": 18350 + }, + { + "epoch": 7.4627897519316795, + "grad_norm": 12.746912147559494, + "learning_rate": 8.099242791599257e-06, + "loss": 0.2854, + "step": 18351 + }, + { + "epoch": 7.463196421309475, + "grad_norm": 1.545747703748003, + "learning_rate": 8.098248084151454e-06, + "loss": 0.0224, + "step": 18352 + }, + { + "epoch": 7.463603090687271, + "grad_norm": 0.8800753417685502, + "learning_rate": 8.09725339622609e-06, + "loss": 0.0116, + "step": 18353 + }, + { + "epoch": 7.464009760065067, + "grad_norm": 3.4371165351585002, + "learning_rate": 8.096258727833376e-06, + "loss": 0.0372, + "step": 18354 + }, + { + "epoch": 7.4644164294428625, + "grad_norm": 2.741392635420262, + "learning_rate": 8.095264078983525e-06, + "loss": 0.0396, + "step": 18355 + }, + { + "epoch": 7.464823098820659, + "grad_norm": 0.34759442231039367, + "learning_rate": 8.094269449686749e-06, + "loss": 0.0035, + "step": 18356 + }, + { + "epoch": 7.465229768198455, + "grad_norm": 6.557655501555529, + "learning_rate": 8.093274839953256e-06, + "loss": 0.2009, + "step": 18357 + }, + { + "epoch": 7.465636437576251, + "grad_norm": 1.5048055190331948, + "learning_rate": 8.092280249793258e-06, + "loss": 0.0172, + "step": 18358 + }, + { + "epoch": 7.4660431069540465, + "grad_norm": 0.41128952851233064, + "learning_rate": 8.091285679216958e-06, + "loss": 0.0063, + "step": 18359 + }, + { + "epoch": 7.466449776331842, + "grad_norm": 8.263553539534213, + "learning_rate": 8.090291128234574e-06, + "loss": 0.1755, + "step": 18360 + }, + { + "epoch": 7.466856445709638, + "grad_norm": 3.7275338526412773, + "learning_rate": 8.089296596856313e-06, + "loss": 0.062, + "step": 18361 + }, + { + "epoch": 7.467263115087434, + "grad_norm": 0.17717411921249915, + "learning_rate": 8.088302085092385e-06, + "loss": 0.0012, + "step": 18362 + }, + { + "epoch": 7.4676697844652296, + "grad_norm": 0.023285547850414073, + "learning_rate": 8.087307592952994e-06, + "loss": 0.0004, + "step": 18363 + }, + { + "epoch": 7.468076453843025, + "grad_norm": 8.693568065403591, + "learning_rate": 8.086313120448358e-06, + "loss": 0.0917, + "step": 18364 + }, + { + "epoch": 7.468483123220821, + "grad_norm": 2.006975550331547, + "learning_rate": 8.085318667588678e-06, + "loss": 0.0361, + "step": 18365 + }, + { + "epoch": 7.468889792598618, + "grad_norm": 0.3598025400825445, + "learning_rate": 8.084324234384164e-06, + "loss": 0.0039, + "step": 18366 + }, + { + "epoch": 7.4692964619764135, + "grad_norm": 6.893009668981285, + "learning_rate": 8.08332982084503e-06, + "loss": 0.4857, + "step": 18367 + }, + { + "epoch": 7.469703131354209, + "grad_norm": 0.04539056654113288, + "learning_rate": 8.082335426981478e-06, + "loss": 0.0005, + "step": 18368 + }, + { + "epoch": 7.470109800732005, + "grad_norm": 1.2183544904086792, + "learning_rate": 8.081341052803718e-06, + "loss": 0.017, + "step": 18369 + }, + { + 
"epoch": 7.470516470109801, + "grad_norm": 3.071031838336189, + "learning_rate": 8.080346698321957e-06, + "loss": 0.0546, + "step": 18370 + }, + { + "epoch": 7.470923139487597, + "grad_norm": 6.51644441357107, + "learning_rate": 8.079352363546405e-06, + "loss": 0.1671, + "step": 18371 + }, + { + "epoch": 7.471329808865392, + "grad_norm": 0.6460573861212167, + "learning_rate": 8.078358048487265e-06, + "loss": 0.0141, + "step": 18372 + }, + { + "epoch": 7.471736478243188, + "grad_norm": 8.57280704742333, + "learning_rate": 8.07736375315475e-06, + "loss": 0.1489, + "step": 18373 + }, + { + "epoch": 7.472143147620984, + "grad_norm": 1.7706711055377897, + "learning_rate": 8.076369477559063e-06, + "loss": 0.0221, + "step": 18374 + }, + { + "epoch": 7.47254981699878, + "grad_norm": 2.163797919895513, + "learning_rate": 8.07537522171041e-06, + "loss": 0.0234, + "step": 18375 + }, + { + "epoch": 7.472956486376576, + "grad_norm": 6.456339595851327, + "learning_rate": 8.074380985619e-06, + "loss": 0.1546, + "step": 18376 + }, + { + "epoch": 7.473363155754372, + "grad_norm": 0.9365987998485569, + "learning_rate": 8.073386769295038e-06, + "loss": 0.0093, + "step": 18377 + }, + { + "epoch": 7.473769825132168, + "grad_norm": 4.843250085909212, + "learning_rate": 8.072392572748731e-06, + "loss": 0.1689, + "step": 18378 + }, + { + "epoch": 7.474176494509964, + "grad_norm": 0.7641690653214595, + "learning_rate": 8.071398395990281e-06, + "loss": 0.0091, + "step": 18379 + }, + { + "epoch": 7.474583163887759, + "grad_norm": 0.15745918245413268, + "learning_rate": 8.070404239029902e-06, + "loss": 0.0021, + "step": 18380 + }, + { + "epoch": 7.474989833265555, + "grad_norm": 0.05863062463818137, + "learning_rate": 8.069410101877792e-06, + "loss": 0.0005, + "step": 18381 + }, + { + "epoch": 7.475396502643351, + "grad_norm": 0.9426316450069435, + "learning_rate": 8.068415984544159e-06, + "loss": 0.0095, + "step": 18382 + }, + { + "epoch": 7.475803172021147, + "grad_norm": 1.7700242733727434, + "learning_rate": 8.06742188703921e-06, + "loss": 0.0252, + "step": 18383 + }, + { + "epoch": 7.476209841398942, + "grad_norm": 0.019592377672303284, + "learning_rate": 8.066427809373146e-06, + "loss": 0.0002, + "step": 18384 + }, + { + "epoch": 7.476616510776738, + "grad_norm": 6.4050038547494035, + "learning_rate": 8.065433751556175e-06, + "loss": 0.0658, + "step": 18385 + }, + { + "epoch": 7.477023180154534, + "grad_norm": 0.35007425158856326, + "learning_rate": 8.064439713598499e-06, + "loss": 0.0042, + "step": 18386 + }, + { + "epoch": 7.477429849532331, + "grad_norm": 7.403042070871143, + "learning_rate": 8.063445695510323e-06, + "loss": 0.1433, + "step": 18387 + }, + { + "epoch": 7.477836518910126, + "grad_norm": 2.5066842032591454, + "learning_rate": 8.062451697301854e-06, + "loss": 0.0666, + "step": 18388 + }, + { + "epoch": 7.478243188287922, + "grad_norm": 10.360900471712764, + "learning_rate": 8.061457718983292e-06, + "loss": 0.1686, + "step": 18389 + }, + { + "epoch": 7.478649857665718, + "grad_norm": 1.4985494734542233, + "learning_rate": 8.060463760564841e-06, + "loss": 0.0158, + "step": 18390 + }, + { + "epoch": 7.479056527043514, + "grad_norm": 2.5782717561517567, + "learning_rate": 8.059469822056709e-06, + "loss": 0.035, + "step": 18391 + }, + { + "epoch": 7.479463196421309, + "grad_norm": 3.1936420890794643, + "learning_rate": 8.058475903469091e-06, + "loss": 0.0684, + "step": 18392 + }, + { + "epoch": 7.479869865799105, + "grad_norm": 1.744899729135987, + "learning_rate": 8.057482004812199e-06, + "loss": 
0.0357, + "step": 18393 + }, + { + "epoch": 7.480276535176901, + "grad_norm": 0.7845972197430989, + "learning_rate": 8.056488126096232e-06, + "loss": 0.0082, + "step": 18394 + }, + { + "epoch": 7.480683204554697, + "grad_norm": 0.06673409379969618, + "learning_rate": 8.055494267331389e-06, + "loss": 0.0007, + "step": 18395 + }, + { + "epoch": 7.4810898739324925, + "grad_norm": 7.333564760011239, + "learning_rate": 8.054500428527879e-06, + "loss": 0.2627, + "step": 18396 + }, + { + "epoch": 7.481496543310289, + "grad_norm": 3.263106362197912, + "learning_rate": 8.053506609695901e-06, + "loss": 0.1641, + "step": 18397 + }, + { + "epoch": 7.481903212688085, + "grad_norm": 0.5420271791297425, + "learning_rate": 8.052512810845656e-06, + "loss": 0.0046, + "step": 18398 + }, + { + "epoch": 7.482309882065881, + "grad_norm": 0.949598679262314, + "learning_rate": 8.051519031987347e-06, + "loss": 0.0088, + "step": 18399 + }, + { + "epoch": 7.482716551443676, + "grad_norm": 0.08191546896996521, + "learning_rate": 8.050525273131178e-06, + "loss": 0.0012, + "step": 18400 + }, + { + "epoch": 7.483123220821472, + "grad_norm": 1.995001510086325, + "learning_rate": 8.049531534287347e-06, + "loss": 0.0426, + "step": 18401 + }, + { + "epoch": 7.483529890199268, + "grad_norm": 6.621017092989636, + "learning_rate": 8.048537815466054e-06, + "loss": 0.1199, + "step": 18402 + }, + { + "epoch": 7.483936559577064, + "grad_norm": 3.3879001377986095, + "learning_rate": 8.047544116677505e-06, + "loss": 0.0792, + "step": 18403 + }, + { + "epoch": 7.4843432289548595, + "grad_norm": 1.0673414515830013, + "learning_rate": 8.046550437931899e-06, + "loss": 0.0167, + "step": 18404 + }, + { + "epoch": 7.484749898332655, + "grad_norm": 1.825816854255307, + "learning_rate": 8.045556779239435e-06, + "loss": 0.0199, + "step": 18405 + }, + { + "epoch": 7.485156567710451, + "grad_norm": 2.703980328455256, + "learning_rate": 8.044563140610315e-06, + "loss": 0.0903, + "step": 18406 + }, + { + "epoch": 7.485563237088248, + "grad_norm": 0.4294009014429737, + "learning_rate": 8.043569522054738e-06, + "loss": 0.0068, + "step": 18407 + }, + { + "epoch": 7.485969906466043, + "grad_norm": 16.751906350528493, + "learning_rate": 8.042575923582903e-06, + "loss": 0.3239, + "step": 18408 + }, + { + "epoch": 7.486376575843839, + "grad_norm": 8.085373508207864, + "learning_rate": 8.041582345205012e-06, + "loss": 0.1533, + "step": 18409 + }, + { + "epoch": 7.486783245221635, + "grad_norm": 0.6394714151009188, + "learning_rate": 8.040588786931266e-06, + "loss": 0.0062, + "step": 18410 + }, + { + "epoch": 7.487189914599431, + "grad_norm": 0.017687429920065362, + "learning_rate": 8.03959524877186e-06, + "loss": 0.0002, + "step": 18411 + }, + { + "epoch": 7.4875965839772265, + "grad_norm": 5.320372029390185, + "learning_rate": 8.038601730736996e-06, + "loss": 0.0936, + "step": 18412 + }, + { + "epoch": 7.488003253355022, + "grad_norm": 2.7414219941148876, + "learning_rate": 8.037608232836873e-06, + "loss": 0.0461, + "step": 18413 + }, + { + "epoch": 7.488409922732818, + "grad_norm": 0.07545918266334115, + "learning_rate": 8.03661475508169e-06, + "loss": 0.0008, + "step": 18414 + }, + { + "epoch": 7.488816592110614, + "grad_norm": 8.631528795174502, + "learning_rate": 8.035621297481641e-06, + "loss": 0.1545, + "step": 18415 + }, + { + "epoch": 7.4892232614884096, + "grad_norm": 0.01936728297121114, + "learning_rate": 8.03462786004693e-06, + "loss": 0.0003, + "step": 18416 + }, + { + "epoch": 7.489629930866206, + "grad_norm": 0.08814719507483679, + 
"learning_rate": 8.033634442787756e-06, + "loss": 0.0017, + "step": 18417 + }, + { + "epoch": 7.490036600244002, + "grad_norm": 2.384215016910715, + "learning_rate": 8.03264104571431e-06, + "loss": 0.1124, + "step": 18418 + }, + { + "epoch": 7.490443269621798, + "grad_norm": 7.925789575257496, + "learning_rate": 8.031647668836797e-06, + "loss": 0.6352, + "step": 18419 + }, + { + "epoch": 7.4908499389995935, + "grad_norm": 0.3726700817742986, + "learning_rate": 8.030654312165409e-06, + "loss": 0.0054, + "step": 18420 + }, + { + "epoch": 7.491256608377389, + "grad_norm": 0.4947011980032713, + "learning_rate": 8.029660975710345e-06, + "loss": 0.0104, + "step": 18421 + }, + { + "epoch": 7.491663277755185, + "grad_norm": 5.871902587927578, + "learning_rate": 8.028667659481805e-06, + "loss": 0.3637, + "step": 18422 + }, + { + "epoch": 7.492069947132981, + "grad_norm": 0.30164982396871165, + "learning_rate": 8.027674363489983e-06, + "loss": 0.0033, + "step": 18423 + }, + { + "epoch": 7.492476616510777, + "grad_norm": 4.68220157011947, + "learning_rate": 8.026681087745077e-06, + "loss": 0.04, + "step": 18424 + }, + { + "epoch": 7.492883285888572, + "grad_norm": 1.268028699680937, + "learning_rate": 8.02568783225728e-06, + "loss": 0.0178, + "step": 18425 + }, + { + "epoch": 7.493289955266368, + "grad_norm": 9.154059246064476, + "learning_rate": 8.024694597036795e-06, + "loss": 0.2807, + "step": 18426 + }, + { + "epoch": 7.493696624644164, + "grad_norm": 0.03834666058039923, + "learning_rate": 8.023701382093812e-06, + "loss": 0.0004, + "step": 18427 + }, + { + "epoch": 7.4941032940219605, + "grad_norm": 0.2115532044553354, + "learning_rate": 8.022708187438529e-06, + "loss": 0.0023, + "step": 18428 + }, + { + "epoch": 7.494509963399756, + "grad_norm": 7.633908795572741, + "learning_rate": 8.021715013081142e-06, + "loss": 0.1778, + "step": 18429 + }, + { + "epoch": 7.494916632777552, + "grad_norm": 0.9187087312904869, + "learning_rate": 8.020721859031847e-06, + "loss": 0.0119, + "step": 18430 + }, + { + "epoch": 7.495323302155348, + "grad_norm": 4.241846548169522, + "learning_rate": 8.019728725300833e-06, + "loss": 0.0822, + "step": 18431 + }, + { + "epoch": 7.495729971533144, + "grad_norm": 5.624009413386229, + "learning_rate": 8.018735611898304e-06, + "loss": 0.077, + "step": 18432 + }, + { + "epoch": 7.496136640910939, + "grad_norm": 0.6825588852296302, + "learning_rate": 8.017742518834454e-06, + "loss": 0.0085, + "step": 18433 + }, + { + "epoch": 7.496543310288735, + "grad_norm": 0.12577816070636835, + "learning_rate": 8.01674944611947e-06, + "loss": 0.0018, + "step": 18434 + }, + { + "epoch": 7.496949979666531, + "grad_norm": 5.625481718930507, + "learning_rate": 8.015756393763554e-06, + "loss": 0.0974, + "step": 18435 + }, + { + "epoch": 7.497356649044327, + "grad_norm": 0.09119672597307363, + "learning_rate": 8.014763361776896e-06, + "loss": 0.0016, + "step": 18436 + }, + { + "epoch": 7.497763318422122, + "grad_norm": 1.352409421564049, + "learning_rate": 8.013770350169693e-06, + "loss": 0.0156, + "step": 18437 + }, + { + "epoch": 7.498169987799919, + "grad_norm": 1.242901480381891, + "learning_rate": 8.012777358952134e-06, + "loss": 0.0271, + "step": 18438 + }, + { + "epoch": 7.498576657177715, + "grad_norm": 0.001518931303721541, + "learning_rate": 8.011784388134418e-06, + "loss": 0.0, + "step": 18439 + }, + { + "epoch": 7.498983326555511, + "grad_norm": 8.077717651817698, + "learning_rate": 8.010791437726735e-06, + "loss": 0.1439, + "step": 18440 + }, + { + "epoch": 7.499389995933306, + 
"grad_norm": 0.13034298955150744, + "learning_rate": 8.009798507739277e-06, + "loss": 0.0016, + "step": 18441 + }, + { + "epoch": 7.499796665311102, + "grad_norm": 0.6352397256610998, + "learning_rate": 8.008805598182244e-06, + "loss": 0.0099, + "step": 18442 + }, + { + "epoch": 7.500203334688898, + "grad_norm": 0.638617960254787, + "learning_rate": 8.007812709065821e-06, + "loss": 0.0111, + "step": 18443 + }, + { + "epoch": 7.500610004066694, + "grad_norm": 9.575053564225612, + "learning_rate": 8.006819840400201e-06, + "loss": 0.2446, + "step": 18444 + }, + { + "epoch": 7.501016673444489, + "grad_norm": 0.033604705499714504, + "learning_rate": 8.005826992195582e-06, + "loss": 0.0005, + "step": 18445 + }, + { + "epoch": 7.501423342822285, + "grad_norm": 5.050923767627432, + "learning_rate": 8.004834164462151e-06, + "loss": 0.0813, + "step": 18446 + }, + { + "epoch": 7.501830012200081, + "grad_norm": 0.8091201400767618, + "learning_rate": 8.003841357210103e-06, + "loss": 0.0083, + "step": 18447 + }, + { + "epoch": 7.502236681577877, + "grad_norm": 0.01483048409814835, + "learning_rate": 8.002848570449624e-06, + "loss": 0.0002, + "step": 18448 + }, + { + "epoch": 7.502643350955673, + "grad_norm": 3.598671062271381, + "learning_rate": 8.001855804190913e-06, + "loss": 0.1287, + "step": 18449 + }, + { + "epoch": 7.503050020333469, + "grad_norm": 3.8873618413379307, + "learning_rate": 8.000863058444157e-06, + "loss": 0.0342, + "step": 18450 + }, + { + "epoch": 7.503456689711265, + "grad_norm": 30.724124865100265, + "learning_rate": 7.999870333219547e-06, + "loss": 0.5878, + "step": 18451 + }, + { + "epoch": 7.503863359089061, + "grad_norm": 7.690503732415641, + "learning_rate": 7.998877628527276e-06, + "loss": 0.1223, + "step": 18452 + }, + { + "epoch": 7.504270028466856, + "grad_norm": 0.41901714532613266, + "learning_rate": 7.997884944377532e-06, + "loss": 0.0052, + "step": 18453 + }, + { + "epoch": 7.504676697844652, + "grad_norm": 2.16327922530095, + "learning_rate": 7.996892280780507e-06, + "loss": 0.0306, + "step": 18454 + }, + { + "epoch": 7.505083367222448, + "grad_norm": 7.243101365334758, + "learning_rate": 7.995899637746391e-06, + "loss": 0.2213, + "step": 18455 + }, + { + "epoch": 7.505490036600244, + "grad_norm": 0.017621664269820154, + "learning_rate": 7.994907015285376e-06, + "loss": 0.0003, + "step": 18456 + }, + { + "epoch": 7.5058967059780395, + "grad_norm": 0.24197018731554493, + "learning_rate": 7.993914413407647e-06, + "loss": 0.0029, + "step": 18457 + }, + { + "epoch": 7.506303375355836, + "grad_norm": 2.881325557942699, + "learning_rate": 7.992921832123402e-06, + "loss": 0.0458, + "step": 18458 + }, + { + "epoch": 7.506710044733632, + "grad_norm": 3.7724088652391963, + "learning_rate": 7.991929271442817e-06, + "loss": 0.0967, + "step": 18459 + }, + { + "epoch": 7.507116714111428, + "grad_norm": 1.35971777803555, + "learning_rate": 7.990936731376092e-06, + "loss": 0.019, + "step": 18460 + }, + { + "epoch": 7.507523383489223, + "grad_norm": 6.656766267016868, + "learning_rate": 7.989944211933413e-06, + "loss": 0.1066, + "step": 18461 + }, + { + "epoch": 7.507930052867019, + "grad_norm": 0.580802772438071, + "learning_rate": 7.988951713124965e-06, + "loss": 0.0067, + "step": 18462 + }, + { + "epoch": 7.508336722244815, + "grad_norm": 0.08687139344441432, + "learning_rate": 7.987959234960943e-06, + "loss": 0.0013, + "step": 18463 + }, + { + "epoch": 7.508743391622611, + "grad_norm": 0.41438300592819605, + "learning_rate": 7.986966777451532e-06, + "loss": 0.0063, + "step": 
18464 + }, + { + "epoch": 7.5091500610004065, + "grad_norm": 0.025055930562863815, + "learning_rate": 7.985974340606919e-06, + "loss": 0.0004, + "step": 18465 + }, + { + "epoch": 7.509556730378202, + "grad_norm": 15.666742555032048, + "learning_rate": 7.984981924437292e-06, + "loss": 0.3644, + "step": 18466 + }, + { + "epoch": 7.509963399755998, + "grad_norm": 0.029408055064046704, + "learning_rate": 7.983989528952843e-06, + "loss": 0.0005, + "step": 18467 + }, + { + "epoch": 7.510370069133794, + "grad_norm": 6.803586754405439, + "learning_rate": 7.982997154163754e-06, + "loss": 0.144, + "step": 18468 + }, + { + "epoch": 7.5107767385115904, + "grad_norm": 1.1227969441945778, + "learning_rate": 7.982004800080212e-06, + "loss": 0.0133, + "step": 18469 + }, + { + "epoch": 7.511183407889386, + "grad_norm": 5.231939925556206, + "learning_rate": 7.98101246671241e-06, + "loss": 0.1066, + "step": 18470 + }, + { + "epoch": 7.511590077267182, + "grad_norm": 4.026310409475874, + "learning_rate": 7.98002015407053e-06, + "loss": 0.0627, + "step": 18471 + }, + { + "epoch": 7.511996746644978, + "grad_norm": 0.15376612785669488, + "learning_rate": 7.979027862164759e-06, + "loss": 0.0012, + "step": 18472 + }, + { + "epoch": 7.5124034160227735, + "grad_norm": 1.6681806146898455, + "learning_rate": 7.978035591005284e-06, + "loss": 0.0153, + "step": 18473 + }, + { + "epoch": 7.512810085400569, + "grad_norm": 0.1427311182927548, + "learning_rate": 7.977043340602293e-06, + "loss": 0.0028, + "step": 18474 + }, + { + "epoch": 7.513216754778365, + "grad_norm": 0.6531661975167068, + "learning_rate": 7.976051110965967e-06, + "loss": 0.0047, + "step": 18475 + }, + { + "epoch": 7.513623424156161, + "grad_norm": 4.927602766328597, + "learning_rate": 7.9750589021065e-06, + "loss": 0.0841, + "step": 18476 + }, + { + "epoch": 7.514030093533957, + "grad_norm": 2.9625700506560166, + "learning_rate": 7.974066714034069e-06, + "loss": 0.0556, + "step": 18477 + }, + { + "epoch": 7.514436762911753, + "grad_norm": 7.432809520734471, + "learning_rate": 7.973074546758865e-06, + "loss": 0.1577, + "step": 18478 + }, + { + "epoch": 7.514843432289549, + "grad_norm": 3.5564722246026657, + "learning_rate": 7.972082400291068e-06, + "loss": 0.0961, + "step": 18479 + }, + { + "epoch": 7.515250101667345, + "grad_norm": 1.1794466183170658, + "learning_rate": 7.971090274640868e-06, + "loss": 0.019, + "step": 18480 + }, + { + "epoch": 7.5156567710451405, + "grad_norm": 1.4308704177763318, + "learning_rate": 7.970098169818447e-06, + "loss": 0.0202, + "step": 18481 + }, + { + "epoch": 7.516063440422936, + "grad_norm": 3.8798677526739094, + "learning_rate": 7.969106085833988e-06, + "loss": 0.0609, + "step": 18482 + }, + { + "epoch": 7.516470109800732, + "grad_norm": 3.230328055469608, + "learning_rate": 7.96811402269768e-06, + "loss": 0.0584, + "step": 18483 + }, + { + "epoch": 7.516876779178528, + "grad_norm": 1.739731175026536, + "learning_rate": 7.967121980419702e-06, + "loss": 0.0242, + "step": 18484 + }, + { + "epoch": 7.517283448556324, + "grad_norm": 0.13045647800374244, + "learning_rate": 7.966129959010241e-06, + "loss": 0.0013, + "step": 18485 + }, + { + "epoch": 7.517690117934119, + "grad_norm": 0.3064542130522866, + "learning_rate": 7.96513795847948e-06, + "loss": 0.003, + "step": 18486 + }, + { + "epoch": 7.518096787311915, + "grad_norm": 12.962251031140251, + "learning_rate": 7.964145978837603e-06, + "loss": 0.7221, + "step": 18487 + }, + { + "epoch": 7.518503456689711, + "grad_norm": 2.0566991714375087, + "learning_rate": 
7.96315402009479e-06, + "loss": 0.0289, + "step": 18488 + }, + { + "epoch": 7.518910126067507, + "grad_norm": 3.0136055379700193, + "learning_rate": 7.962162082261227e-06, + "loss": 0.0572, + "step": 18489 + }, + { + "epoch": 7.519316795445303, + "grad_norm": 0.016655877677010666, + "learning_rate": 7.961170165347097e-06, + "loss": 0.0002, + "step": 18490 + }, + { + "epoch": 7.519723464823099, + "grad_norm": 6.273529742074634, + "learning_rate": 7.96017826936258e-06, + "loss": 0.108, + "step": 18491 + }, + { + "epoch": 7.520130134200895, + "grad_norm": 0.005917598251845002, + "learning_rate": 7.959186394317857e-06, + "loss": 0.0, + "step": 18492 + }, + { + "epoch": 7.520536803578691, + "grad_norm": 0.030986768641714933, + "learning_rate": 7.958194540223116e-06, + "loss": 0.0006, + "step": 18493 + }, + { + "epoch": 7.520943472956486, + "grad_norm": 0.018016526037539914, + "learning_rate": 7.957202707088537e-06, + "loss": 0.0003, + "step": 18494 + }, + { + "epoch": 7.521350142334282, + "grad_norm": 0.25842378867983573, + "learning_rate": 7.956210894924296e-06, + "loss": 0.0023, + "step": 18495 + }, + { + "epoch": 7.521756811712078, + "grad_norm": 0.22977507900826566, + "learning_rate": 7.955219103740582e-06, + "loss": 0.0016, + "step": 18496 + }, + { + "epoch": 7.522163481089874, + "grad_norm": 1.3378860619086044, + "learning_rate": 7.954227333547575e-06, + "loss": 0.0209, + "step": 18497 + }, + { + "epoch": 7.522570150467669, + "grad_norm": 0.4295529385556449, + "learning_rate": 7.953235584355448e-06, + "loss": 0.0026, + "step": 18498 + }, + { + "epoch": 7.522976819845466, + "grad_norm": 1.1886858469341406, + "learning_rate": 7.952243856174394e-06, + "loss": 0.0073, + "step": 18499 + }, + { + "epoch": 7.523383489223262, + "grad_norm": 5.113598202076359, + "learning_rate": 7.951252149014583e-06, + "loss": 0.1068, + "step": 18500 + }, + { + "epoch": 7.523790158601058, + "grad_norm": 1.8805504745892723, + "learning_rate": 7.950260462886204e-06, + "loss": 0.0286, + "step": 18501 + }, + { + "epoch": 7.524196827978853, + "grad_norm": 9.070976439201976, + "learning_rate": 7.949268797799428e-06, + "loss": 0.3068, + "step": 18502 + }, + { + "epoch": 7.524603497356649, + "grad_norm": 19.10391618390403, + "learning_rate": 7.948277153764442e-06, + "loss": 0.7923, + "step": 18503 + }, + { + "epoch": 7.525010166734445, + "grad_norm": 0.14613849887344893, + "learning_rate": 7.947285530791426e-06, + "loss": 0.0013, + "step": 18504 + }, + { + "epoch": 7.525416836112241, + "grad_norm": 0.09687121321421507, + "learning_rate": 7.946293928890553e-06, + "loss": 0.0006, + "step": 18505 + }, + { + "epoch": 7.525823505490036, + "grad_norm": 3.2238333174450955, + "learning_rate": 7.94530234807201e-06, + "loss": 0.0802, + "step": 18506 + }, + { + "epoch": 7.526230174867832, + "grad_norm": 3.8629672587447326, + "learning_rate": 7.94431078834597e-06, + "loss": 0.063, + "step": 18507 + }, + { + "epoch": 7.526636844245628, + "grad_norm": 11.492185443538272, + "learning_rate": 7.943319249722614e-06, + "loss": 0.3616, + "step": 18508 + }, + { + "epoch": 7.527043513623424, + "grad_norm": 0.04124454543946411, + "learning_rate": 7.942327732212125e-06, + "loss": 0.0005, + "step": 18509 + }, + { + "epoch": 7.52745018300122, + "grad_norm": 17.886117583513222, + "learning_rate": 7.941336235824674e-06, + "loss": 0.7785, + "step": 18510 + }, + { + "epoch": 7.527856852379016, + "grad_norm": 11.257316865672495, + "learning_rate": 7.940344760570443e-06, + "loss": 0.1713, + "step": 18511 + }, + { + "epoch": 7.528263521756812, + 
"grad_norm": 0.6982935646364673, + "learning_rate": 7.939353306459609e-06, + "loss": 0.013, + "step": 18512 + }, + { + "epoch": 7.528670191134608, + "grad_norm": 0.2907609288345235, + "learning_rate": 7.938361873502353e-06, + "loss": 0.0041, + "step": 18513 + }, + { + "epoch": 7.529076860512403, + "grad_norm": 2.5119264671668815, + "learning_rate": 7.937370461708848e-06, + "loss": 0.0386, + "step": 18514 + }, + { + "epoch": 7.529483529890199, + "grad_norm": 1.0906905318875166, + "learning_rate": 7.936379071089273e-06, + "loss": 0.0152, + "step": 18515 + }, + { + "epoch": 7.529890199267995, + "grad_norm": 0.10741359608264366, + "learning_rate": 7.935387701653807e-06, + "loss": 0.0021, + "step": 18516 + }, + { + "epoch": 7.530296868645791, + "grad_norm": 3.340015048380211, + "learning_rate": 7.934396353412626e-06, + "loss": 0.0528, + "step": 18517 + }, + { + "epoch": 7.5307035380235865, + "grad_norm": 0.009941673726046396, + "learning_rate": 7.933405026375903e-06, + "loss": 0.0001, + "step": 18518 + }, + { + "epoch": 7.531110207401383, + "grad_norm": 3.959853319711881, + "learning_rate": 7.93241372055382e-06, + "loss": 0.0306, + "step": 18519 + }, + { + "epoch": 7.531516876779179, + "grad_norm": 0.04046115291024003, + "learning_rate": 7.93142243595655e-06, + "loss": 0.0006, + "step": 18520 + }, + { + "epoch": 7.531923546156975, + "grad_norm": 9.304656692505128, + "learning_rate": 7.930431172594268e-06, + "loss": 0.1277, + "step": 18521 + }, + { + "epoch": 7.5323302155347704, + "grad_norm": 2.5208749681064346, + "learning_rate": 7.929439930477154e-06, + "loss": 0.0412, + "step": 18522 + }, + { + "epoch": 7.532736884912566, + "grad_norm": 11.190249423209574, + "learning_rate": 7.92844870961538e-06, + "loss": 0.393, + "step": 18523 + }, + { + "epoch": 7.533143554290362, + "grad_norm": 2.6541387478952574, + "learning_rate": 7.927457510019123e-06, + "loss": 0.0373, + "step": 18524 + }, + { + "epoch": 7.533550223668158, + "grad_norm": 2.3730008900370514, + "learning_rate": 7.926466331698558e-06, + "loss": 0.0476, + "step": 18525 + }, + { + "epoch": 7.5339568930459535, + "grad_norm": 0.5585045199040823, + "learning_rate": 7.92547517466386e-06, + "loss": 0.0062, + "step": 18526 + }, + { + "epoch": 7.534363562423749, + "grad_norm": 0.012592617192743506, + "learning_rate": 7.924484038925204e-06, + "loss": 0.0001, + "step": 18527 + }, + { + "epoch": 7.534770231801545, + "grad_norm": 3.6895808748843617, + "learning_rate": 7.923492924492761e-06, + "loss": 0.0218, + "step": 18528 + }, + { + "epoch": 7.535176901179341, + "grad_norm": 3.685963123005273, + "learning_rate": 7.92250183137671e-06, + "loss": 0.0337, + "step": 18529 + }, + { + "epoch": 7.535583570557137, + "grad_norm": 4.186298486780518, + "learning_rate": 7.921510759587223e-06, + "loss": 0.0502, + "step": 18530 + }, + { + "epoch": 7.535990239934933, + "grad_norm": 0.007594514073571938, + "learning_rate": 7.920519709134474e-06, + "loss": 0.0001, + "step": 18531 + }, + { + "epoch": 7.536396909312729, + "grad_norm": 1.9128828262868807, + "learning_rate": 7.919528680028639e-06, + "loss": 0.0235, + "step": 18532 + }, + { + "epoch": 7.536803578690525, + "grad_norm": 0.014647506583432574, + "learning_rate": 7.918537672279888e-06, + "loss": 0.0002, + "step": 18533 + }, + { + "epoch": 7.5372102480683205, + "grad_norm": 0.0612497635600369, + "learning_rate": 7.917546685898393e-06, + "loss": 0.0009, + "step": 18534 + }, + { + "epoch": 7.537616917446116, + "grad_norm": 0.7981973625918605, + "learning_rate": 7.91655572089433e-06, + "loss": 0.0041, + "step": 
18535 + }, + { + "epoch": 7.538023586823912, + "grad_norm": 0.1763675306776989, + "learning_rate": 7.915564777277874e-06, + "loss": 0.002, + "step": 18536 + }, + { + "epoch": 7.538430256201708, + "grad_norm": 0.1807851998776696, + "learning_rate": 7.91457385505919e-06, + "loss": 0.0023, + "step": 18537 + }, + { + "epoch": 7.538836925579504, + "grad_norm": 4.347130055451287, + "learning_rate": 7.91358295424846e-06, + "loss": 0.1812, + "step": 18538 + }, + { + "epoch": 7.539243594957299, + "grad_norm": 0.04211576263147828, + "learning_rate": 7.91259207485585e-06, + "loss": 0.0005, + "step": 18539 + }, + { + "epoch": 7.539650264335096, + "grad_norm": 3.230245203802059, + "learning_rate": 7.911601216891531e-06, + "loss": 0.0631, + "step": 18540 + }, + { + "epoch": 7.540056933712892, + "grad_norm": 12.125415288557772, + "learning_rate": 7.910610380365677e-06, + "loss": 0.1387, + "step": 18541 + }, + { + "epoch": 7.5404636030906875, + "grad_norm": 1.2040710420273748, + "learning_rate": 7.90961956528846e-06, + "loss": 0.0161, + "step": 18542 + }, + { + "epoch": 7.540870272468483, + "grad_norm": 2.3072463543581567, + "learning_rate": 7.90862877167005e-06, + "loss": 0.0478, + "step": 18543 + }, + { + "epoch": 7.541276941846279, + "grad_norm": 13.14728699204762, + "learning_rate": 7.907637999520616e-06, + "loss": 0.2556, + "step": 18544 + }, + { + "epoch": 7.541683611224075, + "grad_norm": 2.8188206325329577, + "learning_rate": 7.906647248850333e-06, + "loss": 0.0479, + "step": 18545 + }, + { + "epoch": 7.542090280601871, + "grad_norm": 1.605724745353298, + "learning_rate": 7.90565651966937e-06, + "loss": 0.0169, + "step": 18546 + }, + { + "epoch": 7.542496949979666, + "grad_norm": 0.05244919264402394, + "learning_rate": 7.904665811987894e-06, + "loss": 0.0007, + "step": 18547 + }, + { + "epoch": 7.542903619357462, + "grad_norm": 1.8980927932415834, + "learning_rate": 7.903675125816082e-06, + "loss": 0.0248, + "step": 18548 + }, + { + "epoch": 7.543310288735258, + "grad_norm": 2.560593200438392, + "learning_rate": 7.902684461164099e-06, + "loss": 0.0522, + "step": 18549 + }, + { + "epoch": 7.543716958113054, + "grad_norm": 2.006874546726116, + "learning_rate": 7.901693818042113e-06, + "loss": 0.0299, + "step": 18550 + }, + { + "epoch": 7.54412362749085, + "grad_norm": 4.301027187269733, + "learning_rate": 7.900703196460299e-06, + "loss": 0.0907, + "step": 18551 + }, + { + "epoch": 7.544530296868646, + "grad_norm": 8.96742649003341, + "learning_rate": 7.899712596428822e-06, + "loss": 0.0823, + "step": 18552 + }, + { + "epoch": 7.544936966246442, + "grad_norm": 0.49435352447487896, + "learning_rate": 7.898722017957852e-06, + "loss": 0.0058, + "step": 18553 + }, + { + "epoch": 7.545343635624238, + "grad_norm": 1.0345896069681106, + "learning_rate": 7.897731461057557e-06, + "loss": 0.0131, + "step": 18554 + }, + { + "epoch": 7.545750305002033, + "grad_norm": 0.9763212597425439, + "learning_rate": 7.896740925738108e-06, + "loss": 0.0106, + "step": 18555 + }, + { + "epoch": 7.546156974379829, + "grad_norm": 0.0026568298181370376, + "learning_rate": 7.895750412009673e-06, + "loss": 0.0, + "step": 18556 + }, + { + "epoch": 7.546563643757625, + "grad_norm": 0.005828332041373163, + "learning_rate": 7.894759919882416e-06, + "loss": 0.0001, + "step": 18557 + }, + { + "epoch": 7.546970313135421, + "grad_norm": 0.06384709335308912, + "learning_rate": 7.893769449366516e-06, + "loss": 0.0006, + "step": 18558 + }, + { + "epoch": 7.547376982513216, + "grad_norm": 0.5918062405792589, + "learning_rate": 
7.892779000472125e-06, + "loss": 0.0083, + "step": 18559 + }, + { + "epoch": 7.547783651891013, + "grad_norm": 5.6834856057447585, + "learning_rate": 7.89178857320942e-06, + "loss": 0.0759, + "step": 18560 + }, + { + "epoch": 7.548190321268809, + "grad_norm": 4.429375637014598, + "learning_rate": 7.890798167588567e-06, + "loss": 0.132, + "step": 18561 + }, + { + "epoch": 7.548596990646605, + "grad_norm": 0.4137806169158514, + "learning_rate": 7.88980778361973e-06, + "loss": 0.0028, + "step": 18562 + }, + { + "epoch": 7.5490036600244, + "grad_norm": 3.388247674175792, + "learning_rate": 7.88881742131308e-06, + "loss": 0.0401, + "step": 18563 + }, + { + "epoch": 7.549410329402196, + "grad_norm": 2.6757080241698548, + "learning_rate": 7.88782708067878e-06, + "loss": 0.0325, + "step": 18564 + }, + { + "epoch": 7.549816998779992, + "grad_norm": 0.6405267383243267, + "learning_rate": 7.886836761727e-06, + "loss": 0.0098, + "step": 18565 + }, + { + "epoch": 7.550223668157788, + "grad_norm": 1.9110967173893274, + "learning_rate": 7.885846464467902e-06, + "loss": 0.0286, + "step": 18566 + }, + { + "epoch": 7.550630337535583, + "grad_norm": 5.11484744670295, + "learning_rate": 7.884856188911655e-06, + "loss": 0.0682, + "step": 18567 + }, + { + "epoch": 7.551037006913379, + "grad_norm": 4.389502316038285, + "learning_rate": 7.883865935068427e-06, + "loss": 0.0614, + "step": 18568 + }, + { + "epoch": 7.551443676291175, + "grad_norm": 0.4447129973383355, + "learning_rate": 7.882875702948374e-06, + "loss": 0.006, + "step": 18569 + }, + { + "epoch": 7.551850345668971, + "grad_norm": 1.4590315298103596, + "learning_rate": 7.881885492561672e-06, + "loss": 0.0278, + "step": 18570 + }, + { + "epoch": 7.5522570150467665, + "grad_norm": 8.87919803010732, + "learning_rate": 7.880895303918482e-06, + "loss": 0.1404, + "step": 18571 + }, + { + "epoch": 7.552663684424563, + "grad_norm": 5.344317352246713, + "learning_rate": 7.879905137028964e-06, + "loss": 0.1344, + "step": 18572 + }, + { + "epoch": 7.553070353802359, + "grad_norm": 0.37413521343338363, + "learning_rate": 7.878914991903291e-06, + "loss": 0.0053, + "step": 18573 + }, + { + "epoch": 7.553477023180155, + "grad_norm": 8.474069506489005, + "learning_rate": 7.877924868551621e-06, + "loss": 0.1262, + "step": 18574 + }, + { + "epoch": 7.5538836925579504, + "grad_norm": 0.0389738950621175, + "learning_rate": 7.87693476698412e-06, + "loss": 0.0004, + "step": 18575 + }, + { + "epoch": 7.554290361935746, + "grad_norm": 3.558409542148281, + "learning_rate": 7.875944687210953e-06, + "loss": 0.0936, + "step": 18576 + }, + { + "epoch": 7.554697031313542, + "grad_norm": 0.9376858613308251, + "learning_rate": 7.874954629242283e-06, + "loss": 0.0136, + "step": 18577 + }, + { + "epoch": 7.555103700691338, + "grad_norm": 0.006930539793306256, + "learning_rate": 7.873964593088271e-06, + "loss": 0.0001, + "step": 18578 + }, + { + "epoch": 7.5555103700691335, + "grad_norm": 14.403479071726368, + "learning_rate": 7.872974578759084e-06, + "loss": 0.1973, + "step": 18579 + }, + { + "epoch": 7.555917039446929, + "grad_norm": 0.16277363992535454, + "learning_rate": 7.871984586264886e-06, + "loss": 0.0019, + "step": 18580 + }, + { + "epoch": 7.556323708824726, + "grad_norm": 0.024900830582185514, + "learning_rate": 7.870994615615836e-06, + "loss": 0.0004, + "step": 18581 + }, + { + "epoch": 7.556730378202522, + "grad_norm": 0.9616286720642327, + "learning_rate": 7.870004666822094e-06, + "loss": 0.0118, + "step": 18582 + }, + { + "epoch": 7.5571370475803175, + "grad_norm": 
7.012849310552075, + "learning_rate": 7.86901473989383e-06, + "loss": 0.1549, + "step": 18583 + }, + { + "epoch": 7.557543716958113, + "grad_norm": 1.2146106086884672, + "learning_rate": 7.868024834841201e-06, + "loss": 0.0147, + "step": 18584 + }, + { + "epoch": 7.557950386335909, + "grad_norm": 12.47838117425851, + "learning_rate": 7.86703495167437e-06, + "loss": 0.4683, + "step": 18585 + }, + { + "epoch": 7.558357055713705, + "grad_norm": 1.7166919526330968, + "learning_rate": 7.8660450904035e-06, + "loss": 0.0242, + "step": 18586 + }, + { + "epoch": 7.5587637250915005, + "grad_norm": 1.7539741965186688, + "learning_rate": 7.865055251038748e-06, + "loss": 0.0234, + "step": 18587 + }, + { + "epoch": 7.559170394469296, + "grad_norm": 5.827277407058847, + "learning_rate": 7.86406543359028e-06, + "loss": 0.1455, + "step": 18588 + }, + { + "epoch": 7.559577063847092, + "grad_norm": 0.24042304202167317, + "learning_rate": 7.863075638068257e-06, + "loss": 0.002, + "step": 18589 + }, + { + "epoch": 7.559983733224888, + "grad_norm": 1.5440700343663634, + "learning_rate": 7.862085864482837e-06, + "loss": 0.0319, + "step": 18590 + }, + { + "epoch": 7.560390402602684, + "grad_norm": 10.502328747130852, + "learning_rate": 7.86109611284418e-06, + "loss": 0.8858, + "step": 18591 + }, + { + "epoch": 7.56079707198048, + "grad_norm": 5.8954371889553, + "learning_rate": 7.86010638316245e-06, + "loss": 0.0909, + "step": 18592 + }, + { + "epoch": 7.561203741358276, + "grad_norm": 0.3350020652753407, + "learning_rate": 7.859116675447804e-06, + "loss": 0.0061, + "step": 18593 + }, + { + "epoch": 7.561610410736072, + "grad_norm": 0.07507759347910642, + "learning_rate": 7.858126989710404e-06, + "loss": 0.0008, + "step": 18594 + }, + { + "epoch": 7.5620170801138675, + "grad_norm": 3.631294159048265, + "learning_rate": 7.857137325960405e-06, + "loss": 0.1083, + "step": 18595 + }, + { + "epoch": 7.562423749491663, + "grad_norm": 7.7706532697234145, + "learning_rate": 7.856147684207973e-06, + "loss": 0.1045, + "step": 18596 + }, + { + "epoch": 7.562830418869459, + "grad_norm": 0.017943837088140218, + "learning_rate": 7.855158064463264e-06, + "loss": 0.0003, + "step": 18597 + }, + { + "epoch": 7.563237088247255, + "grad_norm": 15.101642213246926, + "learning_rate": 7.854168466736434e-06, + "loss": 0.6114, + "step": 18598 + }, + { + "epoch": 7.563643757625051, + "grad_norm": 0.00687856749634469, + "learning_rate": 7.853178891037648e-06, + "loss": 0.0001, + "step": 18599 + }, + { + "epoch": 7.564050427002846, + "grad_norm": 0.556689270747618, + "learning_rate": 7.85218933737706e-06, + "loss": 0.0111, + "step": 18600 + }, + { + "epoch": 7.564457096380643, + "grad_norm": 1.081062210258423, + "learning_rate": 7.851199805764829e-06, + "loss": 0.0152, + "step": 18601 + }, + { + "epoch": 7.564863765758439, + "grad_norm": 0.08085623072782959, + "learning_rate": 7.850210296211115e-06, + "loss": 0.0014, + "step": 18602 + }, + { + "epoch": 7.5652704351362345, + "grad_norm": 0.48847081760976796, + "learning_rate": 7.849220808726073e-06, + "loss": 0.0065, + "step": 18603 + }, + { + "epoch": 7.56567710451403, + "grad_norm": 1.5488133637228758, + "learning_rate": 7.848231343319864e-06, + "loss": 0.0237, + "step": 18604 + }, + { + "epoch": 7.566083773891826, + "grad_norm": 0.08284772159536653, + "learning_rate": 7.84724190000264e-06, + "loss": 0.001, + "step": 18605 + }, + { + "epoch": 7.566490443269622, + "grad_norm": 9.863186528440334, + "learning_rate": 7.846252478784562e-06, + "loss": 0.1584, + "step": 18606 + }, + { + "epoch": 
7.566897112647418, + "grad_norm": 7.593746374356519, + "learning_rate": 7.845263079675786e-06, + "loss": 0.2873, + "step": 18607 + }, + { + "epoch": 7.567303782025213, + "grad_norm": 3.883194502008471, + "learning_rate": 7.84427370268647e-06, + "loss": 0.0885, + "step": 18608 + }, + { + "epoch": 7.567710451403009, + "grad_norm": 1.2052660722626116, + "learning_rate": 7.84328434782677e-06, + "loss": 0.0242, + "step": 18609 + }, + { + "epoch": 7.568117120780805, + "grad_norm": 4.771233821529437, + "learning_rate": 7.842295015106842e-06, + "loss": 0.0547, + "step": 18610 + }, + { + "epoch": 7.568523790158601, + "grad_norm": 7.09982873349495, + "learning_rate": 7.841305704536838e-06, + "loss": 0.1382, + "step": 18611 + }, + { + "epoch": 7.568930459536396, + "grad_norm": 0.16859434391969022, + "learning_rate": 7.840316416126922e-06, + "loss": 0.0028, + "step": 18612 + }, + { + "epoch": 7.569337128914193, + "grad_norm": 0.48923731937528625, + "learning_rate": 7.839327149887242e-06, + "loss": 0.0071, + "step": 18613 + }, + { + "epoch": 7.569743798291989, + "grad_norm": 6.714577286500119, + "learning_rate": 7.838337905827956e-06, + "loss": 0.1255, + "step": 18614 + }, + { + "epoch": 7.570150467669785, + "grad_norm": 4.478017840896854, + "learning_rate": 7.83734868395922e-06, + "loss": 0.1361, + "step": 18615 + }, + { + "epoch": 7.57055713704758, + "grad_norm": 0.3994497352614691, + "learning_rate": 7.836359484291189e-06, + "loss": 0.0071, + "step": 18616 + }, + { + "epoch": 7.570963806425376, + "grad_norm": 9.856994463938195, + "learning_rate": 7.835370306834017e-06, + "loss": 0.1785, + "step": 18617 + }, + { + "epoch": 7.571370475803172, + "grad_norm": 5.599258633736736, + "learning_rate": 7.834381151597856e-06, + "loss": 0.0837, + "step": 18618 + }, + { + "epoch": 7.571777145180968, + "grad_norm": 7.2817710628115675, + "learning_rate": 7.833392018592865e-06, + "loss": 0.1581, + "step": 18619 + }, + { + "epoch": 7.572183814558763, + "grad_norm": 6.775707012881739, + "learning_rate": 7.832402907829195e-06, + "loss": 0.1386, + "step": 18620 + }, + { + "epoch": 7.572590483936559, + "grad_norm": 14.901195278634846, + "learning_rate": 7.831413819316997e-06, + "loss": 0.5942, + "step": 18621 + }, + { + "epoch": 7.572997153314356, + "grad_norm": 13.417198897574925, + "learning_rate": 7.830424753066432e-06, + "loss": 0.4788, + "step": 18622 + }, + { + "epoch": 7.573403822692152, + "grad_norm": 0.05898268287008237, + "learning_rate": 7.829435709087647e-06, + "loss": 0.0007, + "step": 18623 + }, + { + "epoch": 7.573810492069947, + "grad_norm": 0.16735304526460784, + "learning_rate": 7.828446687390796e-06, + "loss": 0.0028, + "step": 18624 + }, + { + "epoch": 7.574217161447743, + "grad_norm": 0.0748572315787295, + "learning_rate": 7.827457687986033e-06, + "loss": 0.0011, + "step": 18625 + }, + { + "epoch": 7.574623830825539, + "grad_norm": 0.45440782519006206, + "learning_rate": 7.826468710883512e-06, + "loss": 0.0049, + "step": 18626 + }, + { + "epoch": 7.575030500203335, + "grad_norm": 0.014361526354000561, + "learning_rate": 7.82547975609338e-06, + "loss": 0.0002, + "step": 18627 + }, + { + "epoch": 7.5754371695811304, + "grad_norm": 0.44708182391805856, + "learning_rate": 7.824490823625797e-06, + "loss": 0.0098, + "step": 18628 + }, + { + "epoch": 7.575843838958926, + "grad_norm": 0.2941940923330943, + "learning_rate": 7.823501913490908e-06, + "loss": 0.0027, + "step": 18629 + }, + { + "epoch": 7.576250508336722, + "grad_norm": 0.031216906148060324, + "learning_rate": 7.822513025698868e-06, + "loss": 
0.0005, + "step": 18630 + }, + { + "epoch": 7.576657177714518, + "grad_norm": 0.35289557476672023, + "learning_rate": 7.821524160259826e-06, + "loss": 0.002, + "step": 18631 + }, + { + "epoch": 7.5770638470923135, + "grad_norm": 1.758162091082678, + "learning_rate": 7.820535317183938e-06, + "loss": 0.0277, + "step": 18632 + }, + { + "epoch": 7.57747051647011, + "grad_norm": 0.019869198252345073, + "learning_rate": 7.81954649648135e-06, + "loss": 0.0003, + "step": 18633 + }, + { + "epoch": 7.577877185847906, + "grad_norm": 9.32163302694653, + "learning_rate": 7.818557698162215e-06, + "loss": 0.2876, + "step": 18634 + }, + { + "epoch": 7.578283855225702, + "grad_norm": 1.5248444222801456, + "learning_rate": 7.817568922236683e-06, + "loss": 0.0213, + "step": 18635 + }, + { + "epoch": 7.5786905246034975, + "grad_norm": 0.3013141873725647, + "learning_rate": 7.816580168714905e-06, + "loss": 0.0036, + "step": 18636 + }, + { + "epoch": 7.579097193981293, + "grad_norm": 0.1789514879967422, + "learning_rate": 7.815591437607028e-06, + "loss": 0.0021, + "step": 18637 + }, + { + "epoch": 7.579503863359089, + "grad_norm": 9.784694791332834, + "learning_rate": 7.814602728923207e-06, + "loss": 0.1625, + "step": 18638 + }, + { + "epoch": 7.579910532736885, + "grad_norm": 12.122222286959792, + "learning_rate": 7.81361404267359e-06, + "loss": 0.4835, + "step": 18639 + }, + { + "epoch": 7.5803172021146805, + "grad_norm": 5.409104000370563, + "learning_rate": 7.812625378868322e-06, + "loss": 0.0855, + "step": 18640 + }, + { + "epoch": 7.580723871492476, + "grad_norm": 7.550381587227013, + "learning_rate": 7.811636737517555e-06, + "loss": 0.0659, + "step": 18641 + }, + { + "epoch": 7.581130540870273, + "grad_norm": 1.9276600858298352, + "learning_rate": 7.810648118631442e-06, + "loss": 0.0262, + "step": 18642 + }, + { + "epoch": 7.581537210248069, + "grad_norm": 0.043343519761339495, + "learning_rate": 7.809659522220127e-06, + "loss": 0.0007, + "step": 18643 + }, + { + "epoch": 7.5819438796258645, + "grad_norm": 9.365301957249558, + "learning_rate": 7.808670948293756e-06, + "loss": 0.2297, + "step": 18644 + }, + { + "epoch": 7.58235054900366, + "grad_norm": 1.1130642715299772, + "learning_rate": 7.807682396862484e-06, + "loss": 0.0172, + "step": 18645 + }, + { + "epoch": 7.582757218381456, + "grad_norm": 6.9702594975445775, + "learning_rate": 7.806693867936455e-06, + "loss": 0.14, + "step": 18646 + }, + { + "epoch": 7.583163887759252, + "grad_norm": 0.1545560937647774, + "learning_rate": 7.805705361525814e-06, + "loss": 0.0022, + "step": 18647 + }, + { + "epoch": 7.5835705571370475, + "grad_norm": 0.21420143935453165, + "learning_rate": 7.804716877640717e-06, + "loss": 0.0031, + "step": 18648 + }, + { + "epoch": 7.583977226514843, + "grad_norm": 0.2802827925991985, + "learning_rate": 7.803728416291302e-06, + "loss": 0.0034, + "step": 18649 + }, + { + "epoch": 7.584383895892639, + "grad_norm": 0.01748312220305115, + "learning_rate": 7.802739977487721e-06, + "loss": 0.0003, + "step": 18650 + }, + { + "epoch": 7.584790565270435, + "grad_norm": 1.1147147326239475, + "learning_rate": 7.801751561240122e-06, + "loss": 0.0174, + "step": 18651 + }, + { + "epoch": 7.585197234648231, + "grad_norm": 6.819228943497961, + "learning_rate": 7.800763167558649e-06, + "loss": 0.165, + "step": 18652 + }, + { + "epoch": 7.585603904026026, + "grad_norm": 10.671193075943137, + "learning_rate": 7.799774796453447e-06, + "loss": 0.1759, + "step": 18653 + }, + { + "epoch": 7.586010573403823, + "grad_norm": 2.3172212124380107, + 
"learning_rate": 7.798786447934667e-06, + "loss": 0.0443, + "step": 18654 + }, + { + "epoch": 7.586417242781619, + "grad_norm": 9.248368788758945, + "learning_rate": 7.79779812201245e-06, + "loss": 0.2068, + "step": 18655 + }, + { + "epoch": 7.5868239121594145, + "grad_norm": 4.148101328642261, + "learning_rate": 7.796809818696945e-06, + "loss": 0.1249, + "step": 18656 + }, + { + "epoch": 7.58723058153721, + "grad_norm": 5.28896853495203, + "learning_rate": 7.795821537998297e-06, + "loss": 0.1805, + "step": 18657 + }, + { + "epoch": 7.587637250915006, + "grad_norm": 0.05705148339904219, + "learning_rate": 7.79483327992665e-06, + "loss": 0.0009, + "step": 18658 + }, + { + "epoch": 7.588043920292802, + "grad_norm": 6.158520984955114, + "learning_rate": 7.793845044492145e-06, + "loss": 0.2469, + "step": 18659 + }, + { + "epoch": 7.588450589670598, + "grad_norm": 12.533492374782973, + "learning_rate": 7.792856831704935e-06, + "loss": 0.4608, + "step": 18660 + }, + { + "epoch": 7.588857259048393, + "grad_norm": 5.06532541766718, + "learning_rate": 7.79186864157516e-06, + "loss": 0.1953, + "step": 18661 + }, + { + "epoch": 7.589263928426189, + "grad_norm": 1.4742246053187216, + "learning_rate": 7.790880474112962e-06, + "loss": 0.021, + "step": 18662 + }, + { + "epoch": 7.589670597803986, + "grad_norm": 4.419145268716246, + "learning_rate": 7.78989232932849e-06, + "loss": 0.0783, + "step": 18663 + }, + { + "epoch": 7.5900772671817816, + "grad_norm": 18.53988572490014, + "learning_rate": 7.788904207231886e-06, + "loss": 0.502, + "step": 18664 + }, + { + "epoch": 7.590483936559577, + "grad_norm": 1.8174914341870394, + "learning_rate": 7.78791610783329e-06, + "loss": 0.024, + "step": 18665 + }, + { + "epoch": 7.590890605937373, + "grad_norm": 3.1462299121489927, + "learning_rate": 7.78692803114285e-06, + "loss": 0.0427, + "step": 18666 + }, + { + "epoch": 7.591297275315169, + "grad_norm": 0.8082355589937578, + "learning_rate": 7.78593997717071e-06, + "loss": 0.0092, + "step": 18667 + }, + { + "epoch": 7.591703944692965, + "grad_norm": 9.18358496288689, + "learning_rate": 7.784951945927007e-06, + "loss": 0.2315, + "step": 18668 + }, + { + "epoch": 7.59211061407076, + "grad_norm": 1.695537371762822, + "learning_rate": 7.783963937421889e-06, + "loss": 0.0269, + "step": 18669 + }, + { + "epoch": 7.592517283448556, + "grad_norm": 8.902355942293047, + "learning_rate": 7.782975951665497e-06, + "loss": 0.1875, + "step": 18670 + }, + { + "epoch": 7.592923952826352, + "grad_norm": 5.360773768713995, + "learning_rate": 7.781987988667973e-06, + "loss": 0.2106, + "step": 18671 + }, + { + "epoch": 7.593330622204148, + "grad_norm": 5.757667964039279, + "learning_rate": 7.781000048439455e-06, + "loss": 0.083, + "step": 18672 + }, + { + "epoch": 7.593737291581943, + "grad_norm": 0.05616972980247664, + "learning_rate": 7.780012130990092e-06, + "loss": 0.0006, + "step": 18673 + }, + { + "epoch": 7.59414396095974, + "grad_norm": 0.14689787319757946, + "learning_rate": 7.779024236330022e-06, + "loss": 0.0015, + "step": 18674 + }, + { + "epoch": 7.594550630337536, + "grad_norm": 10.145925541524177, + "learning_rate": 7.778036364469383e-06, + "loss": 0.2327, + "step": 18675 + }, + { + "epoch": 7.594957299715332, + "grad_norm": 0.011242027796026293, + "learning_rate": 7.77704851541832e-06, + "loss": 0.0002, + "step": 18676 + }, + { + "epoch": 7.595363969093127, + "grad_norm": 1.9532057911733287, + "learning_rate": 7.776060689186974e-06, + "loss": 0.02, + "step": 18677 + }, + { + "epoch": 7.595770638470923, + "grad_norm": 
6.885880973779534, + "learning_rate": 7.775072885785482e-06, + "loss": 0.1463, + "step": 18678 + }, + { + "epoch": 7.596177307848719, + "grad_norm": 4.316906279517823, + "learning_rate": 7.774085105223989e-06, + "loss": 0.0785, + "step": 18679 + }, + { + "epoch": 7.596583977226515, + "grad_norm": 3.2051705193670355, + "learning_rate": 7.773097347512633e-06, + "loss": 0.0395, + "step": 18680 + }, + { + "epoch": 7.5969906466043104, + "grad_norm": 0.03157232032832293, + "learning_rate": 7.77210961266155e-06, + "loss": 0.0004, + "step": 18681 + }, + { + "epoch": 7.597397315982106, + "grad_norm": 3.2149400914256923, + "learning_rate": 7.771121900680887e-06, + "loss": 0.1201, + "step": 18682 + }, + { + "epoch": 7.597803985359903, + "grad_norm": 0.4220264828348391, + "learning_rate": 7.770134211580778e-06, + "loss": 0.0069, + "step": 18683 + }, + { + "epoch": 7.598210654737699, + "grad_norm": 2.794501350880489, + "learning_rate": 7.769146545371364e-06, + "loss": 0.0413, + "step": 18684 + }, + { + "epoch": 7.598617324115494, + "grad_norm": 1.3359889794701159, + "learning_rate": 7.768158902062782e-06, + "loss": 0.0227, + "step": 18685 + }, + { + "epoch": 7.59902399349329, + "grad_norm": 0.04224369382978356, + "learning_rate": 7.767171281665174e-06, + "loss": 0.0004, + "step": 18686 + }, + { + "epoch": 7.599430662871086, + "grad_norm": 3.3909883044250413, + "learning_rate": 7.766183684188677e-06, + "loss": 0.0539, + "step": 18687 + }, + { + "epoch": 7.599837332248882, + "grad_norm": 0.5699121780741825, + "learning_rate": 7.765196109643425e-06, + "loss": 0.0096, + "step": 18688 + }, + { + "epoch": 7.6002440016266775, + "grad_norm": 2.9348429239058462, + "learning_rate": 7.764208558039562e-06, + "loss": 0.0649, + "step": 18689 + }, + { + "epoch": 7.600650671004473, + "grad_norm": 8.81184106926717, + "learning_rate": 7.763221029387225e-06, + "loss": 0.2515, + "step": 18690 + }, + { + "epoch": 7.601057340382269, + "grad_norm": 6.663620131647441, + "learning_rate": 7.762233523696548e-06, + "loss": 0.2398, + "step": 18691 + }, + { + "epoch": 7.601464009760065, + "grad_norm": 2.3044093718852907, + "learning_rate": 7.76124604097767e-06, + "loss": 0.026, + "step": 18692 + }, + { + "epoch": 7.6018706791378605, + "grad_norm": 3.7179944790831425, + "learning_rate": 7.76025858124073e-06, + "loss": 0.0607, + "step": 18693 + }, + { + "epoch": 7.602277348515656, + "grad_norm": 0.5921164803731995, + "learning_rate": 7.75927114449586e-06, + "loss": 0.0072, + "step": 18694 + }, + { + "epoch": 7.602684017893453, + "grad_norm": 11.05247318547695, + "learning_rate": 7.758283730753203e-06, + "loss": 0.3147, + "step": 18695 + }, + { + "epoch": 7.603090687271249, + "grad_norm": 1.3854038952669618, + "learning_rate": 7.757296340022888e-06, + "loss": 0.016, + "step": 18696 + }, + { + "epoch": 7.6034973566490445, + "grad_norm": 4.78579922073743, + "learning_rate": 7.756308972315059e-06, + "loss": 0.0626, + "step": 18697 + }, + { + "epoch": 7.60390402602684, + "grad_norm": 3.3863802511963073, + "learning_rate": 7.755321627639843e-06, + "loss": 0.0624, + "step": 18698 + }, + { + "epoch": 7.604310695404636, + "grad_norm": 4.771260373388716, + "learning_rate": 7.754334306007382e-06, + "loss": 0.1083, + "step": 18699 + }, + { + "epoch": 7.604717364782432, + "grad_norm": 10.323223351445243, + "learning_rate": 7.75334700742781e-06, + "loss": 0.2756, + "step": 18700 + }, + { + "epoch": 7.6051240341602275, + "grad_norm": 3.6288469009966526, + "learning_rate": 7.752359731911263e-06, + "loss": 0.0637, + "step": 18701 + }, + { + "epoch": 
7.605530703538023, + "grad_norm": 30.364690422318567, + "learning_rate": 7.751372479467872e-06, + "loss": 0.8032, + "step": 18702 + }, + { + "epoch": 7.605937372915819, + "grad_norm": 0.8744723948413192, + "learning_rate": 7.750385250107777e-06, + "loss": 0.0107, + "step": 18703 + }, + { + "epoch": 7.606344042293616, + "grad_norm": 6.425821474059571, + "learning_rate": 7.749398043841106e-06, + "loss": 0.1883, + "step": 18704 + }, + { + "epoch": 7.6067507116714115, + "grad_norm": 0.6527639405264714, + "learning_rate": 7.748410860678e-06, + "loss": 0.0085, + "step": 18705 + }, + { + "epoch": 7.607157381049207, + "grad_norm": 0.45975171434289247, + "learning_rate": 7.747423700628589e-06, + "loss": 0.0069, + "step": 18706 + }, + { + "epoch": 7.607564050427003, + "grad_norm": 0.5584340296493645, + "learning_rate": 7.746436563703008e-06, + "loss": 0.0062, + "step": 18707 + }, + { + "epoch": 7.607970719804799, + "grad_norm": 0.08424946938545991, + "learning_rate": 7.745449449911387e-06, + "loss": 0.0015, + "step": 18708 + }, + { + "epoch": 7.6083773891825945, + "grad_norm": 0.030386665987976232, + "learning_rate": 7.744462359263864e-06, + "loss": 0.0005, + "step": 18709 + }, + { + "epoch": 7.60878405856039, + "grad_norm": 1.3485539460843137, + "learning_rate": 7.74347529177057e-06, + "loss": 0.0195, + "step": 18710 + }, + { + "epoch": 7.609190727938186, + "grad_norm": 9.691908649597845, + "learning_rate": 7.742488247441637e-06, + "loss": 0.1363, + "step": 18711 + }, + { + "epoch": 7.609597397315982, + "grad_norm": 3.3301446810836506, + "learning_rate": 7.7415012262872e-06, + "loss": 0.0576, + "step": 18712 + }, + { + "epoch": 7.610004066693778, + "grad_norm": 0.44314075286806, + "learning_rate": 7.74051422831739e-06, + "loss": 0.0044, + "step": 18713 + }, + { + "epoch": 7.610410736071573, + "grad_norm": 11.387445339622335, + "learning_rate": 7.739527253542335e-06, + "loss": 0.3117, + "step": 18714 + }, + { + "epoch": 7.61081740544937, + "grad_norm": 0.05948855289095884, + "learning_rate": 7.738540301972174e-06, + "loss": 0.001, + "step": 18715 + }, + { + "epoch": 7.611224074827166, + "grad_norm": 11.757667515263183, + "learning_rate": 7.737553373617034e-06, + "loss": 0.3383, + "step": 18716 + }, + { + "epoch": 7.6116307442049616, + "grad_norm": 4.721316458011256, + "learning_rate": 7.736566468487045e-06, + "loss": 0.0971, + "step": 18717 + }, + { + "epoch": 7.612037413582757, + "grad_norm": 0.7731984803888697, + "learning_rate": 7.735579586592344e-06, + "loss": 0.0128, + "step": 18718 + }, + { + "epoch": 7.612444082960553, + "grad_norm": 0.108894616179269, + "learning_rate": 7.734592727943056e-06, + "loss": 0.0014, + "step": 18719 + }, + { + "epoch": 7.612850752338349, + "grad_norm": 0.35601382747198423, + "learning_rate": 7.733605892549317e-06, + "loss": 0.0062, + "step": 18720 + }, + { + "epoch": 7.613257421716145, + "grad_norm": 3.9185561671149887, + "learning_rate": 7.732619080421248e-06, + "loss": 0.0673, + "step": 18721 + }, + { + "epoch": 7.61366409109394, + "grad_norm": 15.25032744192916, + "learning_rate": 7.73163229156899e-06, + "loss": 0.6924, + "step": 18722 + }, + { + "epoch": 7.614070760471736, + "grad_norm": 5.5957216675269725, + "learning_rate": 7.730645526002668e-06, + "loss": 0.1185, + "step": 18723 + }, + { + "epoch": 7.614477429849533, + "grad_norm": 1.0996510537372748, + "learning_rate": 7.729658783732408e-06, + "loss": 0.0142, + "step": 18724 + }, + { + "epoch": 7.614884099227329, + "grad_norm": 1.0641785201267155, + "learning_rate": 7.728672064768348e-06, + "loss": 0.0135, 
+ "step": 18725 + }, + { + "epoch": 7.615290768605124, + "grad_norm": 7.776753650611188, + "learning_rate": 7.727685369120612e-06, + "loss": 0.3587, + "step": 18726 + }, + { + "epoch": 7.61569743798292, + "grad_norm": 3.067993476123025, + "learning_rate": 7.726698696799326e-06, + "loss": 0.0545, + "step": 18727 + }, + { + "epoch": 7.616104107360716, + "grad_norm": 3.487096812217161, + "learning_rate": 7.725712047814624e-06, + "loss": 0.096, + "step": 18728 + }, + { + "epoch": 7.616510776738512, + "grad_norm": 1.3255388497003822, + "learning_rate": 7.724725422176633e-06, + "loss": 0.016, + "step": 18729 + }, + { + "epoch": 7.616917446116307, + "grad_norm": 1.25669132637297, + "learning_rate": 7.723738819895478e-06, + "loss": 0.0199, + "step": 18730 + }, + { + "epoch": 7.617324115494103, + "grad_norm": 0.09078875461153779, + "learning_rate": 7.722752240981292e-06, + "loss": 0.001, + "step": 18731 + }, + { + "epoch": 7.617730784871899, + "grad_norm": 2.2995827768827066, + "learning_rate": 7.7217656854442e-06, + "loss": 0.0358, + "step": 18732 + }, + { + "epoch": 7.618137454249695, + "grad_norm": 4.350615461540442, + "learning_rate": 7.72077915329433e-06, + "loss": 0.1889, + "step": 18733 + }, + { + "epoch": 7.6185441236274904, + "grad_norm": 3.925915156519446, + "learning_rate": 7.719792644541808e-06, + "loss": 0.0816, + "step": 18734 + }, + { + "epoch": 7.618950793005286, + "grad_norm": 1.652732003872772, + "learning_rate": 7.718806159196764e-06, + "loss": 0.0264, + "step": 18735 + }, + { + "epoch": 7.619357462383083, + "grad_norm": 1.1065680446375548, + "learning_rate": 7.717819697269322e-06, + "loss": 0.0147, + "step": 18736 + }, + { + "epoch": 7.619764131760879, + "grad_norm": 0.17860156950206665, + "learning_rate": 7.716833258769608e-06, + "loss": 0.0019, + "step": 18737 + }, + { + "epoch": 7.620170801138674, + "grad_norm": 1.6357410475741159, + "learning_rate": 7.715846843707752e-06, + "loss": 0.0212, + "step": 18738 + }, + { + "epoch": 7.62057747051647, + "grad_norm": 7.227463433063663, + "learning_rate": 7.714860452093876e-06, + "loss": 0.1849, + "step": 18739 + }, + { + "epoch": 7.620984139894266, + "grad_norm": 0.5286176803248096, + "learning_rate": 7.713874083938107e-06, + "loss": 0.0067, + "step": 18740 + }, + { + "epoch": 7.621390809272062, + "grad_norm": 0.21708022784922046, + "learning_rate": 7.712887739250573e-06, + "loss": 0.002, + "step": 18741 + }, + { + "epoch": 7.6217974786498575, + "grad_norm": 0.08752913295327033, + "learning_rate": 7.711901418041398e-06, + "loss": 0.0016, + "step": 18742 + }, + { + "epoch": 7.622204148027653, + "grad_norm": 0.1117184518609399, + "learning_rate": 7.710915120320704e-06, + "loss": 0.0007, + "step": 18743 + }, + { + "epoch": 7.622610817405449, + "grad_norm": 0.16868393809293752, + "learning_rate": 7.70992884609862e-06, + "loss": 0.0025, + "step": 18744 + }, + { + "epoch": 7.623017486783246, + "grad_norm": 8.009952445608343, + "learning_rate": 7.70894259538527e-06, + "loss": 0.2592, + "step": 18745 + }, + { + "epoch": 7.623424156161041, + "grad_norm": 0.1965929348393742, + "learning_rate": 7.707956368190776e-06, + "loss": 0.0023, + "step": 18746 + }, + { + "epoch": 7.623830825538837, + "grad_norm": 0.476130495420802, + "learning_rate": 7.706970164525261e-06, + "loss": 0.0075, + "step": 18747 + }, + { + "epoch": 7.624237494916633, + "grad_norm": 0.0946336987931315, + "learning_rate": 7.705983984398854e-06, + "loss": 0.0014, + "step": 18748 + }, + { + "epoch": 7.624644164294429, + "grad_norm": 0.05145984602930277, + "learning_rate": 
7.704997827821676e-06, + "loss": 0.0007, + "step": 18749 + }, + { + "epoch": 7.6250508336722245, + "grad_norm": 11.35029054487258, + "learning_rate": 7.704011694803848e-06, + "loss": 0.359, + "step": 18750 + }, + { + "epoch": 7.62545750305002, + "grad_norm": 4.810128292596818, + "learning_rate": 7.703025585355499e-06, + "loss": 0.1039, + "step": 18751 + }, + { + "epoch": 7.625864172427816, + "grad_norm": 0.005142394970788756, + "learning_rate": 7.702039499486747e-06, + "loss": 0.0001, + "step": 18752 + }, + { + "epoch": 7.626270841805612, + "grad_norm": 12.863312342337117, + "learning_rate": 7.701053437207716e-06, + "loss": 0.6157, + "step": 18753 + }, + { + "epoch": 7.6266775111834075, + "grad_norm": 11.158890380507573, + "learning_rate": 7.700067398528528e-06, + "loss": 0.2262, + "step": 18754 + }, + { + "epoch": 7.627084180561203, + "grad_norm": 2.0254265956131046, + "learning_rate": 7.699081383459307e-06, + "loss": 0.0331, + "step": 18755 + }, + { + "epoch": 7.627490849939, + "grad_norm": 17.909185001325284, + "learning_rate": 7.698095392010171e-06, + "loss": 1.0765, + "step": 18756 + }, + { + "epoch": 7.627897519316796, + "grad_norm": 0.4526503310855952, + "learning_rate": 7.697109424191251e-06, + "loss": 0.0083, + "step": 18757 + }, + { + "epoch": 7.6283041886945915, + "grad_norm": 0.011468318220045572, + "learning_rate": 7.696123480012657e-06, + "loss": 0.0002, + "step": 18758 + }, + { + "epoch": 7.628710858072387, + "grad_norm": 0.005769788793307946, + "learning_rate": 7.695137559484514e-06, + "loss": 0.0001, + "step": 18759 + }, + { + "epoch": 7.629117527450183, + "grad_norm": 0.04218421003950559, + "learning_rate": 7.694151662616945e-06, + "loss": 0.0006, + "step": 18760 + }, + { + "epoch": 7.629524196827979, + "grad_norm": 0.9818833977980801, + "learning_rate": 7.69316578942007e-06, + "loss": 0.0134, + "step": 18761 + }, + { + "epoch": 7.6299308662057745, + "grad_norm": 4.6978487985128865, + "learning_rate": 7.692179939904008e-06, + "loss": 0.0514, + "step": 18762 + }, + { + "epoch": 7.63033753558357, + "grad_norm": 0.6461540920858201, + "learning_rate": 7.691194114078882e-06, + "loss": 0.0066, + "step": 18763 + }, + { + "epoch": 7.630744204961366, + "grad_norm": 6.714559858622179, + "learning_rate": 7.69020831195481e-06, + "loss": 0.1978, + "step": 18764 + }, + { + "epoch": 7.631150874339163, + "grad_norm": 1.1667916810399184, + "learning_rate": 7.68922253354191e-06, + "loss": 0.0162, + "step": 18765 + }, + { + "epoch": 7.6315575437169585, + "grad_norm": 3.6366641358578047, + "learning_rate": 7.688236778850307e-06, + "loss": 0.041, + "step": 18766 + }, + { + "epoch": 7.631964213094754, + "grad_norm": 8.18597623349926, + "learning_rate": 7.687251047890116e-06, + "loss": 0.2926, + "step": 18767 + }, + { + "epoch": 7.63237088247255, + "grad_norm": 6.648055723880272, + "learning_rate": 7.686265340671455e-06, + "loss": 0.1505, + "step": 18768 + }, + { + "epoch": 7.632777551850346, + "grad_norm": 4.924570467612078, + "learning_rate": 7.685279657204446e-06, + "loss": 0.0866, + "step": 18769 + }, + { + "epoch": 7.6331842212281416, + "grad_norm": 5.423187900103482, + "learning_rate": 7.684293997499207e-06, + "loss": 0.1324, + "step": 18770 + }, + { + "epoch": 7.633590890605937, + "grad_norm": 4.338560994517903, + "learning_rate": 7.683308361565853e-06, + "loss": 0.1112, + "step": 18771 + }, + { + "epoch": 7.633997559983733, + "grad_norm": 8.826192734464277, + "learning_rate": 7.682322749414509e-06, + "loss": 0.2223, + "step": 18772 + }, + { + "epoch": 7.634404229361529, + "grad_norm": 
0.177620504267991, + "learning_rate": 7.681337161055285e-06, + "loss": 0.0032, + "step": 18773 + }, + { + "epoch": 7.634810898739325, + "grad_norm": 3.9207976948252945, + "learning_rate": 7.680351596498304e-06, + "loss": 0.0545, + "step": 18774 + }, + { + "epoch": 7.63521756811712, + "grad_norm": 0.11373975071522384, + "learning_rate": 7.679366055753679e-06, + "loss": 0.0015, + "step": 18775 + }, + { + "epoch": 7.635624237494916, + "grad_norm": 4.253274180346963, + "learning_rate": 7.678380538831532e-06, + "loss": 0.0645, + "step": 18776 + }, + { + "epoch": 7.636030906872713, + "grad_norm": 12.449198728957654, + "learning_rate": 7.677395045741975e-06, + "loss": 0.5321, + "step": 18777 + }, + { + "epoch": 7.636437576250509, + "grad_norm": 1.0328100618244915, + "learning_rate": 7.676409576495128e-06, + "loss": 0.0083, + "step": 18778 + }, + { + "epoch": 7.636844245628304, + "grad_norm": 3.772541314687576, + "learning_rate": 7.675424131101105e-06, + "loss": 0.0631, + "step": 18779 + }, + { + "epoch": 7.6372509150061, + "grad_norm": 3.7157616094345904, + "learning_rate": 7.674438709570025e-06, + "loss": 0.1213, + "step": 18780 + }, + { + "epoch": 7.637657584383896, + "grad_norm": 1.9587707278679638, + "learning_rate": 7.673453311912e-06, + "loss": 0.0336, + "step": 18781 + }, + { + "epoch": 7.638064253761692, + "grad_norm": 0.3898816457189581, + "learning_rate": 7.67246793813715e-06, + "loss": 0.0053, + "step": 18782 + }, + { + "epoch": 7.638470923139487, + "grad_norm": 2.3383185756640765, + "learning_rate": 7.671482588255586e-06, + "loss": 0.0501, + "step": 18783 + }, + { + "epoch": 7.638877592517283, + "grad_norm": 6.2855502256135924, + "learning_rate": 7.670497262277425e-06, + "loss": 0.1231, + "step": 18784 + }, + { + "epoch": 7.639284261895079, + "grad_norm": 7.323184100679412, + "learning_rate": 7.669511960212785e-06, + "loss": 0.2802, + "step": 18785 + }, + { + "epoch": 7.639690931272876, + "grad_norm": 2.9456035111512526, + "learning_rate": 7.668526682071775e-06, + "loss": 0.0499, + "step": 18786 + }, + { + "epoch": 7.640097600650671, + "grad_norm": 0.03768639167428882, + "learning_rate": 7.667541427864513e-06, + "loss": 0.0006, + "step": 18787 + }, + { + "epoch": 7.640504270028467, + "grad_norm": 5.770751356962525, + "learning_rate": 7.666556197601111e-06, + "loss": 0.1011, + "step": 18788 + }, + { + "epoch": 7.640910939406263, + "grad_norm": 4.0356758010019735, + "learning_rate": 7.665570991291687e-06, + "loss": 0.1047, + "step": 18789 + }, + { + "epoch": 7.641317608784059, + "grad_norm": 8.403845405686386, + "learning_rate": 7.664585808946348e-06, + "loss": 0.1969, + "step": 18790 + }, + { + "epoch": 7.641724278161854, + "grad_norm": 4.8936354130527056, + "learning_rate": 7.663600650575213e-06, + "loss": 0.0669, + "step": 18791 + }, + { + "epoch": 7.64213094753965, + "grad_norm": 8.225858256770936, + "learning_rate": 7.662615516188391e-06, + "loss": 0.3871, + "step": 18792 + }, + { + "epoch": 7.642537616917446, + "grad_norm": 3.0176041969234704, + "learning_rate": 7.661630405796e-06, + "loss": 0.0322, + "step": 18793 + }, + { + "epoch": 7.642944286295242, + "grad_norm": 4.112050513323391, + "learning_rate": 7.66064531940815e-06, + "loss": 0.1133, + "step": 18794 + }, + { + "epoch": 7.6433509556730375, + "grad_norm": 7.588042356135146, + "learning_rate": 7.659660257034951e-06, + "loss": 0.1894, + "step": 18795 + }, + { + "epoch": 7.643757625050833, + "grad_norm": 49.53503767868277, + "learning_rate": 7.658675218686519e-06, + "loss": 0.0952, + "step": 18796 + }, + { + "epoch": 
7.64416429442863, + "grad_norm": 0.5502436542363515, + "learning_rate": 7.657690204372962e-06, + "loss": 0.0072, + "step": 18797 + }, + { + "epoch": 7.644570963806426, + "grad_norm": 2.318928093789262, + "learning_rate": 7.656705214104395e-06, + "loss": 0.0449, + "step": 18798 + }, + { + "epoch": 7.644977633184221, + "grad_norm": 1.8018437739399327, + "learning_rate": 7.65572024789093e-06, + "loss": 0.0291, + "step": 18799 + }, + { + "epoch": 7.645384302562017, + "grad_norm": 0.02312560966908388, + "learning_rate": 7.654735305742675e-06, + "loss": 0.0004, + "step": 18800 + }, + { + "epoch": 7.645790971939813, + "grad_norm": 5.397769348463793, + "learning_rate": 7.653750387669742e-06, + "loss": 0.1134, + "step": 18801 + }, + { + "epoch": 7.646197641317609, + "grad_norm": 1.3310266738093128, + "learning_rate": 7.652765493682243e-06, + "loss": 0.0099, + "step": 18802 + }, + { + "epoch": 7.6466043106954045, + "grad_norm": 6.3328669722814634, + "learning_rate": 7.651780623790288e-06, + "loss": 0.2224, + "step": 18803 + }, + { + "epoch": 7.6470109800732, + "grad_norm": 0.030208610210379114, + "learning_rate": 7.650795778003985e-06, + "loss": 0.0005, + "step": 18804 + }, + { + "epoch": 7.647417649450996, + "grad_norm": 1.0938734892759432, + "learning_rate": 7.649810956333448e-06, + "loss": 0.0126, + "step": 18805 + }, + { + "epoch": 7.647824318828793, + "grad_norm": 2.3089866720838192, + "learning_rate": 7.648826158788782e-06, + "loss": 0.044, + "step": 18806 + }, + { + "epoch": 7.648230988206588, + "grad_norm": 0.07782979552720322, + "learning_rate": 7.6478413853801e-06, + "loss": 0.0011, + "step": 18807 + }, + { + "epoch": 7.648637657584384, + "grad_norm": 5.540915524093198, + "learning_rate": 7.64685663611751e-06, + "loss": 0.1496, + "step": 18808 + }, + { + "epoch": 7.64904432696218, + "grad_norm": 0.7160327584919882, + "learning_rate": 7.645871911011121e-06, + "loss": 0.0126, + "step": 18809 + }, + { + "epoch": 7.649450996339976, + "grad_norm": 2.579094098231807, + "learning_rate": 7.64488721007104e-06, + "loss": 0.0354, + "step": 18810 + }, + { + "epoch": 7.6498576657177715, + "grad_norm": 1.9046730808801875, + "learning_rate": 7.643902533307378e-06, + "loss": 0.0337, + "step": 18811 + }, + { + "epoch": 7.650264335095567, + "grad_norm": 4.337462212354331, + "learning_rate": 7.642917880730245e-06, + "loss": 0.0672, + "step": 18812 + }, + { + "epoch": 7.650671004473363, + "grad_norm": 2.2910503419780675, + "learning_rate": 7.641933252349745e-06, + "loss": 0.0475, + "step": 18813 + }, + { + "epoch": 7.651077673851159, + "grad_norm": 0.10807230562779806, + "learning_rate": 7.640948648175984e-06, + "loss": 0.0017, + "step": 18814 + }, + { + "epoch": 7.6514843432289545, + "grad_norm": 8.229688545420363, + "learning_rate": 7.639964068219074e-06, + "loss": 0.1703, + "step": 18815 + }, + { + "epoch": 7.65189101260675, + "grad_norm": 2.659412717397655, + "learning_rate": 7.638979512489123e-06, + "loss": 0.0321, + "step": 18816 + }, + { + "epoch": 7.652297681984546, + "grad_norm": 2.5656108179421717, + "learning_rate": 7.637994980996231e-06, + "loss": 0.0901, + "step": 18817 + }, + { + "epoch": 7.652704351362343, + "grad_norm": 0.9405849163942529, + "learning_rate": 7.637010473750514e-06, + "loss": 0.0152, + "step": 18818 + }, + { + "epoch": 7.6531110207401385, + "grad_norm": 0.5100835202838093, + "learning_rate": 7.636025990762071e-06, + "loss": 0.0065, + "step": 18819 + }, + { + "epoch": 7.653517690117934, + "grad_norm": 5.372710157225963, + "learning_rate": 7.635041532041011e-06, + "loss": 0.1494, 
+ "step": 18820 + }, + { + "epoch": 7.65392435949573, + "grad_norm": 8.871949085862301, + "learning_rate": 7.634057097597441e-06, + "loss": 0.3569, + "step": 18821 + }, + { + "epoch": 7.654331028873526, + "grad_norm": 0.8506432760226342, + "learning_rate": 7.633072687441465e-06, + "loss": 0.0149, + "step": 18822 + }, + { + "epoch": 7.6547376982513216, + "grad_norm": 1.446737315589447, + "learning_rate": 7.632088301583192e-06, + "loss": 0.0272, + "step": 18823 + }, + { + "epoch": 7.655144367629117, + "grad_norm": 0.5132525558020581, + "learning_rate": 7.631103940032718e-06, + "loss": 0.0089, + "step": 18824 + }, + { + "epoch": 7.655551037006913, + "grad_norm": 4.131666342956489, + "learning_rate": 7.63011960280016e-06, + "loss": 0.064, + "step": 18825 + }, + { + "epoch": 7.655957706384709, + "grad_norm": 2.2266523762390475, + "learning_rate": 7.629135289895615e-06, + "loss": 0.0361, + "step": 18826 + }, + { + "epoch": 7.6563643757625055, + "grad_norm": 0.037893902731862154, + "learning_rate": 7.628151001329188e-06, + "loss": 0.0004, + "step": 18827 + }, + { + "epoch": 7.656771045140301, + "grad_norm": 1.8684733983938309, + "learning_rate": 7.627166737110987e-06, + "loss": 0.0164, + "step": 18828 + }, + { + "epoch": 7.657177714518097, + "grad_norm": 1.4550300850634545, + "learning_rate": 7.626182497251113e-06, + "loss": 0.0263, + "step": 18829 + }, + { + "epoch": 7.657584383895893, + "grad_norm": 0.015663154946054723, + "learning_rate": 7.625198281759669e-06, + "loss": 0.0002, + "step": 18830 + }, + { + "epoch": 7.657991053273689, + "grad_norm": 0.03017450581229056, + "learning_rate": 7.624214090646762e-06, + "loss": 0.0005, + "step": 18831 + }, + { + "epoch": 7.658397722651484, + "grad_norm": 0.17671099308705135, + "learning_rate": 7.623229923922493e-06, + "loss": 0.0018, + "step": 18832 + }, + { + "epoch": 7.65880439202928, + "grad_norm": 6.354502361043827, + "learning_rate": 7.622245781596962e-06, + "loss": 0.0896, + "step": 18833 + }, + { + "epoch": 7.659211061407076, + "grad_norm": 0.20999268557780565, + "learning_rate": 7.621261663680278e-06, + "loss": 0.0033, + "step": 18834 + }, + { + "epoch": 7.659617730784872, + "grad_norm": 4.048075613969678, + "learning_rate": 7.620277570182539e-06, + "loss": 0.0829, + "step": 18835 + }, + { + "epoch": 7.660024400162667, + "grad_norm": 2.4831147783821796, + "learning_rate": 7.619293501113851e-06, + "loss": 0.0587, + "step": 18836 + }, + { + "epoch": 7.660431069540463, + "grad_norm": 0.7747822634243325, + "learning_rate": 7.618309456484309e-06, + "loss": 0.0146, + "step": 18837 + }, + { + "epoch": 7.66083773891826, + "grad_norm": 0.599293719199181, + "learning_rate": 7.6173254363040216e-06, + "loss": 0.0099, + "step": 18838 + }, + { + "epoch": 7.661244408296056, + "grad_norm": 3.785750330683777, + "learning_rate": 7.616341440583087e-06, + "loss": 0.089, + "step": 18839 + }, + { + "epoch": 7.661651077673851, + "grad_norm": 6.584019464236905, + "learning_rate": 7.615357469331607e-06, + "loss": 0.1051, + "step": 18840 + }, + { + "epoch": 7.662057747051647, + "grad_norm": 0.22372783554175654, + "learning_rate": 7.6143735225596836e-06, + "loss": 0.0032, + "step": 18841 + }, + { + "epoch": 7.662464416429443, + "grad_norm": 0.1982816780438111, + "learning_rate": 7.613389600277416e-06, + "loss": 0.0017, + "step": 18842 + }, + { + "epoch": 7.662871085807239, + "grad_norm": 0.142941171715341, + "learning_rate": 7.612405702494905e-06, + "loss": 0.0017, + "step": 18843 + }, + { + "epoch": 7.663277755185034, + "grad_norm": 5.0155544043003895, + 
"learning_rate": 7.611421829222252e-06, + "loss": 0.0837, + "step": 18844 + }, + { + "epoch": 7.66368442456283, + "grad_norm": 2.733165764923025, + "learning_rate": 7.610437980469556e-06, + "loss": 0.0376, + "step": 18845 + }, + { + "epoch": 7.664091093940626, + "grad_norm": 3.7915716827190673, + "learning_rate": 7.609454156246915e-06, + "loss": 0.059, + "step": 18846 + }, + { + "epoch": 7.664497763318423, + "grad_norm": 0.4034263650763015, + "learning_rate": 7.608470356564433e-06, + "loss": 0.0061, + "step": 18847 + }, + { + "epoch": 7.664904432696218, + "grad_norm": 0.4409438228185662, + "learning_rate": 7.607486581432205e-06, + "loss": 0.0047, + "step": 18848 + }, + { + "epoch": 7.665311102074014, + "grad_norm": 12.590483935321691, + "learning_rate": 7.606502830860332e-06, + "loss": 0.3774, + "step": 18849 + }, + { + "epoch": 7.66571777145181, + "grad_norm": 13.06011589452181, + "learning_rate": 7.60551910485891e-06, + "loss": 0.289, + "step": 18850 + }, + { + "epoch": 7.666124440829606, + "grad_norm": 1.625435717661863, + "learning_rate": 7.604535403438043e-06, + "loss": 0.0169, + "step": 18851 + }, + { + "epoch": 7.666531110207401, + "grad_norm": 0.5651967103027633, + "learning_rate": 7.603551726607823e-06, + "loss": 0.0102, + "step": 18852 + }, + { + "epoch": 7.666937779585197, + "grad_norm": 0.27063142669139606, + "learning_rate": 7.60256807437835e-06, + "loss": 0.0041, + "step": 18853 + }, + { + "epoch": 7.667344448962993, + "grad_norm": 0.014609773779741228, + "learning_rate": 7.601584446759724e-06, + "loss": 0.0002, + "step": 18854 + }, + { + "epoch": 7.667751118340789, + "grad_norm": 3.4562126286880765, + "learning_rate": 7.600600843762041e-06, + "loss": 0.1238, + "step": 18855 + }, + { + "epoch": 7.6681577877185845, + "grad_norm": 1.6368176738103275, + "learning_rate": 7.599617265395397e-06, + "loss": 0.0245, + "step": 18856 + }, + { + "epoch": 7.66856445709638, + "grad_norm": 2.8815983698917886, + "learning_rate": 7.598633711669895e-06, + "loss": 0.047, + "step": 18857 + }, + { + "epoch": 7.668971126474176, + "grad_norm": 1.564792425410941, + "learning_rate": 7.597650182595621e-06, + "loss": 0.0206, + "step": 18858 + }, + { + "epoch": 7.669377795851973, + "grad_norm": 2.8686470440754737, + "learning_rate": 7.596666678182678e-06, + "loss": 0.0502, + "step": 18859 + }, + { + "epoch": 7.669784465229768, + "grad_norm": 0.6174382544307904, + "learning_rate": 7.595683198441161e-06, + "loss": 0.0094, + "step": 18860 + }, + { + "epoch": 7.670191134607564, + "grad_norm": 0.05704367357292386, + "learning_rate": 7.594699743381164e-06, + "loss": 0.0005, + "step": 18861 + }, + { + "epoch": 7.67059780398536, + "grad_norm": 3.5452923359146182, + "learning_rate": 7.593716313012788e-06, + "loss": 0.1344, + "step": 18862 + }, + { + "epoch": 7.671004473363156, + "grad_norm": 1.3553659054123437, + "learning_rate": 7.592732907346124e-06, + "loss": 0.0187, + "step": 18863 + }, + { + "epoch": 7.6714111427409515, + "grad_norm": 8.861315227160317, + "learning_rate": 7.591749526391269e-06, + "loss": 0.2901, + "step": 18864 + }, + { + "epoch": 7.671817812118747, + "grad_norm": 5.020693892874317, + "learning_rate": 7.590766170158315e-06, + "loss": 0.1022, + "step": 18865 + }, + { + "epoch": 7.672224481496543, + "grad_norm": 3.2035885412205425, + "learning_rate": 7.589782838657361e-06, + "loss": 0.0398, + "step": 18866 + }, + { + "epoch": 7.672631150874339, + "grad_norm": 5.390670830507614, + "learning_rate": 7.588799531898497e-06, + "loss": 0.096, + "step": 18867 + }, + { + "epoch": 7.673037820252135, + 
"grad_norm": 2.4714114524675215, + "learning_rate": 7.587816249891819e-06, + "loss": 0.0331, + "step": 18868 + }, + { + "epoch": 7.673444489629931, + "grad_norm": 13.439241094126592, + "learning_rate": 7.586832992647424e-06, + "loss": 0.5693, + "step": 18869 + }, + { + "epoch": 7.673851159007727, + "grad_norm": 8.552271980047184, + "learning_rate": 7.585849760175402e-06, + "loss": 0.0418, + "step": 18870 + }, + { + "epoch": 7.674257828385523, + "grad_norm": 0.09763119158027611, + "learning_rate": 7.584866552485845e-06, + "loss": 0.001, + "step": 18871 + }, + { + "epoch": 7.6746644977633185, + "grad_norm": 10.8726095159932, + "learning_rate": 7.583883369588851e-06, + "loss": 0.3269, + "step": 18872 + }, + { + "epoch": 7.675071167141114, + "grad_norm": 0.061218172188221946, + "learning_rate": 7.58290021149451e-06, + "loss": 0.0007, + "step": 18873 + }, + { + "epoch": 7.67547783651891, + "grad_norm": 5.960262165336055, + "learning_rate": 7.5819170782129124e-06, + "loss": 0.1071, + "step": 18874 + }, + { + "epoch": 7.675884505896706, + "grad_norm": 0.06799731619143927, + "learning_rate": 7.580933969754156e-06, + "loss": 0.0008, + "step": 18875 + }, + { + "epoch": 7.6762911752745016, + "grad_norm": 1.1229653940496327, + "learning_rate": 7.579950886128329e-06, + "loss": 0.0157, + "step": 18876 + }, + { + "epoch": 7.676697844652297, + "grad_norm": 3.534336056323988, + "learning_rate": 7.578967827345525e-06, + "loss": 0.0696, + "step": 18877 + }, + { + "epoch": 7.677104514030093, + "grad_norm": 6.765446500317624, + "learning_rate": 7.577984793415832e-06, + "loss": 0.2533, + "step": 18878 + }, + { + "epoch": 7.67751118340789, + "grad_norm": 0.005549696171249742, + "learning_rate": 7.5770017843493445e-06, + "loss": 0.0001, + "step": 18879 + }, + { + "epoch": 7.6779178527856855, + "grad_norm": 7.846466352580989, + "learning_rate": 7.576018800156156e-06, + "loss": 0.1466, + "step": 18880 + }, + { + "epoch": 7.678324522163481, + "grad_norm": 0.14922727521362744, + "learning_rate": 7.575035840846351e-06, + "loss": 0.0024, + "step": 18881 + }, + { + "epoch": 7.678731191541277, + "grad_norm": 5.847565285676572, + "learning_rate": 7.574052906430026e-06, + "loss": 0.1212, + "step": 18882 + }, + { + "epoch": 7.679137860919073, + "grad_norm": 0.037346461510175705, + "learning_rate": 7.5730699969172685e-06, + "loss": 0.0005, + "step": 18883 + }, + { + "epoch": 7.679544530296869, + "grad_norm": 1.6689678706129807, + "learning_rate": 7.572087112318167e-06, + "loss": 0.0211, + "step": 18884 + }, + { + "epoch": 7.679951199674664, + "grad_norm": 3.1124774249529064, + "learning_rate": 7.571104252642816e-06, + "loss": 0.0497, + "step": 18885 + }, + { + "epoch": 7.68035786905246, + "grad_norm": 0.05404285102789126, + "learning_rate": 7.570121417901301e-06, + "loss": 0.0008, + "step": 18886 + }, + { + "epoch": 7.680764538430256, + "grad_norm": 7.621216338262745, + "learning_rate": 7.569138608103712e-06, + "loss": 0.1132, + "step": 18887 + }, + { + "epoch": 7.6811712078080525, + "grad_norm": 0.0025584079594499196, + "learning_rate": 7.568155823260139e-06, + "loss": 0.0, + "step": 18888 + }, + { + "epoch": 7.681577877185848, + "grad_norm": 1.4094140851840267, + "learning_rate": 7.567173063380671e-06, + "loss": 0.0107, + "step": 18889 + }, + { + "epoch": 7.681984546563644, + "grad_norm": 0.8596055275510333, + "learning_rate": 7.5661903284753965e-06, + "loss": 0.016, + "step": 18890 + }, + { + "epoch": 7.68239121594144, + "grad_norm": 3.6830566197685504, + "learning_rate": 7.565207618554401e-06, + "loss": 0.0749, + "step": 
18891 + }, + { + "epoch": 7.682797885319236, + "grad_norm": 0.9765778755750816, + "learning_rate": 7.564224933627778e-06, + "loss": 0.0098, + "step": 18892 + }, + { + "epoch": 7.683204554697031, + "grad_norm": 0.008439438711239719, + "learning_rate": 7.56324227370561e-06, + "loss": 0.0001, + "step": 18893 + }, + { + "epoch": 7.683611224074827, + "grad_norm": 0.06188521041976819, + "learning_rate": 7.562259638797984e-06, + "loss": 0.001, + "step": 18894 + }, + { + "epoch": 7.684017893452623, + "grad_norm": 0.2997611669803593, + "learning_rate": 7.561277028914993e-06, + "loss": 0.0032, + "step": 18895 + }, + { + "epoch": 7.684424562830419, + "grad_norm": 0.34458872053628636, + "learning_rate": 7.56029444406672e-06, + "loss": 0.0049, + "step": 18896 + }, + { + "epoch": 7.684831232208214, + "grad_norm": 5.748699408366649, + "learning_rate": 7.559311884263252e-06, + "loss": 0.1675, + "step": 18897 + }, + { + "epoch": 7.68523790158601, + "grad_norm": 0.10800103840898671, + "learning_rate": 7.558329349514676e-06, + "loss": 0.001, + "step": 18898 + }, + { + "epoch": 7.685644570963806, + "grad_norm": 0.20999249857476734, + "learning_rate": 7.557346839831079e-06, + "loss": 0.0023, + "step": 18899 + }, + { + "epoch": 7.686051240341603, + "grad_norm": 0.030454630809693097, + "learning_rate": 7.556364355222543e-06, + "loss": 0.0004, + "step": 18900 + }, + { + "epoch": 7.686457909719398, + "grad_norm": 8.93144884118395, + "learning_rate": 7.555381895699159e-06, + "loss": 0.1464, + "step": 18901 + }, + { + "epoch": 7.686864579097194, + "grad_norm": 0.7470850623028887, + "learning_rate": 7.554399461271011e-06, + "loss": 0.0113, + "step": 18902 + }, + { + "epoch": 7.68727124847499, + "grad_norm": 3.4128406359422723, + "learning_rate": 7.5534170519481844e-06, + "loss": 0.0395, + "step": 18903 + }, + { + "epoch": 7.687677917852786, + "grad_norm": 0.1523444291893536, + "learning_rate": 7.552434667740759e-06, + "loss": 0.0024, + "step": 18904 + }, + { + "epoch": 7.688084587230581, + "grad_norm": 4.531671449365538, + "learning_rate": 7.551452308658826e-06, + "loss": 0.0681, + "step": 18905 + }, + { + "epoch": 7.688491256608377, + "grad_norm": 2.17428175041718, + "learning_rate": 7.550469974712469e-06, + "loss": 0.0273, + "step": 18906 + }, + { + "epoch": 7.688897925986173, + "grad_norm": 8.849965980043493, + "learning_rate": 7.5494876659117665e-06, + "loss": 0.1555, + "step": 18907 + }, + { + "epoch": 7.689304595363969, + "grad_norm": 0.7966081944457716, + "learning_rate": 7.5485053822668095e-06, + "loss": 0.0122, + "step": 18908 + }, + { + "epoch": 7.689711264741765, + "grad_norm": 3.0143003907728314, + "learning_rate": 7.547523123787679e-06, + "loss": 0.0682, + "step": 18909 + }, + { + "epoch": 7.690117934119561, + "grad_norm": 14.21532243246055, + "learning_rate": 7.546540890484455e-06, + "loss": 0.3562, + "step": 18910 + }, + { + "epoch": 7.690524603497357, + "grad_norm": 1.5528195023443572, + "learning_rate": 7.545558682367225e-06, + "loss": 0.0179, + "step": 18911 + }, + { + "epoch": 7.690931272875153, + "grad_norm": 1.3068843885111572, + "learning_rate": 7.544576499446071e-06, + "loss": 0.0177, + "step": 18912 + }, + { + "epoch": 7.691337942252948, + "grad_norm": 8.646363570526441, + "learning_rate": 7.543594341731073e-06, + "loss": 0.3373, + "step": 18913 + }, + { + "epoch": 7.691744611630744, + "grad_norm": 5.49639047331416, + "learning_rate": 7.542612209232318e-06, + "loss": 0.1367, + "step": 18914 + }, + { + "epoch": 7.69215128100854, + "grad_norm": 0.5272821949949505, + "learning_rate": 
7.541630101959885e-06, + "loss": 0.0058, + "step": 18915 + }, + { + "epoch": 7.692557950386336, + "grad_norm": 0.013529325510588013, + "learning_rate": 7.540648019923857e-06, + "loss": 0.0002, + "step": 18916 + }, + { + "epoch": 7.6929646197641315, + "grad_norm": 0.07415005659517433, + "learning_rate": 7.539665963134312e-06, + "loss": 0.0008, + "step": 18917 + }, + { + "epoch": 7.693371289141927, + "grad_norm": 0.08292795062724834, + "learning_rate": 7.538683931601336e-06, + "loss": 0.0014, + "step": 18918 + }, + { + "epoch": 7.693777958519723, + "grad_norm": 3.3565465859316412, + "learning_rate": 7.537701925335008e-06, + "loss": 0.0581, + "step": 18919 + }, + { + "epoch": 7.69418462789752, + "grad_norm": 0.017923590975649127, + "learning_rate": 7.536719944345408e-06, + "loss": 0.0003, + "step": 18920 + }, + { + "epoch": 7.694591297275315, + "grad_norm": 1.9763509135071515, + "learning_rate": 7.53573798864262e-06, + "loss": 0.051, + "step": 18921 + }, + { + "epoch": 7.694997966653111, + "grad_norm": 2.7794941643354445, + "learning_rate": 7.534756058236719e-06, + "loss": 0.0706, + "step": 18922 + }, + { + "epoch": 7.695404636030907, + "grad_norm": 1.3766751862450681, + "learning_rate": 7.533774153137788e-06, + "loss": 0.0173, + "step": 18923 + }, + { + "epoch": 7.695811305408703, + "grad_norm": 0.2969065337531781, + "learning_rate": 7.532792273355909e-06, + "loss": 0.0049, + "step": 18924 + }, + { + "epoch": 7.6962179747864985, + "grad_norm": 12.291846724901387, + "learning_rate": 7.5318104189011574e-06, + "loss": 0.3211, + "step": 18925 + }, + { + "epoch": 7.696624644164294, + "grad_norm": 3.2794214748381334, + "learning_rate": 7.530828589783615e-06, + "loss": 0.0595, + "step": 18926 + }, + { + "epoch": 7.69703131354209, + "grad_norm": 0.5341069835462307, + "learning_rate": 7.529846786013357e-06, + "loss": 0.0069, + "step": 18927 + }, + { + "epoch": 7.697437982919886, + "grad_norm": 1.820661887427805, + "learning_rate": 7.528865007600469e-06, + "loss": 0.0267, + "step": 18928 + }, + { + "epoch": 7.697844652297682, + "grad_norm": 8.283138980821748, + "learning_rate": 7.527883254555024e-06, + "loss": 0.2437, + "step": 18929 + }, + { + "epoch": 7.698251321675478, + "grad_norm": 0.23543295481434612, + "learning_rate": 7.526901526887099e-06, + "loss": 0.0022, + "step": 18930 + }, + { + "epoch": 7.698657991053274, + "grad_norm": 0.8649517119252285, + "learning_rate": 7.525919824606778e-06, + "loss": 0.0192, + "step": 18931 + }, + { + "epoch": 7.69906466043107, + "grad_norm": 6.309434932690496, + "learning_rate": 7.524938147724134e-06, + "loss": 0.2793, + "step": 18932 + }, + { + "epoch": 7.6994713298088655, + "grad_norm": 0.5317688689649145, + "learning_rate": 7.523956496249243e-06, + "loss": 0.0077, + "step": 18933 + }, + { + "epoch": 7.699877999186661, + "grad_norm": 3.463944439853987, + "learning_rate": 7.522974870192188e-06, + "loss": 0.0927, + "step": 18934 + }, + { + "epoch": 7.700284668564457, + "grad_norm": 3.2265075786510935, + "learning_rate": 7.521993269563042e-06, + "loss": 0.047, + "step": 18935 + }, + { + "epoch": 7.700691337942253, + "grad_norm": 10.631376055935025, + "learning_rate": 7.5210116943718804e-06, + "loss": 0.375, + "step": 18936 + }, + { + "epoch": 7.701098007320049, + "grad_norm": 3.8631436784408373, + "learning_rate": 7.520030144628784e-06, + "loss": 0.1204, + "step": 18937 + }, + { + "epoch": 7.701504676697844, + "grad_norm": 0.5598874826372235, + "learning_rate": 7.519048620343825e-06, + "loss": 0.0065, + "step": 18938 + }, + { + "epoch": 7.70191134607564, + 
"grad_norm": 5.1269357281344305, + "learning_rate": 7.518067121527082e-06, + "loss": 0.0443, + "step": 18939 + }, + { + "epoch": 7.702318015453436, + "grad_norm": 0.25671907812311606, + "learning_rate": 7.517085648188627e-06, + "loss": 0.0044, + "step": 18940 + }, + { + "epoch": 7.7027246848312325, + "grad_norm": 1.0764072409311327, + "learning_rate": 7.516104200338539e-06, + "loss": 0.013, + "step": 18941 + }, + { + "epoch": 7.703131354209028, + "grad_norm": 1.7889560186968692, + "learning_rate": 7.515122777986891e-06, + "loss": 0.0257, + "step": 18942 + }, + { + "epoch": 7.703538023586824, + "grad_norm": 0.9830801747948347, + "learning_rate": 7.514141381143756e-06, + "loss": 0.0168, + "step": 18943 + }, + { + "epoch": 7.70394469296462, + "grad_norm": 3.669007000139304, + "learning_rate": 7.5131600098192134e-06, + "loss": 0.0387, + "step": 18944 + }, + { + "epoch": 7.704351362342416, + "grad_norm": 0.11681732036489459, + "learning_rate": 7.512178664023335e-06, + "loss": 0.0015, + "step": 18945 + }, + { + "epoch": 7.704758031720211, + "grad_norm": 0.021627432334812745, + "learning_rate": 7.5111973437661925e-06, + "loss": 0.0003, + "step": 18946 + }, + { + "epoch": 7.705164701098007, + "grad_norm": 1.8225321970502664, + "learning_rate": 7.510216049057865e-06, + "loss": 0.0267, + "step": 18947 + }, + { + "epoch": 7.705571370475803, + "grad_norm": 0.3134549497792205, + "learning_rate": 7.509234779908421e-06, + "loss": 0.0032, + "step": 18948 + }, + { + "epoch": 7.705978039853599, + "grad_norm": 3.8523122237208796, + "learning_rate": 7.508253536327935e-06, + "loss": 0.1434, + "step": 18949 + }, + { + "epoch": 7.706384709231395, + "grad_norm": 2.2314580925339684, + "learning_rate": 7.507272318326482e-06, + "loss": 0.034, + "step": 18950 + }, + { + "epoch": 7.706791378609191, + "grad_norm": 1.2205714835234125, + "learning_rate": 7.5062911259141335e-06, + "loss": 0.0156, + "step": 18951 + }, + { + "epoch": 7.707198047986987, + "grad_norm": 0.044642072317648857, + "learning_rate": 7.5053099591009614e-06, + "loss": 0.0005, + "step": 18952 + }, + { + "epoch": 7.707604717364783, + "grad_norm": 3.4907375860435232, + "learning_rate": 7.504328817897035e-06, + "loss": 0.056, + "step": 18953 + }, + { + "epoch": 7.708011386742578, + "grad_norm": 8.519660729216243, + "learning_rate": 7.5033477023124334e-06, + "loss": 0.2474, + "step": 18954 + }, + { + "epoch": 7.708418056120374, + "grad_norm": 2.4515408519855963, + "learning_rate": 7.5023666123572226e-06, + "loss": 0.0402, + "step": 18955 + }, + { + "epoch": 7.70882472549817, + "grad_norm": 0.1342100167342004, + "learning_rate": 7.5013855480414745e-06, + "loss": 0.0019, + "step": 18956 + }, + { + "epoch": 7.709231394875966, + "grad_norm": 2.5788726429253983, + "learning_rate": 7.500404509375267e-06, + "loss": 0.0384, + "step": 18957 + }, + { + "epoch": 7.709638064253761, + "grad_norm": 0.6301848816506427, + "learning_rate": 7.499423496368659e-06, + "loss": 0.0081, + "step": 18958 + }, + { + "epoch": 7.710044733631557, + "grad_norm": 3.670237194386844, + "learning_rate": 7.498442509031729e-06, + "loss": 0.128, + "step": 18959 + }, + { + "epoch": 7.710451403009353, + "grad_norm": 0.14873787626725726, + "learning_rate": 7.497461547374547e-06, + "loss": 0.0017, + "step": 18960 + }, + { + "epoch": 7.71085807238715, + "grad_norm": 0.7513358754779556, + "learning_rate": 7.496480611407178e-06, + "loss": 0.0118, + "step": 18961 + }, + { + "epoch": 7.711264741764945, + "grad_norm": 1.0505041497749616, + "learning_rate": 7.495499701139699e-06, + "loss": 0.0153, + 
"step": 18962 + }, + { + "epoch": 7.711671411142741, + "grad_norm": 2.111667414464451, + "learning_rate": 7.494518816582175e-06, + "loss": 0.026, + "step": 18963 + }, + { + "epoch": 7.712078080520537, + "grad_norm": 0.028240894414289088, + "learning_rate": 7.493537957744674e-06, + "loss": 0.0004, + "step": 18964 + }, + { + "epoch": 7.712484749898333, + "grad_norm": 1.7775077896738225, + "learning_rate": 7.492557124637272e-06, + "loss": 0.0233, + "step": 18965 + }, + { + "epoch": 7.712891419276128, + "grad_norm": 1.2682082286845673, + "learning_rate": 7.491576317270031e-06, + "loss": 0.0147, + "step": 18966 + }, + { + "epoch": 7.713298088653924, + "grad_norm": 1.734070041196343, + "learning_rate": 7.490595535653021e-06, + "loss": 0.0278, + "step": 18967 + }, + { + "epoch": 7.71370475803172, + "grad_norm": 12.99417082919084, + "learning_rate": 7.489614779796311e-06, + "loss": 0.3376, + "step": 18968 + }, + { + "epoch": 7.714111427409516, + "grad_norm": 4.148602886599587, + "learning_rate": 7.4886340497099685e-06, + "loss": 0.0644, + "step": 18969 + }, + { + "epoch": 7.714518096787312, + "grad_norm": 0.002856406755034318, + "learning_rate": 7.487653345404063e-06, + "loss": 0.0, + "step": 18970 + }, + { + "epoch": 7.714924766165108, + "grad_norm": 0.7942974486290433, + "learning_rate": 7.4866726668886566e-06, + "loss": 0.0093, + "step": 18971 + }, + { + "epoch": 7.715331435542904, + "grad_norm": 7.711202860385285, + "learning_rate": 7.485692014173822e-06, + "loss": 0.096, + "step": 18972 + }, + { + "epoch": 7.7157381049207, + "grad_norm": 0.3897293741156446, + "learning_rate": 7.484711387269626e-06, + "loss": 0.006, + "step": 18973 + }, + { + "epoch": 7.716144774298495, + "grad_norm": 3.8080468680143844, + "learning_rate": 7.483730786186131e-06, + "loss": 0.0523, + "step": 18974 + }, + { + "epoch": 7.716551443676291, + "grad_norm": 1.7202590617071294, + "learning_rate": 7.482750210933407e-06, + "loss": 0.0448, + "step": 18975 + }, + { + "epoch": 7.716958113054087, + "grad_norm": 0.031306992387502265, + "learning_rate": 7.481769661521521e-06, + "loss": 0.0006, + "step": 18976 + }, + { + "epoch": 7.717364782431883, + "grad_norm": 4.236925448827611, + "learning_rate": 7.480789137960533e-06, + "loss": 0.036, + "step": 18977 + }, + { + "epoch": 7.7177714518096785, + "grad_norm": 3.901818776283362, + "learning_rate": 7.479808640260515e-06, + "loss": 0.0759, + "step": 18978 + }, + { + "epoch": 7.718178121187474, + "grad_norm": 3.543820310371145, + "learning_rate": 7.478828168431531e-06, + "loss": 0.0627, + "step": 18979 + }, + { + "epoch": 7.71858479056527, + "grad_norm": 9.380934688163338, + "learning_rate": 7.477847722483642e-06, + "loss": 0.1277, + "step": 18980 + }, + { + "epoch": 7.718991459943066, + "grad_norm": 0.7437738732770798, + "learning_rate": 7.476867302426915e-06, + "loss": 0.0149, + "step": 18981 + }, + { + "epoch": 7.7193981293208624, + "grad_norm": 7.345128916703335, + "learning_rate": 7.475886908271418e-06, + "loss": 0.1644, + "step": 18982 + }, + { + "epoch": 7.719804798698658, + "grad_norm": 0.02752787322417979, + "learning_rate": 7.4749065400272116e-06, + "loss": 0.0004, + "step": 18983 + }, + { + "epoch": 7.720211468076454, + "grad_norm": 1.9306654716654426, + "learning_rate": 7.473926197704358e-06, + "loss": 0.0319, + "step": 18984 + }, + { + "epoch": 7.72061813745425, + "grad_norm": 10.910106290736337, + "learning_rate": 7.472945881312925e-06, + "loss": 0.2055, + "step": 18985 + }, + { + "epoch": 7.7210248068320455, + "grad_norm": 0.049603551734417585, + "learning_rate": 
7.4719655908629755e-06, + "loss": 0.0005, + "step": 18986 + }, + { + "epoch": 7.721431476209841, + "grad_norm": 6.844075225288787, + "learning_rate": 7.4709853263645685e-06, + "loss": 0.3807, + "step": 18987 + }, + { + "epoch": 7.721838145587637, + "grad_norm": 0.4740862433241216, + "learning_rate": 7.470005087827772e-06, + "loss": 0.0046, + "step": 18988 + }, + { + "epoch": 7.722244814965433, + "grad_norm": 0.14497022538666113, + "learning_rate": 7.469024875262647e-06, + "loss": 0.0024, + "step": 18989 + }, + { + "epoch": 7.722651484343229, + "grad_norm": 2.735483482506869, + "learning_rate": 7.4680446886792525e-06, + "loss": 0.027, + "step": 18990 + }, + { + "epoch": 7.723058153721025, + "grad_norm": 0.1512094437332363, + "learning_rate": 7.467064528087658e-06, + "loss": 0.0027, + "step": 18991 + }, + { + "epoch": 7.723464823098821, + "grad_norm": 3.863634426753861, + "learning_rate": 7.466084393497919e-06, + "loss": 0.0871, + "step": 18992 + }, + { + "epoch": 7.723871492476617, + "grad_norm": 4.341146284968704, + "learning_rate": 7.465104284920099e-06, + "loss": 0.1552, + "step": 18993 + }, + { + "epoch": 7.7242781618544125, + "grad_norm": 7.3915450764687725, + "learning_rate": 7.464124202364257e-06, + "loss": 0.1179, + "step": 18994 + }, + { + "epoch": 7.724684831232208, + "grad_norm": 1.156412853088393, + "learning_rate": 7.463144145840459e-06, + "loss": 0.0148, + "step": 18995 + }, + { + "epoch": 7.725091500610004, + "grad_norm": 0.007174220903561907, + "learning_rate": 7.462164115358763e-06, + "loss": 0.0001, + "step": 18996 + }, + { + "epoch": 7.7254981699878, + "grad_norm": 0.10209514471186595, + "learning_rate": 7.461184110929227e-06, + "loss": 0.0011, + "step": 18997 + }, + { + "epoch": 7.725904839365596, + "grad_norm": 0.6233119419438555, + "learning_rate": 7.460204132561917e-06, + "loss": 0.0033, + "step": 18998 + }, + { + "epoch": 7.726311508743391, + "grad_norm": 0.28234200905591317, + "learning_rate": 7.45922418026689e-06, + "loss": 0.0035, + "step": 18999 + }, + { + "epoch": 7.726718178121187, + "grad_norm": 4.56911844984144, + "learning_rate": 7.458244254054202e-06, + "loss": 0.0921, + "step": 19000 + }, + { + "epoch": 7.727124847498983, + "grad_norm": 3.8706133766473707, + "learning_rate": 7.457264353933919e-06, + "loss": 0.1216, + "step": 19001 + }, + { + "epoch": 7.7275315168767795, + "grad_norm": 0.689660626445449, + "learning_rate": 7.456284479916097e-06, + "loss": 0.0101, + "step": 19002 + }, + { + "epoch": 7.727938186254575, + "grad_norm": 0.5140017186838552, + "learning_rate": 7.455304632010793e-06, + "loss": 0.0083, + "step": 19003 + }, + { + "epoch": 7.728344855632371, + "grad_norm": 2.8381308303723687, + "learning_rate": 7.45432481022807e-06, + "loss": 0.0397, + "step": 19004 + }, + { + "epoch": 7.728751525010167, + "grad_norm": 3.566661965257851, + "learning_rate": 7.453345014577984e-06, + "loss": 0.0877, + "step": 19005 + }, + { + "epoch": 7.729158194387963, + "grad_norm": 3.1219454920897394, + "learning_rate": 7.452365245070593e-06, + "loss": 0.0795, + "step": 19006 + }, + { + "epoch": 7.729564863765758, + "grad_norm": 9.523880219906722, + "learning_rate": 7.451385501715953e-06, + "loss": 0.148, + "step": 19007 + }, + { + "epoch": 7.729971533143554, + "grad_norm": 0.44966879101582063, + "learning_rate": 7.450405784524125e-06, + "loss": 0.0049, + "step": 19008 + }, + { + "epoch": 7.73037820252135, + "grad_norm": 2.326294730641173, + "learning_rate": 7.449426093505165e-06, + "loss": 0.0323, + "step": 19009 + }, + { + "epoch": 7.730784871899146, + "grad_norm": 
0.3333210987146181, + "learning_rate": 7.448446428669128e-06, + "loss": 0.004, + "step": 19010 + }, + { + "epoch": 7.731191541276942, + "grad_norm": 4.117166309080476, + "learning_rate": 7.4474667900260745e-06, + "loss": 0.1312, + "step": 19011 + }, + { + "epoch": 7.731598210654738, + "grad_norm": 0.10103921487765559, + "learning_rate": 7.446487177586059e-06, + "loss": 0.0006, + "step": 19012 + }, + { + "epoch": 7.732004880032534, + "grad_norm": 1.5702591981978, + "learning_rate": 7.4455075913591354e-06, + "loss": 0.023, + "step": 19013 + }, + { + "epoch": 7.73241154941033, + "grad_norm": 9.903685672017684, + "learning_rate": 7.444528031355364e-06, + "loss": 0.4454, + "step": 19014 + }, + { + "epoch": 7.732818218788125, + "grad_norm": 2.9681524133299906, + "learning_rate": 7.443548497584801e-06, + "loss": 0.0455, + "step": 19015 + }, + { + "epoch": 7.733224888165921, + "grad_norm": 0.5466640966723938, + "learning_rate": 7.442568990057495e-06, + "loss": 0.0076, + "step": 19016 + }, + { + "epoch": 7.733631557543717, + "grad_norm": 3.4424226972284924, + "learning_rate": 7.441589508783508e-06, + "loss": 0.0456, + "step": 19017 + }, + { + "epoch": 7.734038226921513, + "grad_norm": 0.5110833681729616, + "learning_rate": 7.440610053772892e-06, + "loss": 0.0034, + "step": 19018 + }, + { + "epoch": 7.734444896299308, + "grad_norm": 9.130065127829333, + "learning_rate": 7.439630625035702e-06, + "loss": 0.0706, + "step": 19019 + }, + { + "epoch": 7.734851565677104, + "grad_norm": 3.858832804218645, + "learning_rate": 7.4386512225819895e-06, + "loss": 0.0569, + "step": 19020 + }, + { + "epoch": 7.7352582350549, + "grad_norm": 3.9616566770521073, + "learning_rate": 7.4376718464218145e-06, + "loss": 0.0949, + "step": 19021 + }, + { + "epoch": 7.735664904432696, + "grad_norm": 0.11950759166294957, + "learning_rate": 7.436692496565228e-06, + "loss": 0.0008, + "step": 19022 + }, + { + "epoch": 7.736071573810492, + "grad_norm": 4.383889096674475, + "learning_rate": 7.435713173022279e-06, + "loss": 0.0773, + "step": 19023 + }, + { + "epoch": 7.736478243188288, + "grad_norm": 0.8594316054119553, + "learning_rate": 7.434733875803028e-06, + "loss": 0.0096, + "step": 19024 + }, + { + "epoch": 7.736884912566084, + "grad_norm": 3.856792861205424, + "learning_rate": 7.4337546049175255e-06, + "loss": 0.061, + "step": 19025 + }, + { + "epoch": 7.73729158194388, + "grad_norm": 1.5696388629371802, + "learning_rate": 7.4327753603758215e-06, + "loss": 0.0215, + "step": 19026 + }, + { + "epoch": 7.737698251321675, + "grad_norm": 0.7494745066076726, + "learning_rate": 7.431796142187971e-06, + "loss": 0.0078, + "step": 19027 + }, + { + "epoch": 7.738104920699471, + "grad_norm": 10.098004177968756, + "learning_rate": 7.430816950364027e-06, + "loss": 0.1821, + "step": 19028 + }, + { + "epoch": 7.738511590077267, + "grad_norm": 0.10432174879247484, + "learning_rate": 7.429837784914038e-06, + "loss": 0.0012, + "step": 19029 + }, + { + "epoch": 7.738918259455063, + "grad_norm": 8.550441853273375, + "learning_rate": 7.4288586458480595e-06, + "loss": 0.2267, + "step": 19030 + }, + { + "epoch": 7.739324928832859, + "grad_norm": 0.7277207071010826, + "learning_rate": 7.427879533176141e-06, + "loss": 0.0075, + "step": 19031 + }, + { + "epoch": 7.739731598210655, + "grad_norm": 7.9676777643625964, + "learning_rate": 7.426900446908334e-06, + "loss": 0.2354, + "step": 19032 + }, + { + "epoch": 7.740138267588451, + "grad_norm": 2.487009387764715, + "learning_rate": 7.425921387054685e-06, + "loss": 0.0322, + "step": 19033 + }, + { + 
"epoch": 7.740544936966247, + "grad_norm": 14.502751202442164, + "learning_rate": 7.424942353625253e-06, + "loss": 0.6307, + "step": 19034 + }, + { + "epoch": 7.7409516063440424, + "grad_norm": 2.088872289528753, + "learning_rate": 7.423963346630082e-06, + "loss": 0.0283, + "step": 19035 + }, + { + "epoch": 7.741358275721838, + "grad_norm": 0.43601508073657663, + "learning_rate": 7.422984366079222e-06, + "loss": 0.0069, + "step": 19036 + }, + { + "epoch": 7.741764945099634, + "grad_norm": 0.3862694892136163, + "learning_rate": 7.4220054119827264e-06, + "loss": 0.0053, + "step": 19037 + }, + { + "epoch": 7.74217161447743, + "grad_norm": 4.207364339003898, + "learning_rate": 7.421026484350644e-06, + "loss": 0.0566, + "step": 19038 + }, + { + "epoch": 7.7425782838552255, + "grad_norm": 0.6293172069989071, + "learning_rate": 7.42004758319302e-06, + "loss": 0.0072, + "step": 19039 + }, + { + "epoch": 7.742984953233021, + "grad_norm": 10.93653732899791, + "learning_rate": 7.419068708519907e-06, + "loss": 0.476, + "step": 19040 + }, + { + "epoch": 7.743391622610817, + "grad_norm": 10.047341001850851, + "learning_rate": 7.4180898603413545e-06, + "loss": 0.4517, + "step": 19041 + }, + { + "epoch": 7.743798291988613, + "grad_norm": 0.10231981061464876, + "learning_rate": 7.417111038667409e-06, + "loss": 0.0012, + "step": 19042 + }, + { + "epoch": 7.7442049613664095, + "grad_norm": 1.8123923657810295, + "learning_rate": 7.416132243508114e-06, + "loss": 0.0109, + "step": 19043 + }, + { + "epoch": 7.744611630744205, + "grad_norm": 0.6808635136049838, + "learning_rate": 7.415153474873527e-06, + "loss": 0.0062, + "step": 19044 + }, + { + "epoch": 7.745018300122001, + "grad_norm": 3.082601638769505, + "learning_rate": 7.414174732773689e-06, + "loss": 0.064, + "step": 19045 + }, + { + "epoch": 7.745424969499797, + "grad_norm": 0.15025657002029463, + "learning_rate": 7.413196017218647e-06, + "loss": 0.0015, + "step": 19046 + }, + { + "epoch": 7.7458316388775925, + "grad_norm": 2.4639846445061218, + "learning_rate": 7.412217328218452e-06, + "loss": 0.051, + "step": 19047 + }, + { + "epoch": 7.746238308255388, + "grad_norm": 0.5166748709834751, + "learning_rate": 7.411238665783147e-06, + "loss": 0.0064, + "step": 19048 + }, + { + "epoch": 7.746644977633184, + "grad_norm": 1.7055398040442047, + "learning_rate": 7.41026002992278e-06, + "loss": 0.0246, + "step": 19049 + }, + { + "epoch": 7.74705164701098, + "grad_norm": 1.7153080976039747, + "learning_rate": 7.409281420647399e-06, + "loss": 0.03, + "step": 19050 + }, + { + "epoch": 7.747458316388776, + "grad_norm": 4.319992064648194, + "learning_rate": 7.408302837967047e-06, + "loss": 0.1008, + "step": 19051 + }, + { + "epoch": 7.747864985766572, + "grad_norm": 9.670601158499768, + "learning_rate": 7.407324281891769e-06, + "loss": 0.3638, + "step": 19052 + }, + { + "epoch": 7.748271655144368, + "grad_norm": 0.1104607798791516, + "learning_rate": 7.406345752431614e-06, + "loss": 0.0011, + "step": 19053 + }, + { + "epoch": 7.748678324522164, + "grad_norm": 0.8497831540266176, + "learning_rate": 7.405367249596627e-06, + "loss": 0.012, + "step": 19054 + }, + { + "epoch": 7.7490849938999595, + "grad_norm": 3.805932529848156, + "learning_rate": 7.404388773396849e-06, + "loss": 0.12, + "step": 19055 + }, + { + "epoch": 7.749491663277755, + "grad_norm": 6.865179001434271, + "learning_rate": 7.40341032384233e-06, + "loss": 0.1372, + "step": 19056 + }, + { + "epoch": 7.749898332655551, + "grad_norm": 0.036442245992508185, + "learning_rate": 7.402431900943106e-06, + 
"loss": 0.0005, + "step": 19057 + }, + { + "epoch": 7.750305002033347, + "grad_norm": 0.1299050264793442, + "learning_rate": 7.4014535047092276e-06, + "loss": 0.0011, + "step": 19058 + }, + { + "epoch": 7.750711671411143, + "grad_norm": 4.958903663174871, + "learning_rate": 7.400475135150738e-06, + "loss": 0.0748, + "step": 19059 + }, + { + "epoch": 7.751118340788938, + "grad_norm": 0.09075004817301026, + "learning_rate": 7.399496792277678e-06, + "loss": 0.0014, + "step": 19060 + }, + { + "epoch": 7.751525010166734, + "grad_norm": 0.013505396384923578, + "learning_rate": 7.398518476100091e-06, + "loss": 0.0002, + "step": 19061 + }, + { + "epoch": 7.75193167954453, + "grad_norm": 0.24104341967382117, + "learning_rate": 7.397540186628023e-06, + "loss": 0.0022, + "step": 19062 + }, + { + "epoch": 7.752338348922326, + "grad_norm": 2.760695474689573, + "learning_rate": 7.396561923871514e-06, + "loss": 0.0365, + "step": 19063 + }, + { + "epoch": 7.752745018300122, + "grad_norm": 8.292054575154264, + "learning_rate": 7.395583687840606e-06, + "loss": 0.2466, + "step": 19064 + }, + { + "epoch": 7.753151687677918, + "grad_norm": 9.10668227595657, + "learning_rate": 7.3946054785453444e-06, + "loss": 0.2088, + "step": 19065 + }, + { + "epoch": 7.753558357055714, + "grad_norm": 1.0246124798178307, + "learning_rate": 7.393627295995769e-06, + "loss": 0.0173, + "step": 19066 + }, + { + "epoch": 7.75396502643351, + "grad_norm": 0.2323880567961234, + "learning_rate": 7.3926491402019175e-06, + "loss": 0.0026, + "step": 19067 + }, + { + "epoch": 7.754371695811305, + "grad_norm": 0.09325358620859531, + "learning_rate": 7.391671011173839e-06, + "loss": 0.0012, + "step": 19068 + }, + { + "epoch": 7.754778365189101, + "grad_norm": 0.38535319796581047, + "learning_rate": 7.390692908921569e-06, + "loss": 0.0042, + "step": 19069 + }, + { + "epoch": 7.755185034566897, + "grad_norm": 4.315021668793065, + "learning_rate": 7.3897148334551486e-06, + "loss": 0.0843, + "step": 19070 + }, + { + "epoch": 7.755591703944693, + "grad_norm": 0.05514603679922641, + "learning_rate": 7.388736784784621e-06, + "loss": 0.0005, + "step": 19071 + }, + { + "epoch": 7.755998373322489, + "grad_norm": 8.7588904140264, + "learning_rate": 7.387758762920023e-06, + "loss": 0.1925, + "step": 19072 + }, + { + "epoch": 7.756405042700285, + "grad_norm": 5.765915882550916, + "learning_rate": 7.3867807678713965e-06, + "loss": 0.1174, + "step": 19073 + }, + { + "epoch": 7.756811712078081, + "grad_norm": 4.70810943053509, + "learning_rate": 7.385802799648779e-06, + "loss": 0.0734, + "step": 19074 + }, + { + "epoch": 7.757218381455877, + "grad_norm": 0.005873370770430493, + "learning_rate": 7.384824858262214e-06, + "loss": 0.0001, + "step": 19075 + }, + { + "epoch": 7.757625050833672, + "grad_norm": 0.2003572838077397, + "learning_rate": 7.383846943721737e-06, + "loss": 0.0026, + "step": 19076 + }, + { + "epoch": 7.758031720211468, + "grad_norm": 0.047602866527302744, + "learning_rate": 7.382869056037387e-06, + "loss": 0.0005, + "step": 19077 + }, + { + "epoch": 7.758438389589264, + "grad_norm": 1.8066393163714956, + "learning_rate": 7.3818911952192044e-06, + "loss": 0.03, + "step": 19078 + }, + { + "epoch": 7.75884505896706, + "grad_norm": 1.623747720355439, + "learning_rate": 7.380913361277227e-06, + "loss": 0.0201, + "step": 19079 + }, + { + "epoch": 7.759251728344855, + "grad_norm": 2.4453773818419613, + "learning_rate": 7.379935554221489e-06, + "loss": 0.0313, + "step": 19080 + }, + { + "epoch": 7.759658397722651, + "grad_norm": 1.6701279474706332, 
+ "learning_rate": 7.3789577740620345e-06, + "loss": 0.0343, + "step": 19081 + }, + { + "epoch": 7.760065067100447, + "grad_norm": 2.3955744107378205, + "learning_rate": 7.377980020808897e-06, + "loss": 0.0343, + "step": 19082 + }, + { + "epoch": 7.760471736478243, + "grad_norm": 2.6135025495576056, + "learning_rate": 7.377002294472114e-06, + "loss": 0.0285, + "step": 19083 + }, + { + "epoch": 7.760878405856039, + "grad_norm": 0.3973816151795237, + "learning_rate": 7.376024595061721e-06, + "loss": 0.0034, + "step": 19084 + }, + { + "epoch": 7.761285075233835, + "grad_norm": 0.04185295550188542, + "learning_rate": 7.375046922587758e-06, + "loss": 0.0005, + "step": 19085 + }, + { + "epoch": 7.761691744611631, + "grad_norm": 0.009972446651381403, + "learning_rate": 7.374069277060259e-06, + "loss": 0.0002, + "step": 19086 + }, + { + "epoch": 7.762098413989427, + "grad_norm": 0.0486615262718872, + "learning_rate": 7.3730916584892596e-06, + "loss": 0.0006, + "step": 19087 + }, + { + "epoch": 7.7625050833672224, + "grad_norm": 0.16525376006983022, + "learning_rate": 7.372114066884798e-06, + "loss": 0.0024, + "step": 19088 + }, + { + "epoch": 7.762911752745018, + "grad_norm": 6.123656770304463, + "learning_rate": 7.371136502256908e-06, + "loss": 0.0557, + "step": 19089 + }, + { + "epoch": 7.763318422122814, + "grad_norm": 2.799508036826848, + "learning_rate": 7.370158964615623e-06, + "loss": 0.0682, + "step": 19090 + }, + { + "epoch": 7.76372509150061, + "grad_norm": 0.6830125478212058, + "learning_rate": 7.36918145397098e-06, + "loss": 0.0069, + "step": 19091 + }, + { + "epoch": 7.7641317608784055, + "grad_norm": 15.51413944619874, + "learning_rate": 7.368203970333017e-06, + "loss": 0.7115, + "step": 19092 + }, + { + "epoch": 7.764538430256202, + "grad_norm": 0.8903865938112139, + "learning_rate": 7.3672265137117596e-06, + "loss": 0.0145, + "step": 19093 + }, + { + "epoch": 7.764945099633998, + "grad_norm": 6.55625525141143, + "learning_rate": 7.366249084117249e-06, + "loss": 0.1739, + "step": 19094 + }, + { + "epoch": 7.765351769011794, + "grad_norm": 0.032748128020944574, + "learning_rate": 7.365271681559519e-06, + "loss": 0.0004, + "step": 19095 + }, + { + "epoch": 7.7657584383895895, + "grad_norm": 7.853780764278634, + "learning_rate": 7.3642943060486005e-06, + "loss": 0.1492, + "step": 19096 + }, + { + "epoch": 7.766165107767385, + "grad_norm": 0.007081537622489972, + "learning_rate": 7.363316957594526e-06, + "loss": 0.0001, + "step": 19097 + }, + { + "epoch": 7.766571777145181, + "grad_norm": 0.5925340243861388, + "learning_rate": 7.362339636207332e-06, + "loss": 0.0041, + "step": 19098 + }, + { + "epoch": 7.766978446522977, + "grad_norm": 3.446750316724667, + "learning_rate": 7.361362341897048e-06, + "loss": 0.0831, + "step": 19099 + }, + { + "epoch": 7.7673851159007725, + "grad_norm": 14.52598460179287, + "learning_rate": 7.360385074673706e-06, + "loss": 0.4637, + "step": 19100 + }, + { + "epoch": 7.767791785278568, + "grad_norm": 0.04698698549374883, + "learning_rate": 7.359407834547342e-06, + "loss": 0.0007, + "step": 19101 + }, + { + "epoch": 7.768198454656364, + "grad_norm": 0.06839133864040632, + "learning_rate": 7.358430621527986e-06, + "loss": 0.0008, + "step": 19102 + }, + { + "epoch": 7.76860512403416, + "grad_norm": 1.3285318461920048, + "learning_rate": 7.357453435625665e-06, + "loss": 0.0126, + "step": 19103 + }, + { + "epoch": 7.7690117934119565, + "grad_norm": 0.2515512389937313, + "learning_rate": 7.356476276850419e-06, + "loss": 0.0034, + "step": 19104 + }, + { + "epoch": 
7.769418462789752, + "grad_norm": 25.285931850390995, + "learning_rate": 7.355499145212274e-06, + "loss": 0.0833, + "step": 19105 + }, + { + "epoch": 7.769825132167548, + "grad_norm": 0.18002766173248116, + "learning_rate": 7.354522040721258e-06, + "loss": 0.0025, + "step": 19106 + }, + { + "epoch": 7.770231801545344, + "grad_norm": 0.1695614772206968, + "learning_rate": 7.353544963387408e-06, + "loss": 0.0026, + "step": 19107 + }, + { + "epoch": 7.7706384709231395, + "grad_norm": 1.2447490600469429, + "learning_rate": 7.352567913220749e-06, + "loss": 0.021, + "step": 19108 + }, + { + "epoch": 7.771045140300935, + "grad_norm": 0.1750219895828201, + "learning_rate": 7.351590890231315e-06, + "loss": 0.0023, + "step": 19109 + }, + { + "epoch": 7.771451809678731, + "grad_norm": 9.459460752099956, + "learning_rate": 7.350613894429131e-06, + "loss": 0.2081, + "step": 19110 + }, + { + "epoch": 7.771858479056527, + "grad_norm": 1.6030201544296883, + "learning_rate": 7.349636925824231e-06, + "loss": 0.048, + "step": 19111 + }, + { + "epoch": 7.772265148434323, + "grad_norm": 0.8183243243532743, + "learning_rate": 7.3486599844266405e-06, + "loss": 0.0084, + "step": 19112 + }, + { + "epoch": 7.772671817812119, + "grad_norm": 1.7051849448696133, + "learning_rate": 7.347683070246387e-06, + "loss": 0.0156, + "step": 19113 + }, + { + "epoch": 7.773078487189915, + "grad_norm": 0.1288369847440408, + "learning_rate": 7.346706183293506e-06, + "loss": 0.0015, + "step": 19114 + }, + { + "epoch": 7.773485156567711, + "grad_norm": 4.620022248861175, + "learning_rate": 7.34572932357802e-06, + "loss": 0.0559, + "step": 19115 + }, + { + "epoch": 7.7738918259455065, + "grad_norm": 1.1331120658096603, + "learning_rate": 7.344752491109957e-06, + "loss": 0.0136, + "step": 19116 + }, + { + "epoch": 7.774298495323302, + "grad_norm": 0.012885627525709457, + "learning_rate": 7.343775685899348e-06, + "loss": 0.0002, + "step": 19117 + }, + { + "epoch": 7.774705164701098, + "grad_norm": 14.31315531334037, + "learning_rate": 7.342798907956218e-06, + "loss": 0.4354, + "step": 19118 + }, + { + "epoch": 7.775111834078894, + "grad_norm": 21.07398732423324, + "learning_rate": 7.341822157290592e-06, + "loss": 0.4002, + "step": 19119 + }, + { + "epoch": 7.77551850345669, + "grad_norm": 4.493238944042381, + "learning_rate": 7.340845433912503e-06, + "loss": 0.1424, + "step": 19120 + }, + { + "epoch": 7.775925172834485, + "grad_norm": 2.3225529551512785, + "learning_rate": 7.339868737831972e-06, + "loss": 0.0495, + "step": 19121 + }, + { + "epoch": 7.776331842212281, + "grad_norm": 0.8630482029168147, + "learning_rate": 7.338892069059029e-06, + "loss": 0.0128, + "step": 19122 + }, + { + "epoch": 7.776738511590077, + "grad_norm": 0.011847349634468762, + "learning_rate": 7.3379154276036954e-06, + "loss": 0.0002, + "step": 19123 + }, + { + "epoch": 7.777145180967873, + "grad_norm": 1.2393579106772343, + "learning_rate": 7.336938813476002e-06, + "loss": 0.0164, + "step": 19124 + }, + { + "epoch": 7.777551850345669, + "grad_norm": 0.9275175018047762, + "learning_rate": 7.335962226685972e-06, + "loss": 0.0111, + "step": 19125 + }, + { + "epoch": 7.777958519723465, + "grad_norm": 0.40613462288281216, + "learning_rate": 7.3349856672436286e-06, + "loss": 0.0076, + "step": 19126 + }, + { + "epoch": 7.778365189101261, + "grad_norm": 0.003817541472092383, + "learning_rate": 7.334009135159e-06, + "loss": 0.0001, + "step": 19127 + }, + { + "epoch": 7.778771858479057, + "grad_norm": 10.025117684265748, + "learning_rate": 7.33303263044211e-06, + 
"loss": 0.3472, + "step": 19128 + }, + { + "epoch": 7.779178527856852, + "grad_norm": 0.24488436827376872, + "learning_rate": 7.33205615310298e-06, + "loss": 0.0022, + "step": 19129 + }, + { + "epoch": 7.779585197234648, + "grad_norm": 0.059374397704547346, + "learning_rate": 7.331079703151638e-06, + "loss": 0.0008, + "step": 19130 + }, + { + "epoch": 7.779991866612444, + "grad_norm": 8.088794182440243, + "learning_rate": 7.3301032805981064e-06, + "loss": 0.2676, + "step": 19131 + }, + { + "epoch": 7.78039853599024, + "grad_norm": 3.5035238833483744, + "learning_rate": 7.329126885452407e-06, + "loss": 0.0463, + "step": 19132 + }, + { + "epoch": 7.780805205368035, + "grad_norm": 5.997835298108898, + "learning_rate": 7.328150517724566e-06, + "loss": 0.1353, + "step": 19133 + }, + { + "epoch": 7.781211874745832, + "grad_norm": 4.538510632390152, + "learning_rate": 7.327174177424604e-06, + "loss": 0.0839, + "step": 19134 + }, + { + "epoch": 7.781618544123628, + "grad_norm": 17.52699964001463, + "learning_rate": 7.326197864562545e-06, + "loss": 0.5536, + "step": 19135 + }, + { + "epoch": 7.782025213501424, + "grad_norm": 3.854145926557188, + "learning_rate": 7.325221579148408e-06, + "loss": 0.1313, + "step": 19136 + }, + { + "epoch": 7.782431882879219, + "grad_norm": 0.35171358379716805, + "learning_rate": 7.32424532119222e-06, + "loss": 0.0039, + "step": 19137 + }, + { + "epoch": 7.782838552257015, + "grad_norm": 1.0871388983838277, + "learning_rate": 7.3232690907040006e-06, + "loss": 0.0137, + "step": 19138 + }, + { + "epoch": 7.783245221634811, + "grad_norm": 2.3174791269953325, + "learning_rate": 7.3222928876937694e-06, + "loss": 0.0507, + "step": 19139 + }, + { + "epoch": 7.783651891012607, + "grad_norm": 0.06707210666851941, + "learning_rate": 7.3213167121715514e-06, + "loss": 0.0007, + "step": 19140 + }, + { + "epoch": 7.7840585603904024, + "grad_norm": 1.2663436359477345, + "learning_rate": 7.3203405641473645e-06, + "loss": 0.0202, + "step": 19141 + }, + { + "epoch": 7.784465229768198, + "grad_norm": 2.8126897436616223, + "learning_rate": 7.31936444363123e-06, + "loss": 0.0392, + "step": 19142 + }, + { + "epoch": 7.784871899145994, + "grad_norm": 0.0780288221624415, + "learning_rate": 7.318388350633168e-06, + "loss": 0.0011, + "step": 19143 + }, + { + "epoch": 7.78527856852379, + "grad_norm": 8.542018554967784, + "learning_rate": 7.317412285163201e-06, + "loss": 0.2748, + "step": 19144 + }, + { + "epoch": 7.785685237901586, + "grad_norm": 4.674858611923256, + "learning_rate": 7.316436247231344e-06, + "loss": 0.1417, + "step": 19145 + }, + { + "epoch": 7.786091907279382, + "grad_norm": 3.2427552315359383, + "learning_rate": 7.315460236847622e-06, + "loss": 0.0457, + "step": 19146 + }, + { + "epoch": 7.786498576657178, + "grad_norm": 0.1528406814744283, + "learning_rate": 7.314484254022054e-06, + "loss": 0.0015, + "step": 19147 + }, + { + "epoch": 7.786905246034974, + "grad_norm": 4.366370609118023, + "learning_rate": 7.3135082987646545e-06, + "loss": 0.0734, + "step": 19148 + }, + { + "epoch": 7.7873119154127695, + "grad_norm": 0.0077829307165206965, + "learning_rate": 7.312532371085442e-06, + "loss": 0.0001, + "step": 19149 + }, + { + "epoch": 7.787718584790565, + "grad_norm": 0.21089380206987032, + "learning_rate": 7.311556470994441e-06, + "loss": 0.0033, + "step": 19150 + }, + { + "epoch": 7.788125254168361, + "grad_norm": 0.5227792649276604, + "learning_rate": 7.310580598501665e-06, + "loss": 0.0072, + "step": 19151 + }, + { + "epoch": 7.788531923546157, + "grad_norm": 
0.2027766052465246, + "learning_rate": 7.30960475361713e-06, + "loss": 0.0027, + "step": 19152 + }, + { + "epoch": 7.7889385929239525, + "grad_norm": 0.26852845033649114, + "learning_rate": 7.3086289363508586e-06, + "loss": 0.0043, + "step": 19153 + }, + { + "epoch": 7.789345262301749, + "grad_norm": 1.3373452667432635, + "learning_rate": 7.307653146712865e-06, + "loss": 0.0237, + "step": 19154 + }, + { + "epoch": 7.789751931679545, + "grad_norm": 5.1659873660564015, + "learning_rate": 7.306677384713167e-06, + "loss": 0.1057, + "step": 19155 + }, + { + "epoch": 7.790158601057341, + "grad_norm": 3.073775153407193, + "learning_rate": 7.3057016503617865e-06, + "loss": 0.0321, + "step": 19156 + }, + { + "epoch": 7.7905652704351365, + "grad_norm": 1.744992303996105, + "learning_rate": 7.304725943668727e-06, + "loss": 0.0208, + "step": 19157 + }, + { + "epoch": 7.790971939812932, + "grad_norm": 8.936548037177797, + "learning_rate": 7.303750264644016e-06, + "loss": 0.3594, + "step": 19158 + }, + { + "epoch": 7.791378609190728, + "grad_norm": 5.948110168449964, + "learning_rate": 7.302774613297665e-06, + "loss": 0.0516, + "step": 19159 + }, + { + "epoch": 7.791785278568524, + "grad_norm": 0.08937048936440566, + "learning_rate": 7.301798989639688e-06, + "loss": 0.0012, + "step": 19160 + }, + { + "epoch": 7.7921919479463195, + "grad_norm": 0.3477773242189202, + "learning_rate": 7.300823393680103e-06, + "loss": 0.0056, + "step": 19161 + }, + { + "epoch": 7.792598617324115, + "grad_norm": 8.681611752113415, + "learning_rate": 7.299847825428927e-06, + "loss": 0.2018, + "step": 19162 + }, + { + "epoch": 7.793005286701911, + "grad_norm": 2.993286131256814, + "learning_rate": 7.2988722848961705e-06, + "loss": 0.0483, + "step": 19163 + }, + { + "epoch": 7.793411956079707, + "grad_norm": 1.9234576642545764, + "learning_rate": 7.297896772091847e-06, + "loss": 0.021, + "step": 19164 + }, + { + "epoch": 7.793818625457503, + "grad_norm": 3.7239830920687393, + "learning_rate": 7.296921287025975e-06, + "loss": 0.0718, + "step": 19165 + }, + { + "epoch": 7.794225294835299, + "grad_norm": 0.05476528425497642, + "learning_rate": 7.295945829708568e-06, + "loss": 0.0013, + "step": 19166 + }, + { + "epoch": 7.794631964213095, + "grad_norm": 0.05351953048710521, + "learning_rate": 7.294970400149635e-06, + "loss": 0.0006, + "step": 19167 + }, + { + "epoch": 7.795038633590891, + "grad_norm": 0.3185530794321289, + "learning_rate": 7.293994998359193e-06, + "loss": 0.0049, + "step": 19168 + }, + { + "epoch": 7.7954453029686865, + "grad_norm": 0.025901132296751214, + "learning_rate": 7.293019624347256e-06, + "loss": 0.0003, + "step": 19169 + }, + { + "epoch": 7.795851972346482, + "grad_norm": 6.453294096594702, + "learning_rate": 7.292044278123833e-06, + "loss": 0.1214, + "step": 19170 + }, + { + "epoch": 7.796258641724278, + "grad_norm": 1.2901811727208483, + "learning_rate": 7.2910689596989395e-06, + "loss": 0.0209, + "step": 19171 + }, + { + "epoch": 7.796665311102074, + "grad_norm": 2.6919463723690247, + "learning_rate": 7.290093669082587e-06, + "loss": 0.0291, + "step": 19172 + }, + { + "epoch": 7.79707198047987, + "grad_norm": 0.4105880276698003, + "learning_rate": 7.289118406284784e-06, + "loss": 0.0053, + "step": 19173 + }, + { + "epoch": 7.797478649857665, + "grad_norm": 0.011078350598067471, + "learning_rate": 7.288143171315548e-06, + "loss": 0.0002, + "step": 19174 + }, + { + "epoch": 7.797885319235462, + "grad_norm": 4.220671984837021, + "learning_rate": 7.287167964184887e-06, + "loss": 0.0387, + "step": 19175 + 
}, + { + "epoch": 7.798291988613258, + "grad_norm": 0.005329100705264877, + "learning_rate": 7.2861927849028125e-06, + "loss": 0.0001, + "step": 19176 + }, + { + "epoch": 7.7986986579910536, + "grad_norm": 1.8660972623660563, + "learning_rate": 7.285217633479331e-06, + "loss": 0.0256, + "step": 19177 + }, + { + "epoch": 7.799105327368849, + "grad_norm": 2.1802428217615857, + "learning_rate": 7.284242509924461e-06, + "loss": 0.0153, + "step": 19178 + }, + { + "epoch": 7.799511996746645, + "grad_norm": 6.646784850583272, + "learning_rate": 7.283267414248208e-06, + "loss": 0.0915, + "step": 19179 + }, + { + "epoch": 7.799918666124441, + "grad_norm": 1.2524613350612817, + "learning_rate": 7.28229234646058e-06, + "loss": 0.0119, + "step": 19180 + }, + { + "epoch": 7.800325335502237, + "grad_norm": 7.279565153712977, + "learning_rate": 7.2813173065715905e-06, + "loss": 0.0489, + "step": 19181 + }, + { + "epoch": 7.800732004880032, + "grad_norm": 1.5142862692943886, + "learning_rate": 7.2803422945912475e-06, + "loss": 0.019, + "step": 19182 + }, + { + "epoch": 7.801138674257828, + "grad_norm": 3.815433773871431, + "learning_rate": 7.279367310529557e-06, + "loss": 0.0513, + "step": 19183 + }, + { + "epoch": 7.801545343635624, + "grad_norm": 0.09921500300616345, + "learning_rate": 7.278392354396533e-06, + "loss": 0.0013, + "step": 19184 + }, + { + "epoch": 7.80195201301342, + "grad_norm": 1.372122783777548, + "learning_rate": 7.277417426202181e-06, + "loss": 0.0137, + "step": 19185 + }, + { + "epoch": 7.802358682391216, + "grad_norm": 2.171736760851447, + "learning_rate": 7.276442525956507e-06, + "loss": 0.0286, + "step": 19186 + }, + { + "epoch": 7.802765351769012, + "grad_norm": 0.6662362091441697, + "learning_rate": 7.275467653669523e-06, + "loss": 0.0087, + "step": 19187 + }, + { + "epoch": 7.803172021146808, + "grad_norm": 7.066995068155671, + "learning_rate": 7.274492809351237e-06, + "loss": 0.5547, + "step": 19188 + }, + { + "epoch": 7.803578690524604, + "grad_norm": 1.0511815177779267, + "learning_rate": 7.273517993011652e-06, + "loss": 0.0103, + "step": 19189 + }, + { + "epoch": 7.803985359902399, + "grad_norm": 5.694802352476035, + "learning_rate": 7.272543204660776e-06, + "loss": 0.3576, + "step": 19190 + }, + { + "epoch": 7.804392029280195, + "grad_norm": 0.1096047534995759, + "learning_rate": 7.2715684443086176e-06, + "loss": 0.0014, + "step": 19191 + }, + { + "epoch": 7.804798698657991, + "grad_norm": 4.035124851760819, + "learning_rate": 7.270593711965184e-06, + "loss": 0.0568, + "step": 19192 + }, + { + "epoch": 7.805205368035787, + "grad_norm": 2.5043719399069726, + "learning_rate": 7.269619007640476e-06, + "loss": 0.0767, + "step": 19193 + }, + { + "epoch": 7.8056120374135824, + "grad_norm": 6.932118211009661, + "learning_rate": 7.268644331344505e-06, + "loss": 0.1038, + "step": 19194 + }, + { + "epoch": 7.806018706791379, + "grad_norm": 26.718538122842844, + "learning_rate": 7.267669683087275e-06, + "loss": 0.2948, + "step": 19195 + }, + { + "epoch": 7.806425376169175, + "grad_norm": 6.261700165519335, + "learning_rate": 7.266695062878789e-06, + "loss": 0.2119, + "step": 19196 + }, + { + "epoch": 7.806832045546971, + "grad_norm": 13.070390565106647, + "learning_rate": 7.265720470729056e-06, + "loss": 0.3036, + "step": 19197 + }, + { + "epoch": 7.807238714924766, + "grad_norm": 0.05530237440729751, + "learning_rate": 7.264745906648079e-06, + "loss": 0.0007, + "step": 19198 + }, + { + "epoch": 7.807645384302562, + "grad_norm": 0.10099139444952454, + "learning_rate": 
7.263771370645861e-06, + "loss": 0.0013, + "step": 19199 + }, + { + "epoch": 7.808052053680358, + "grad_norm": 0.21786621430111008, + "learning_rate": 7.262796862732406e-06, + "loss": 0.003, + "step": 19200 + }, + { + "epoch": 7.808458723058154, + "grad_norm": 0.019214965344871895, + "learning_rate": 7.261822382917719e-06, + "loss": 0.0004, + "step": 19201 + }, + { + "epoch": 7.8088653924359495, + "grad_norm": 0.024964362576414528, + "learning_rate": 7.260847931211805e-06, + "loss": 0.0004, + "step": 19202 + }, + { + "epoch": 7.809272061813745, + "grad_norm": 1.0787679705188864, + "learning_rate": 7.259873507624662e-06, + "loss": 0.013, + "step": 19203 + }, + { + "epoch": 7.809678731191541, + "grad_norm": 5.597241200774425, + "learning_rate": 7.2588991121663e-06, + "loss": 0.1092, + "step": 19204 + }, + { + "epoch": 7.810085400569337, + "grad_norm": 6.990030543395074, + "learning_rate": 7.257924744846718e-06, + "loss": 0.1947, + "step": 19205 + }, + { + "epoch": 7.8104920699471325, + "grad_norm": 0.15598754768376108, + "learning_rate": 7.256950405675915e-06, + "loss": 0.0022, + "step": 19206 + }, + { + "epoch": 7.810898739324929, + "grad_norm": 4.254575064983666, + "learning_rate": 7.255976094663901e-06, + "loss": 0.0733, + "step": 19207 + }, + { + "epoch": 7.811305408702725, + "grad_norm": 0.024268889500719495, + "learning_rate": 7.255001811820672e-06, + "loss": 0.0002, + "step": 19208 + }, + { + "epoch": 7.811712078080521, + "grad_norm": 2.6234219920006456, + "learning_rate": 7.254027557156229e-06, + "loss": 0.048, + "step": 19209 + }, + { + "epoch": 7.8121187474583165, + "grad_norm": 1.1667716735228477, + "learning_rate": 7.253053330680577e-06, + "loss": 0.0183, + "step": 19210 + }, + { + "epoch": 7.812525416836112, + "grad_norm": 0.659074380529494, + "learning_rate": 7.252079132403715e-06, + "loss": 0.0063, + "step": 19211 + }, + { + "epoch": 7.812932086213908, + "grad_norm": 1.530889842802622, + "learning_rate": 7.251104962335645e-06, + "loss": 0.0092, + "step": 19212 + }, + { + "epoch": 7.813338755591704, + "grad_norm": 0.7635500288909817, + "learning_rate": 7.250130820486363e-06, + "loss": 0.0118, + "step": 19213 + }, + { + "epoch": 7.8137454249694995, + "grad_norm": 0.28118990235066815, + "learning_rate": 7.249156706865874e-06, + "loss": 0.0063, + "step": 19214 + }, + { + "epoch": 7.814152094347295, + "grad_norm": 0.0059000093977337265, + "learning_rate": 7.248182621484177e-06, + "loss": 0.0001, + "step": 19215 + }, + { + "epoch": 7.814558763725092, + "grad_norm": 5.981452796152844, + "learning_rate": 7.247208564351267e-06, + "loss": 0.076, + "step": 19216 + }, + { + "epoch": 7.814965433102888, + "grad_norm": 0.5445881545591437, + "learning_rate": 7.24623453547715e-06, + "loss": 0.01, + "step": 19217 + }, + { + "epoch": 7.8153721024806835, + "grad_norm": 1.023813199012044, + "learning_rate": 7.245260534871821e-06, + "loss": 0.0144, + "step": 19218 + }, + { + "epoch": 7.815778771858479, + "grad_norm": 7.406959351338623, + "learning_rate": 7.244286562545275e-06, + "loss": 0.1582, + "step": 19219 + }, + { + "epoch": 7.816185441236275, + "grad_norm": 6.09200397107635, + "learning_rate": 7.243312618507519e-06, + "loss": 0.2077, + "step": 19220 + }, + { + "epoch": 7.816592110614071, + "grad_norm": 1.0808389983946536, + "learning_rate": 7.242338702768545e-06, + "loss": 0.0091, + "step": 19221 + }, + { + "epoch": 7.8169987799918665, + "grad_norm": 13.417303948404065, + "learning_rate": 7.24136481533835e-06, + "loss": 0.4041, + "step": 19222 + }, + { + "epoch": 7.817405449369662, + 
"grad_norm": 0.12276641034043592, + "learning_rate": 7.240390956226936e-06, + "loss": 0.0016, + "step": 19223 + }, + { + "epoch": 7.817812118747458, + "grad_norm": 0.04388810641762251, + "learning_rate": 7.239417125444298e-06, + "loss": 0.0006, + "step": 19224 + }, + { + "epoch": 7.818218788125254, + "grad_norm": 0.12370358713029017, + "learning_rate": 7.2384433230004326e-06, + "loss": 0.001, + "step": 19225 + }, + { + "epoch": 7.81862545750305, + "grad_norm": 5.5721695311634445, + "learning_rate": 7.237469548905334e-06, + "loss": 0.0848, + "step": 19226 + }, + { + "epoch": 7.819032126880846, + "grad_norm": 2.740297614925823, + "learning_rate": 7.236495803169003e-06, + "loss": 0.0396, + "step": 19227 + }, + { + "epoch": 7.819438796258642, + "grad_norm": 7.690464343764918, + "learning_rate": 7.235522085801432e-06, + "loss": 0.0912, + "step": 19228 + }, + { + "epoch": 7.819845465636438, + "grad_norm": 0.08084368078243358, + "learning_rate": 7.234548396812618e-06, + "loss": 0.0012, + "step": 19229 + }, + { + "epoch": 7.8202521350142336, + "grad_norm": 0.009217235137101475, + "learning_rate": 7.233574736212558e-06, + "loss": 0.0002, + "step": 19230 + }, + { + "epoch": 7.820658804392029, + "grad_norm": 6.558833075286785, + "learning_rate": 7.232601104011245e-06, + "loss": 0.0792, + "step": 19231 + }, + { + "epoch": 7.821065473769825, + "grad_norm": 14.725782467853538, + "learning_rate": 7.231627500218674e-06, + "loss": 0.5378, + "step": 19232 + }, + { + "epoch": 7.821472143147621, + "grad_norm": 4.2493014342228586, + "learning_rate": 7.23065392484484e-06, + "loss": 0.0543, + "step": 19233 + }, + { + "epoch": 7.821878812525417, + "grad_norm": 9.874788883750162, + "learning_rate": 7.2296803778997394e-06, + "loss": 0.2065, + "step": 19234 + }, + { + "epoch": 7.822285481903212, + "grad_norm": 9.845538525604494, + "learning_rate": 7.22870685939336e-06, + "loss": 0.1738, + "step": 19235 + }, + { + "epoch": 7.822692151281009, + "grad_norm": 1.298218049810225, + "learning_rate": 7.227733369335703e-06, + "loss": 0.0114, + "step": 19236 + }, + { + "epoch": 7.823098820658805, + "grad_norm": 3.0341473204129783, + "learning_rate": 7.226759907736758e-06, + "loss": 0.1256, + "step": 19237 + }, + { + "epoch": 7.823505490036601, + "grad_norm": 10.878847851416714, + "learning_rate": 7.225786474606517e-06, + "loss": 0.3482, + "step": 19238 + }, + { + "epoch": 7.823912159414396, + "grad_norm": 7.035558792517515, + "learning_rate": 7.224813069954973e-06, + "loss": 0.145, + "step": 19239 + }, + { + "epoch": 7.824318828792192, + "grad_norm": 9.245759726789515, + "learning_rate": 7.223839693792121e-06, + "loss": 0.8135, + "step": 19240 + }, + { + "epoch": 7.824725498169988, + "grad_norm": 1.9902984146272833, + "learning_rate": 7.222866346127952e-06, + "loss": 0.0337, + "step": 19241 + }, + { + "epoch": 7.825132167547784, + "grad_norm": 2.636876717533204, + "learning_rate": 7.221893026972457e-06, + "loss": 0.0361, + "step": 19242 + }, + { + "epoch": 7.825538836925579, + "grad_norm": 0.009264914094733352, + "learning_rate": 7.2209197363356285e-06, + "loss": 0.0001, + "step": 19243 + }, + { + "epoch": 7.825945506303375, + "grad_norm": 0.010445264286232164, + "learning_rate": 7.219946474227458e-06, + "loss": 0.0002, + "step": 19244 + }, + { + "epoch": 7.826352175681171, + "grad_norm": 6.144793481196551, + "learning_rate": 7.218973240657934e-06, + "loss": 0.141, + "step": 19245 + }, + { + "epoch": 7.826758845058967, + "grad_norm": 1.4527457561962887, + "learning_rate": 7.218000035637051e-06, + "loss": 0.0171, + "step": 
19246 + }, + { + "epoch": 7.8271655144367625, + "grad_norm": 0.9564968402755709, + "learning_rate": 7.2170268591748e-06, + "loss": 0.01, + "step": 19247 + }, + { + "epoch": 7.827572183814559, + "grad_norm": 0.7591333241482348, + "learning_rate": 7.2160537112811655e-06, + "loss": 0.011, + "step": 19248 + }, + { + "epoch": 7.827978853192355, + "grad_norm": 0.11102462830590339, + "learning_rate": 7.215080591966141e-06, + "loss": 0.0022, + "step": 19249 + }, + { + "epoch": 7.828385522570151, + "grad_norm": 0.04524100905591837, + "learning_rate": 7.214107501239719e-06, + "loss": 0.0006, + "step": 19250 + }, + { + "epoch": 7.828792191947946, + "grad_norm": 2.431816574418188, + "learning_rate": 7.213134439111884e-06, + "loss": 0.0151, + "step": 19251 + }, + { + "epoch": 7.829198861325742, + "grad_norm": 1.8105210998906165, + "learning_rate": 7.212161405592625e-06, + "loss": 0.015, + "step": 19252 + }, + { + "epoch": 7.829605530703538, + "grad_norm": 0.02022364633079385, + "learning_rate": 7.211188400691933e-06, + "loss": 0.0004, + "step": 19253 + }, + { + "epoch": 7.830012200081334, + "grad_norm": 1.0915533952651286, + "learning_rate": 7.210215424419797e-06, + "loss": 0.0122, + "step": 19254 + }, + { + "epoch": 7.8304188694591295, + "grad_norm": 4.756922588134785, + "learning_rate": 7.209242476786202e-06, + "loss": 0.0776, + "step": 19255 + }, + { + "epoch": 7.830825538836925, + "grad_norm": 0.013461371929887228, + "learning_rate": 7.208269557801143e-06, + "loss": 0.0002, + "step": 19256 + }, + { + "epoch": 7.831232208214722, + "grad_norm": 1.8774796614699294, + "learning_rate": 7.207296667474596e-06, + "loss": 0.0095, + "step": 19257 + }, + { + "epoch": 7.831638877592518, + "grad_norm": 0.19119553813859555, + "learning_rate": 7.2063238058165565e-06, + "loss": 0.0021, + "step": 19258 + }, + { + "epoch": 7.832045546970313, + "grad_norm": 8.505404551069203, + "learning_rate": 7.205350972837008e-06, + "loss": 0.1972, + "step": 19259 + }, + { + "epoch": 7.832452216348109, + "grad_norm": 0.9376362998415237, + "learning_rate": 7.204378168545938e-06, + "loss": 0.0165, + "step": 19260 + }, + { + "epoch": 7.832858885725905, + "grad_norm": 0.7005743063828798, + "learning_rate": 7.2034053929533335e-06, + "loss": 0.0095, + "step": 19261 + }, + { + "epoch": 7.833265555103701, + "grad_norm": 3.2859615379069638, + "learning_rate": 7.202432646069182e-06, + "loss": 0.0598, + "step": 19262 + }, + { + "epoch": 7.8336722244814965, + "grad_norm": 0.25162758920566586, + "learning_rate": 7.2014599279034636e-06, + "loss": 0.0039, + "step": 19263 + }, + { + "epoch": 7.834078893859292, + "grad_norm": 3.989730112407714, + "learning_rate": 7.200487238466169e-06, + "loss": 0.0574, + "step": 19264 + }, + { + "epoch": 7.834485563237088, + "grad_norm": 0.6133832994913574, + "learning_rate": 7.199514577767284e-06, + "loss": 0.0095, + "step": 19265 + }, + { + "epoch": 7.834892232614884, + "grad_norm": 0.0012619662827221489, + "learning_rate": 7.19854194581679e-06, + "loss": 0.0, + "step": 19266 + }, + { + "epoch": 7.8352989019926795, + "grad_norm": 2.27786524823553, + "learning_rate": 7.197569342624672e-06, + "loss": 0.0397, + "step": 19267 + }, + { + "epoch": 7.835705571370476, + "grad_norm": 0.037061461066237436, + "learning_rate": 7.196596768200916e-06, + "loss": 0.0004, + "step": 19268 + }, + { + "epoch": 7.836112240748272, + "grad_norm": 1.348061793405617, + "learning_rate": 7.195624222555506e-06, + "loss": 0.0133, + "step": 19269 + }, + { + "epoch": 7.836518910126068, + "grad_norm": 0.08104361478609237, + "learning_rate": 
7.194651705698421e-06, + "loss": 0.0012, + "step": 19270 + }, + { + "epoch": 7.8369255795038635, + "grad_norm": 0.06737164922375942, + "learning_rate": 7.193679217639652e-06, + "loss": 0.0013, + "step": 19271 + }, + { + "epoch": 7.837332248881659, + "grad_norm": 2.753593606990518, + "learning_rate": 7.192706758389178e-06, + "loss": 0.0344, + "step": 19272 + }, + { + "epoch": 7.837738918259455, + "grad_norm": 4.068089872270984, + "learning_rate": 7.1917343279569785e-06, + "loss": 0.0266, + "step": 19273 + }, + { + "epoch": 7.838145587637251, + "grad_norm": 4.053998116147323, + "learning_rate": 7.1907619263530425e-06, + "loss": 0.1031, + "step": 19274 + }, + { + "epoch": 7.8385522570150465, + "grad_norm": 7.217114316097633, + "learning_rate": 7.1897895535873495e-06, + "loss": 0.1292, + "step": 19275 + }, + { + "epoch": 7.838958926392842, + "grad_norm": 4.54740472047845, + "learning_rate": 7.188817209669879e-06, + "loss": 0.0427, + "step": 19276 + }, + { + "epoch": 7.839365595770639, + "grad_norm": 1.2283791866559646, + "learning_rate": 7.187844894610616e-06, + "loss": 0.014, + "step": 19277 + }, + { + "epoch": 7.839772265148435, + "grad_norm": 1.1071082213481134, + "learning_rate": 7.186872608419542e-06, + "loss": 0.0202, + "step": 19278 + }, + { + "epoch": 7.8401789345262305, + "grad_norm": 0.07915486610360045, + "learning_rate": 7.185900351106636e-06, + "loss": 0.0016, + "step": 19279 + }, + { + "epoch": 7.840585603904026, + "grad_norm": 5.11408363877545, + "learning_rate": 7.184928122681876e-06, + "loss": 0.0617, + "step": 19280 + }, + { + "epoch": 7.840992273281822, + "grad_norm": 5.6976535890283495, + "learning_rate": 7.183955923155248e-06, + "loss": 0.1671, + "step": 19281 + }, + { + "epoch": 7.841398942659618, + "grad_norm": 2.840311512996751, + "learning_rate": 7.182983752536731e-06, + "loss": 0.0685, + "step": 19282 + }, + { + "epoch": 7.8418056120374136, + "grad_norm": 0.020146907986842738, + "learning_rate": 7.182011610836301e-06, + "loss": 0.0002, + "step": 19283 + }, + { + "epoch": 7.842212281415209, + "grad_norm": 8.216124099026612, + "learning_rate": 7.181039498063941e-06, + "loss": 0.1104, + "step": 19284 + }, + { + "epoch": 7.842618950793005, + "grad_norm": 0.20154654636125807, + "learning_rate": 7.1800674142296305e-06, + "loss": 0.0024, + "step": 19285 + }, + { + "epoch": 7.843025620170801, + "grad_norm": 5.322962725771958, + "learning_rate": 7.179095359343344e-06, + "loss": 0.0756, + "step": 19286 + }, + { + "epoch": 7.843432289548597, + "grad_norm": 1.1892861699463038, + "learning_rate": 7.178123333415065e-06, + "loss": 0.0183, + "step": 19287 + }, + { + "epoch": 7.843838958926392, + "grad_norm": 0.44483040557586856, + "learning_rate": 7.177151336454773e-06, + "loss": 0.0049, + "step": 19288 + }, + { + "epoch": 7.844245628304189, + "grad_norm": 0.08382578348669613, + "learning_rate": 7.176179368472439e-06, + "loss": 0.0011, + "step": 19289 + }, + { + "epoch": 7.844652297681985, + "grad_norm": 0.43063569690141507, + "learning_rate": 7.1752074294780485e-06, + "loss": 0.0046, + "step": 19290 + }, + { + "epoch": 7.845058967059781, + "grad_norm": 2.612982577667587, + "learning_rate": 7.1742355194815734e-06, + "loss": 0.0458, + "step": 19291 + }, + { + "epoch": 7.845465636437576, + "grad_norm": 1.0370174394038074, + "learning_rate": 7.173263638492994e-06, + "loss": 0.0126, + "step": 19292 + }, + { + "epoch": 7.845872305815372, + "grad_norm": 3.218662277820672, + "learning_rate": 7.172291786522284e-06, + "loss": 0.054, + "step": 19293 + }, + { + "epoch": 7.846278975193168, + 
"grad_norm": 0.22764415021654474, + "learning_rate": 7.171319963579424e-06, + "loss": 0.0027, + "step": 19294 + }, + { + "epoch": 7.846685644570964, + "grad_norm": 0.003945348954968312, + "learning_rate": 7.1703481696743885e-06, + "loss": 0.0, + "step": 19295 + }, + { + "epoch": 7.847092313948759, + "grad_norm": 8.987593434470536, + "learning_rate": 7.1693764048171495e-06, + "loss": 0.3986, + "step": 19296 + }, + { + "epoch": 7.847498983326555, + "grad_norm": 0.2982815158666298, + "learning_rate": 7.16840466901769e-06, + "loss": 0.0024, + "step": 19297 + }, + { + "epoch": 7.847905652704352, + "grad_norm": 15.271775307172852, + "learning_rate": 7.167432962285982e-06, + "loss": 0.7637, + "step": 19298 + }, + { + "epoch": 7.848312322082148, + "grad_norm": 4.144476010872847, + "learning_rate": 7.166461284631998e-06, + "loss": 0.0613, + "step": 19299 + }, + { + "epoch": 7.848718991459943, + "grad_norm": 0.06735251490691284, + "learning_rate": 7.1654896360657166e-06, + "loss": 0.0011, + "step": 19300 + }, + { + "epoch": 7.849125660837739, + "grad_norm": 2.8715248323361777, + "learning_rate": 7.164518016597111e-06, + "loss": 0.0696, + "step": 19301 + }, + { + "epoch": 7.849532330215535, + "grad_norm": 0.15921561117446503, + "learning_rate": 7.163546426236154e-06, + "loss": 0.0023, + "step": 19302 + }, + { + "epoch": 7.849938999593331, + "grad_norm": 0.02619186883211635, + "learning_rate": 7.162574864992819e-06, + "loss": 0.0003, + "step": 19303 + }, + { + "epoch": 7.850345668971126, + "grad_norm": 2.6534625881973795, + "learning_rate": 7.161603332877083e-06, + "loss": 0.0504, + "step": 19304 + }, + { + "epoch": 7.850752338348922, + "grad_norm": 2.1427364651093153, + "learning_rate": 7.160631829898918e-06, + "loss": 0.0301, + "step": 19305 + }, + { + "epoch": 7.851159007726718, + "grad_norm": 0.18774227792882808, + "learning_rate": 7.159660356068293e-06, + "loss": 0.0026, + "step": 19306 + }, + { + "epoch": 7.851565677104514, + "grad_norm": 4.7419990662368114, + "learning_rate": 7.158688911395187e-06, + "loss": 0.0623, + "step": 19307 + }, + { + "epoch": 7.8519723464823095, + "grad_norm": 6.589657655877, + "learning_rate": 7.157717495889569e-06, + "loss": 0.1678, + "step": 19308 + }, + { + "epoch": 7.852379015860106, + "grad_norm": 2.8460459184429707, + "learning_rate": 7.15674610956141e-06, + "loss": 0.0499, + "step": 19309 + }, + { + "epoch": 7.852785685237902, + "grad_norm": 2.535969648338119, + "learning_rate": 7.155774752420684e-06, + "loss": 0.036, + "step": 19310 + }, + { + "epoch": 7.853192354615698, + "grad_norm": 0.9015318927775049, + "learning_rate": 7.1548034244773634e-06, + "loss": 0.007, + "step": 19311 + }, + { + "epoch": 7.853599023993493, + "grad_norm": 0.41894601049203906, + "learning_rate": 7.153832125741413e-06, + "loss": 0.0034, + "step": 19312 + }, + { + "epoch": 7.854005693371289, + "grad_norm": 0.9609055408391007, + "learning_rate": 7.152860856222812e-06, + "loss": 0.0127, + "step": 19313 + }, + { + "epoch": 7.854412362749085, + "grad_norm": 4.402319776068654, + "learning_rate": 7.151889615931527e-06, + "loss": 0.068, + "step": 19314 + }, + { + "epoch": 7.854819032126881, + "grad_norm": 2.4310595450050516, + "learning_rate": 7.150918404877529e-06, + "loss": 0.0282, + "step": 19315 + }, + { + "epoch": 7.8552257015046765, + "grad_norm": 0.6296250505178566, + "learning_rate": 7.1499472230707855e-06, + "loss": 0.007, + "step": 19316 + }, + { + "epoch": 7.855632370882472, + "grad_norm": 0.1515162804742897, + "learning_rate": 7.1489760705212695e-06, + "loss": 0.0018, + "step": 
19317 + }, + { + "epoch": 7.856039040260269, + "grad_norm": 0.20090583082793745, + "learning_rate": 7.148004947238949e-06, + "loss": 0.0022, + "step": 19318 + }, + { + "epoch": 7.856445709638065, + "grad_norm": 0.020947248270371924, + "learning_rate": 7.1470338532337915e-06, + "loss": 0.0003, + "step": 19319 + }, + { + "epoch": 7.85685237901586, + "grad_norm": 18.823551404419494, + "learning_rate": 7.146062788515769e-06, + "loss": 0.8424, + "step": 19320 + }, + { + "epoch": 7.857259048393656, + "grad_norm": 17.224375220464555, + "learning_rate": 7.145091753094849e-06, + "loss": 0.1219, + "step": 19321 + }, + { + "epoch": 7.857665717771452, + "grad_norm": 2.6181270355038038, + "learning_rate": 7.144120746980996e-06, + "loss": 0.055, + "step": 19322 + }, + { + "epoch": 7.858072387149248, + "grad_norm": 5.261363559398136, + "learning_rate": 7.143149770184183e-06, + "loss": 0.2011, + "step": 19323 + }, + { + "epoch": 7.8584790565270435, + "grad_norm": 0.9461524293018989, + "learning_rate": 7.142178822714376e-06, + "loss": 0.0139, + "step": 19324 + }, + { + "epoch": 7.858885725904839, + "grad_norm": 0.064028494638735, + "learning_rate": 7.14120790458154e-06, + "loss": 0.0013, + "step": 19325 + }, + { + "epoch": 7.859292395282635, + "grad_norm": 6.02063086481408, + "learning_rate": 7.140237015795644e-06, + "loss": 0.0824, + "step": 19326 + }, + { + "epoch": 7.859699064660431, + "grad_norm": 4.692540083213551, + "learning_rate": 7.139266156366657e-06, + "loss": 0.1093, + "step": 19327 + }, + { + "epoch": 7.8601057340382265, + "grad_norm": 0.2664200460322326, + "learning_rate": 7.138295326304541e-06, + "loss": 0.0034, + "step": 19328 + }, + { + "epoch": 7.860512403416022, + "grad_norm": 3.993243730769052, + "learning_rate": 7.137324525619262e-06, + "loss": 0.053, + "step": 19329 + }, + { + "epoch": 7.860919072793819, + "grad_norm": 0.05055913974377945, + "learning_rate": 7.136353754320788e-06, + "loss": 0.0007, + "step": 19330 + }, + { + "epoch": 7.861325742171615, + "grad_norm": 1.1222014145753254, + "learning_rate": 7.135383012419086e-06, + "loss": 0.0176, + "step": 19331 + }, + { + "epoch": 7.8617324115494105, + "grad_norm": 13.59035128275926, + "learning_rate": 7.134412299924116e-06, + "loss": 0.1185, + "step": 19332 + }, + { + "epoch": 7.862139080927206, + "grad_norm": 7.3594868964295905, + "learning_rate": 7.133441616845848e-06, + "loss": 0.3562, + "step": 19333 + }, + { + "epoch": 7.862545750305002, + "grad_norm": 0.17108315562443435, + "learning_rate": 7.132470963194247e-06, + "loss": 0.0022, + "step": 19334 + }, + { + "epoch": 7.862952419682798, + "grad_norm": 3.0403127102916, + "learning_rate": 7.13150033897927e-06, + "loss": 0.129, + "step": 19335 + }, + { + "epoch": 7.8633590890605936, + "grad_norm": 4.268439853618551, + "learning_rate": 7.1305297442108876e-06, + "loss": 0.0514, + "step": 19336 + }, + { + "epoch": 7.863765758438389, + "grad_norm": 0.03117643554137114, + "learning_rate": 7.129559178899064e-06, + "loss": 0.0004, + "step": 19337 + }, + { + "epoch": 7.864172427816185, + "grad_norm": 0.5464804225480959, + "learning_rate": 7.1285886430537555e-06, + "loss": 0.0055, + "step": 19338 + }, + { + "epoch": 7.864579097193982, + "grad_norm": 4.127893294974246, + "learning_rate": 7.127618136684933e-06, + "loss": 0.0682, + "step": 19339 + }, + { + "epoch": 7.8649857665717775, + "grad_norm": 0.0530864686645884, + "learning_rate": 7.126647659802555e-06, + "loss": 0.0005, + "step": 19340 + }, + { + "epoch": 7.865392435949573, + "grad_norm": 0.08908248938500651, + "learning_rate": 
7.125677212416586e-06, + "loss": 0.001, + "step": 19341 + }, + { + "epoch": 7.865799105327369, + "grad_norm": 7.01520138908077, + "learning_rate": 7.124706794536984e-06, + "loss": 0.1645, + "step": 19342 + }, + { + "epoch": 7.866205774705165, + "grad_norm": 0.2790250828118168, + "learning_rate": 7.1237364061737165e-06, + "loss": 0.005, + "step": 19343 + }, + { + "epoch": 7.866612444082961, + "grad_norm": 0.4256530457877697, + "learning_rate": 7.122766047336742e-06, + "loss": 0.0053, + "step": 19344 + }, + { + "epoch": 7.867019113460756, + "grad_norm": 8.54904508097515, + "learning_rate": 7.121795718036021e-06, + "loss": 0.2072, + "step": 19345 + }, + { + "epoch": 7.867425782838552, + "grad_norm": 3.510782146972541, + "learning_rate": 7.120825418281516e-06, + "loss": 0.0734, + "step": 19346 + }, + { + "epoch": 7.867832452216348, + "grad_norm": 0.38536048330089534, + "learning_rate": 7.119855148083189e-06, + "loss": 0.0064, + "step": 19347 + }, + { + "epoch": 7.868239121594144, + "grad_norm": 6.976078359725875, + "learning_rate": 7.118884907450996e-06, + "loss": 0.0561, + "step": 19348 + }, + { + "epoch": 7.868645790971939, + "grad_norm": 9.66366426058663, + "learning_rate": 7.117914696394901e-06, + "loss": 0.3634, + "step": 19349 + }, + { + "epoch": 7.869052460349736, + "grad_norm": 0.009515906925119763, + "learning_rate": 7.116944514924862e-06, + "loss": 0.0001, + "step": 19350 + }, + { + "epoch": 7.869459129727532, + "grad_norm": 0.7151986768824953, + "learning_rate": 7.115974363050837e-06, + "loss": 0.0058, + "step": 19351 + }, + { + "epoch": 7.869865799105328, + "grad_norm": 7.578985572402199, + "learning_rate": 7.115004240782788e-06, + "loss": 0.2226, + "step": 19352 + }, + { + "epoch": 7.870272468483123, + "grad_norm": 0.32471153819256554, + "learning_rate": 7.114034148130673e-06, + "loss": 0.005, + "step": 19353 + }, + { + "epoch": 7.870679137860919, + "grad_norm": 10.055569067679786, + "learning_rate": 7.11306408510445e-06, + "loss": 0.3919, + "step": 19354 + }, + { + "epoch": 7.871085807238715, + "grad_norm": 4.3045565511470585, + "learning_rate": 7.112094051714078e-06, + "loss": 0.1668, + "step": 19355 + }, + { + "epoch": 7.871492476616511, + "grad_norm": 2.481997384182742, + "learning_rate": 7.111124047969513e-06, + "loss": 0.0337, + "step": 19356 + }, + { + "epoch": 7.871899145994306, + "grad_norm": 0.07716274603289129, + "learning_rate": 7.1101540738807125e-06, + "loss": 0.0013, + "step": 19357 + }, + { + "epoch": 7.872305815372102, + "grad_norm": 4.09654021020004, + "learning_rate": 7.109184129457636e-06, + "loss": 0.0734, + "step": 19358 + }, + { + "epoch": 7.872712484749899, + "grad_norm": 8.504696946966293, + "learning_rate": 7.108214214710238e-06, + "loss": 0.3966, + "step": 19359 + }, + { + "epoch": 7.873119154127695, + "grad_norm": 5.232591228122414, + "learning_rate": 7.107244329648477e-06, + "loss": 0.0555, + "step": 19360 + }, + { + "epoch": 7.87352582350549, + "grad_norm": 1.5017264781249096, + "learning_rate": 7.106274474282309e-06, + "loss": 0.019, + "step": 19361 + }, + { + "epoch": 7.873932492883286, + "grad_norm": 13.484504432764272, + "learning_rate": 7.10530464862169e-06, + "loss": 0.342, + "step": 19362 + }, + { + "epoch": 7.874339162261082, + "grad_norm": 0.002038427439485387, + "learning_rate": 7.104334852676576e-06, + "loss": 0.0, + "step": 19363 + }, + { + "epoch": 7.874745831638878, + "grad_norm": 0.2839526000243701, + "learning_rate": 7.103365086456921e-06, + "loss": 0.0043, + "step": 19364 + }, + { + "epoch": 7.875152501016673, + "grad_norm": 
4.274248685812144, + "learning_rate": 7.102395349972682e-06, + "loss": 0.0204, + "step": 19365 + }, + { + "epoch": 7.875559170394469, + "grad_norm": 0.8441514631638297, + "learning_rate": 7.101425643233813e-06, + "loss": 0.0109, + "step": 19366 + }, + { + "epoch": 7.875965839772265, + "grad_norm": 3.205412362247677, + "learning_rate": 7.100455966250268e-06, + "loss": 0.0801, + "step": 19367 + }, + { + "epoch": 7.876372509150061, + "grad_norm": 0.2773472410341408, + "learning_rate": 7.099486319032003e-06, + "loss": 0.0021, + "step": 19368 + }, + { + "epoch": 7.8767791785278565, + "grad_norm": 0.08893455803588055, + "learning_rate": 7.098516701588971e-06, + "loss": 0.0013, + "step": 19369 + }, + { + "epoch": 7.877185847905652, + "grad_norm": 0.4089764294009347, + "learning_rate": 7.0975471139311225e-06, + "loss": 0.0043, + "step": 19370 + }, + { + "epoch": 7.877592517283449, + "grad_norm": 0.006794972179617637, + "learning_rate": 7.096577556068416e-06, + "loss": 0.0001, + "step": 19371 + }, + { + "epoch": 7.877999186661245, + "grad_norm": 0.525811796338685, + "learning_rate": 7.095608028010803e-06, + "loss": 0.007, + "step": 19372 + }, + { + "epoch": 7.87840585603904, + "grad_norm": 0.3051682335754688, + "learning_rate": 7.094638529768232e-06, + "loss": 0.0045, + "step": 19373 + }, + { + "epoch": 7.878812525416836, + "grad_norm": 0.009374586902581245, + "learning_rate": 7.093669061350661e-06, + "loss": 0.0001, + "step": 19374 + }, + { + "epoch": 7.879219194794632, + "grad_norm": 3.538754046337003, + "learning_rate": 7.09269962276804e-06, + "loss": 0.0829, + "step": 19375 + }, + { + "epoch": 7.879625864172428, + "grad_norm": 1.126909401407387, + "learning_rate": 7.091730214030319e-06, + "loss": 0.0109, + "step": 19376 + }, + { + "epoch": 7.8800325335502235, + "grad_norm": 6.358281886327807, + "learning_rate": 7.0907608351474535e-06, + "loss": 0.0679, + "step": 19377 + }, + { + "epoch": 7.880439202928019, + "grad_norm": 2.4805814619813744, + "learning_rate": 7.089791486129391e-06, + "loss": 0.0313, + "step": 19378 + }, + { + "epoch": 7.880845872305815, + "grad_norm": 6.7106643563995325, + "learning_rate": 7.088822166986083e-06, + "loss": 0.1091, + "step": 19379 + }, + { + "epoch": 7.881252541683612, + "grad_norm": 4.821986419007903, + "learning_rate": 7.0878528777274814e-06, + "loss": 0.0393, + "step": 19380 + }, + { + "epoch": 7.881659211061407, + "grad_norm": 0.10145370234400787, + "learning_rate": 7.086883618363536e-06, + "loss": 0.0014, + "step": 19381 + }, + { + "epoch": 7.882065880439203, + "grad_norm": 10.025177933119865, + "learning_rate": 7.085914388904196e-06, + "loss": 0.3806, + "step": 19382 + }, + { + "epoch": 7.882472549816999, + "grad_norm": 0.06405626688709151, + "learning_rate": 7.0849451893594105e-06, + "loss": 0.0009, + "step": 19383 + }, + { + "epoch": 7.882879219194795, + "grad_norm": 1.6317724276756298, + "learning_rate": 7.083976019739131e-06, + "loss": 0.0221, + "step": 19384 + }, + { + "epoch": 7.8832858885725905, + "grad_norm": 1.6074783405372692, + "learning_rate": 7.083006880053305e-06, + "loss": 0.0167, + "step": 19385 + }, + { + "epoch": 7.883692557950386, + "grad_norm": 7.450497039993376, + "learning_rate": 7.082037770311881e-06, + "loss": 0.2028, + "step": 19386 + }, + { + "epoch": 7.884099227328182, + "grad_norm": 0.37161637981529927, + "learning_rate": 7.081068690524807e-06, + "loss": 0.0058, + "step": 19387 + }, + { + "epoch": 7.884505896705978, + "grad_norm": 0.32743989991512595, + "learning_rate": 7.080099640702034e-06, + "loss": 0.0031, + "step": 19388 + 
}, + { + "epoch": 7.8849125660837736, + "grad_norm": 1.0270639529090442, + "learning_rate": 7.0791306208535045e-06, + "loss": 0.0116, + "step": 19389 + }, + { + "epoch": 7.885319235461569, + "grad_norm": 0.0419852515805038, + "learning_rate": 7.078161630989173e-06, + "loss": 0.0006, + "step": 19390 + }, + { + "epoch": 7.885725904839366, + "grad_norm": 0.037438504418958705, + "learning_rate": 7.077192671118982e-06, + "loss": 0.0005, + "step": 19391 + }, + { + "epoch": 7.886132574217162, + "grad_norm": 1.1663441353997464, + "learning_rate": 7.076223741252876e-06, + "loss": 0.0136, + "step": 19392 + }, + { + "epoch": 7.8865392435949575, + "grad_norm": 1.441645029003112, + "learning_rate": 7.075254841400809e-06, + "loss": 0.0263, + "step": 19393 + }, + { + "epoch": 7.886945912972753, + "grad_norm": 0.6852077849665008, + "learning_rate": 7.0742859715727215e-06, + "loss": 0.0069, + "step": 19394 + }, + { + "epoch": 7.887352582350549, + "grad_norm": 0.15691038764767348, + "learning_rate": 7.073317131778562e-06, + "loss": 0.0017, + "step": 19395 + }, + { + "epoch": 7.887759251728345, + "grad_norm": 2.3251925987722877, + "learning_rate": 7.072348322028273e-06, + "loss": 0.0372, + "step": 19396 + }, + { + "epoch": 7.888165921106141, + "grad_norm": 0.0874989314591512, + "learning_rate": 7.0713795423318034e-06, + "loss": 0.0014, + "step": 19397 + }, + { + "epoch": 7.888572590483936, + "grad_norm": 2.204293483635698, + "learning_rate": 7.070410792699097e-06, + "loss": 0.0356, + "step": 19398 + }, + { + "epoch": 7.888979259861732, + "grad_norm": 12.216805808279794, + "learning_rate": 7.069442073140096e-06, + "loss": 0.3669, + "step": 19399 + }, + { + "epoch": 7.889385929239529, + "grad_norm": 0.0429076831019333, + "learning_rate": 7.068473383664749e-06, + "loss": 0.0006, + "step": 19400 + }, + { + "epoch": 7.8897925986173245, + "grad_norm": 0.03617158546019665, + "learning_rate": 7.067504724282998e-06, + "loss": 0.0004, + "step": 19401 + }, + { + "epoch": 7.89019926799512, + "grad_norm": 5.531780422230996, + "learning_rate": 7.066536095004786e-06, + "loss": 0.0652, + "step": 19402 + }, + { + "epoch": 7.890605937372916, + "grad_norm": 1.01986092231471, + "learning_rate": 7.065567495840058e-06, + "loss": 0.0202, + "step": 19403 + }, + { + "epoch": 7.891012606750712, + "grad_norm": 0.10097031876383737, + "learning_rate": 7.064598926798758e-06, + "loss": 0.0016, + "step": 19404 + }, + { + "epoch": 7.891419276128508, + "grad_norm": 0.03600776842671229, + "learning_rate": 7.063630387890824e-06, + "loss": 0.0004, + "step": 19405 + }, + { + "epoch": 7.891825945506303, + "grad_norm": 0.1147664066694413, + "learning_rate": 7.062661879126205e-06, + "loss": 0.0015, + "step": 19406 + }, + { + "epoch": 7.892232614884099, + "grad_norm": 5.3038913125088945, + "learning_rate": 7.06169340051484e-06, + "loss": 0.0576, + "step": 19407 + }, + { + "epoch": 7.892639284261895, + "grad_norm": 0.10975725778033357, + "learning_rate": 7.06072495206667e-06, + "loss": 0.0018, + "step": 19408 + }, + { + "epoch": 7.893045953639691, + "grad_norm": 7.031200065965304, + "learning_rate": 7.0597565337916365e-06, + "loss": 0.1957, + "step": 19409 + }, + { + "epoch": 7.893452623017486, + "grad_norm": 1.5069931066743194, + "learning_rate": 7.058788145699683e-06, + "loss": 0.0192, + "step": 19410 + }, + { + "epoch": 7.893859292395282, + "grad_norm": 0.004959983855524194, + "learning_rate": 7.0578197878007505e-06, + "loss": 0.0001, + "step": 19411 + }, + { + "epoch": 7.894265961773079, + "grad_norm": 0.15028054941203342, + "learning_rate": 
7.056851460104777e-06, + "loss": 0.0017, + "step": 19412 + }, + { + "epoch": 7.894672631150875, + "grad_norm": 1.8559050672371353, + "learning_rate": 7.055883162621706e-06, + "loss": 0.0305, + "step": 19413 + }, + { + "epoch": 7.89507930052867, + "grad_norm": 0.7085517677128457, + "learning_rate": 7.054914895361475e-06, + "loss": 0.01, + "step": 19414 + }, + { + "epoch": 7.895485969906466, + "grad_norm": 5.381577620625844, + "learning_rate": 7.053946658334024e-06, + "loss": 0.1688, + "step": 19415 + }, + { + "epoch": 7.895892639284262, + "grad_norm": 7.191478587205838, + "learning_rate": 7.052978451549295e-06, + "loss": 0.2061, + "step": 19416 + }, + { + "epoch": 7.896299308662058, + "grad_norm": 0.03426138797967365, + "learning_rate": 7.052010275017226e-06, + "loss": 0.0005, + "step": 19417 + }, + { + "epoch": 7.896705978039853, + "grad_norm": 3.9219788787152994, + "learning_rate": 7.051042128747753e-06, + "loss": 0.0405, + "step": 19418 + }, + { + "epoch": 7.897112647417649, + "grad_norm": 1.191881790451006, + "learning_rate": 7.050074012750818e-06, + "loss": 0.0256, + "step": 19419 + }, + { + "epoch": 7.897519316795445, + "grad_norm": 4.233227092636877, + "learning_rate": 7.049105927036358e-06, + "loss": 0.0533, + "step": 19420 + }, + { + "epoch": 7.897925986173242, + "grad_norm": 0.0681844256006793, + "learning_rate": 7.048137871614311e-06, + "loss": 0.001, + "step": 19421 + }, + { + "epoch": 7.898332655551037, + "grad_norm": 0.36064969080289966, + "learning_rate": 7.047169846494612e-06, + "loss": 0.0058, + "step": 19422 + }, + { + "epoch": 7.898739324928833, + "grad_norm": 2.8754193225757985, + "learning_rate": 7.046201851687204e-06, + "loss": 0.039, + "step": 19423 + }, + { + "epoch": 7.899145994306629, + "grad_norm": 9.669932338252037, + "learning_rate": 7.04523388720202e-06, + "loss": 0.0755, + "step": 19424 + }, + { + "epoch": 7.899552663684425, + "grad_norm": 0.5167899064843989, + "learning_rate": 7.044265953048994e-06, + "loss": 0.0075, + "step": 19425 + }, + { + "epoch": 7.89995933306222, + "grad_norm": 1.9759981406386733, + "learning_rate": 7.04329804923807e-06, + "loss": 0.0243, + "step": 19426 + }, + { + "epoch": 7.900366002440016, + "grad_norm": 0.030889537226134297, + "learning_rate": 7.042330175779177e-06, + "loss": 0.0004, + "step": 19427 + }, + { + "epoch": 7.900772671817812, + "grad_norm": 5.613983619710701, + "learning_rate": 7.041362332682253e-06, + "loss": 0.0433, + "step": 19428 + }, + { + "epoch": 7.901179341195608, + "grad_norm": 11.943431840615943, + "learning_rate": 7.040394519957237e-06, + "loss": 0.2326, + "step": 19429 + }, + { + "epoch": 7.9015860105734035, + "grad_norm": 0.014625048971919827, + "learning_rate": 7.039426737614058e-06, + "loss": 0.0003, + "step": 19430 + }, + { + "epoch": 7.901992679951199, + "grad_norm": 2.5256707099027222, + "learning_rate": 7.038458985662655e-06, + "loss": 0.0315, + "step": 19431 + }, + { + "epoch": 7.902399349328996, + "grad_norm": 0.9388419331906559, + "learning_rate": 7.037491264112957e-06, + "loss": 0.0176, + "step": 19432 + }, + { + "epoch": 7.902806018706792, + "grad_norm": 0.49791469362639773, + "learning_rate": 7.036523572974907e-06, + "loss": 0.0058, + "step": 19433 + }, + { + "epoch": 7.903212688084587, + "grad_norm": 6.7243964364847555, + "learning_rate": 7.0355559122584325e-06, + "loss": 0.1042, + "step": 19434 + }, + { + "epoch": 7.903619357462383, + "grad_norm": 1.5322065290541815, + "learning_rate": 7.034588281973467e-06, + "loss": 0.0227, + "step": 19435 + }, + { + "epoch": 7.904026026840179, + 
"grad_norm": 4.089047270021475, + "learning_rate": 7.033620682129947e-06, + "loss": 0.0734, + "step": 19436 + }, + { + "epoch": 7.904432696217975, + "grad_norm": 6.518802141500398, + "learning_rate": 7.032653112737804e-06, + "loss": 0.117, + "step": 19437 + }, + { + "epoch": 7.9048393655957705, + "grad_norm": 1.7814990407294224, + "learning_rate": 7.0316855738069675e-06, + "loss": 0.0195, + "step": 19438 + }, + { + "epoch": 7.905246034973566, + "grad_norm": 0.3780391877403069, + "learning_rate": 7.030718065347375e-06, + "loss": 0.0039, + "step": 19439 + }, + { + "epoch": 7.905652704351362, + "grad_norm": 0.30677092546979445, + "learning_rate": 7.029750587368956e-06, + "loss": 0.0056, + "step": 19440 + }, + { + "epoch": 7.906059373729159, + "grad_norm": 0.029231308384891748, + "learning_rate": 7.0287831398816405e-06, + "loss": 0.0004, + "step": 19441 + }, + { + "epoch": 7.9064660431069544, + "grad_norm": 4.444488208453044, + "learning_rate": 7.027815722895364e-06, + "loss": 0.0736, + "step": 19442 + }, + { + "epoch": 7.90687271248475, + "grad_norm": 0.31762176986096563, + "learning_rate": 7.026848336420053e-06, + "loss": 0.0032, + "step": 19443 + }, + { + "epoch": 7.907279381862546, + "grad_norm": 8.0565962902352, + "learning_rate": 7.025880980465641e-06, + "loss": 0.1489, + "step": 19444 + }, + { + "epoch": 7.907686051240342, + "grad_norm": 2.284502440151845, + "learning_rate": 7.024913655042056e-06, + "loss": 0.0277, + "step": 19445 + }, + { + "epoch": 7.9080927206181375, + "grad_norm": 11.784523104237028, + "learning_rate": 7.0239463601592314e-06, + "loss": 0.2955, + "step": 19446 + }, + { + "epoch": 7.908499389995933, + "grad_norm": 7.680014192718549, + "learning_rate": 7.022979095827096e-06, + "loss": 0.1507, + "step": 19447 + }, + { + "epoch": 7.908906059373729, + "grad_norm": 7.977004988873345, + "learning_rate": 7.022011862055575e-06, + "loss": 0.1767, + "step": 19448 + }, + { + "epoch": 7.909312728751525, + "grad_norm": 0.0025828981406701675, + "learning_rate": 7.021044658854603e-06, + "loss": 0.0, + "step": 19449 + }, + { + "epoch": 7.909719398129321, + "grad_norm": 1.0274911988169586, + "learning_rate": 7.0200774862341095e-06, + "loss": 0.019, + "step": 19450 + }, + { + "epoch": 7.910126067507116, + "grad_norm": 0.8912732603063479, + "learning_rate": 7.019110344204015e-06, + "loss": 0.01, + "step": 19451 + }, + { + "epoch": 7.910532736884912, + "grad_norm": 0.03776383864478661, + "learning_rate": 7.0181432327742575e-06, + "loss": 0.0007, + "step": 19452 + }, + { + "epoch": 7.910939406262709, + "grad_norm": 0.21485427700608478, + "learning_rate": 7.0171761519547585e-06, + "loss": 0.0027, + "step": 19453 + }, + { + "epoch": 7.9113460756405045, + "grad_norm": 0.026401122925914663, + "learning_rate": 7.016209101755446e-06, + "loss": 0.0004, + "step": 19454 + }, + { + "epoch": 7.9117527450183, + "grad_norm": 0.021299934343426787, + "learning_rate": 7.015242082186257e-06, + "loss": 0.0003, + "step": 19455 + }, + { + "epoch": 7.912159414396096, + "grad_norm": 0.05373328123518574, + "learning_rate": 7.014275093257102e-06, + "loss": 0.001, + "step": 19456 + }, + { + "epoch": 7.912566083773892, + "grad_norm": 7.596195140763407, + "learning_rate": 7.0133081349779185e-06, + "loss": 0.1698, + "step": 19457 + }, + { + "epoch": 7.912972753151688, + "grad_norm": 0.10027703514163092, + "learning_rate": 7.01234120735863e-06, + "loss": 0.001, + "step": 19458 + }, + { + "epoch": 7.913379422529483, + "grad_norm": 0.008791042297057012, + "learning_rate": 7.011374310409162e-06, + "loss": 0.0001, + 
"step": 19459 + }, + { + "epoch": 7.913786091907279, + "grad_norm": 5.6470667014431895, + "learning_rate": 7.0104074441394396e-06, + "loss": 0.125, + "step": 19460 + }, + { + "epoch": 7.914192761285075, + "grad_norm": 7.448675529905033, + "learning_rate": 7.0094406085593925e-06, + "loss": 0.0817, + "step": 19461 + }, + { + "epoch": 7.9145994306628715, + "grad_norm": 0.666453219545738, + "learning_rate": 7.008473803678942e-06, + "loss": 0.0113, + "step": 19462 + }, + { + "epoch": 7.915006100040667, + "grad_norm": 8.25995991398399, + "learning_rate": 7.007507029508009e-06, + "loss": 0.2009, + "step": 19463 + }, + { + "epoch": 7.915412769418463, + "grad_norm": 0.2678906091358361, + "learning_rate": 7.006540286056527e-06, + "loss": 0.0029, + "step": 19464 + }, + { + "epoch": 7.915819438796259, + "grad_norm": 0.338882143459007, + "learning_rate": 7.0055735733344154e-06, + "loss": 0.0058, + "step": 19465 + }, + { + "epoch": 7.916226108174055, + "grad_norm": 5.488560797149237, + "learning_rate": 7.004606891351595e-06, + "loss": 0.0837, + "step": 19466 + }, + { + "epoch": 7.91663277755185, + "grad_norm": 0.1651756170807491, + "learning_rate": 7.0036402401179955e-06, + "loss": 0.0018, + "step": 19467 + }, + { + "epoch": 7.917039446929646, + "grad_norm": 0.025089618469727952, + "learning_rate": 7.0026736196435375e-06, + "loss": 0.0003, + "step": 19468 + }, + { + "epoch": 7.917446116307442, + "grad_norm": 1.9488613387117362, + "learning_rate": 7.001707029938141e-06, + "loss": 0.0337, + "step": 19469 + }, + { + "epoch": 7.917852785685238, + "grad_norm": 0.17461643002850616, + "learning_rate": 7.000740471011731e-06, + "loss": 0.0019, + "step": 19470 + }, + { + "epoch": 7.918259455063033, + "grad_norm": 13.137696718093794, + "learning_rate": 6.999773942874232e-06, + "loss": 0.3585, + "step": 19471 + }, + { + "epoch": 7.918666124440829, + "grad_norm": 0.1781442764923875, + "learning_rate": 6.9988074455355624e-06, + "loss": 0.002, + "step": 19472 + }, + { + "epoch": 7.919072793818626, + "grad_norm": 0.00029439603499068146, + "learning_rate": 6.997840979005642e-06, + "loss": 0.0, + "step": 19473 + }, + { + "epoch": 7.919479463196422, + "grad_norm": 10.06895560932852, + "learning_rate": 6.9968745432943985e-06, + "loss": 0.2955, + "step": 19474 + }, + { + "epoch": 7.919886132574217, + "grad_norm": 1.898113748469395, + "learning_rate": 6.995908138411748e-06, + "loss": 0.0166, + "step": 19475 + }, + { + "epoch": 7.920292801952013, + "grad_norm": 0.01536078423003436, + "learning_rate": 6.994941764367611e-06, + "loss": 0.0002, + "step": 19476 + }, + { + "epoch": 7.920699471329809, + "grad_norm": 0.04473929793235192, + "learning_rate": 6.99397542117191e-06, + "loss": 0.0004, + "step": 19477 + }, + { + "epoch": 7.921106140707605, + "grad_norm": 1.230066044413615, + "learning_rate": 6.993009108834566e-06, + "loss": 0.0144, + "step": 19478 + }, + { + "epoch": 7.9215128100854, + "grad_norm": 3.107711847170975, + "learning_rate": 6.992042827365494e-06, + "loss": 0.0582, + "step": 19479 + }, + { + "epoch": 7.921919479463196, + "grad_norm": 0.037916236707744126, + "learning_rate": 6.991076576774618e-06, + "loss": 0.0005, + "step": 19480 + }, + { + "epoch": 7.922326148840992, + "grad_norm": 3.1203120227509156, + "learning_rate": 6.990110357071855e-06, + "loss": 0.0548, + "step": 19481 + }, + { + "epoch": 7.922732818218789, + "grad_norm": 0.0721857615981633, + "learning_rate": 6.9891441682671235e-06, + "loss": 0.0009, + "step": 19482 + }, + { + "epoch": 7.923139487596584, + "grad_norm": 6.531776859853637, + 
"learning_rate": 6.988178010370343e-06, + "loss": 0.1443, + "step": 19483 + }, + { + "epoch": 7.92354615697438, + "grad_norm": 1.2154614798133305, + "learning_rate": 6.987211883391432e-06, + "loss": 0.0147, + "step": 19484 + }, + { + "epoch": 7.923952826352176, + "grad_norm": 13.237169669886384, + "learning_rate": 6.986245787340308e-06, + "loss": 0.4549, + "step": 19485 + }, + { + "epoch": 7.924359495729972, + "grad_norm": 19.428707099601635, + "learning_rate": 6.985279722226884e-06, + "loss": 0.2529, + "step": 19486 + }, + { + "epoch": 7.924766165107767, + "grad_norm": 1.147169820299032, + "learning_rate": 6.984313688061084e-06, + "loss": 0.0125, + "step": 19487 + }, + { + "epoch": 7.925172834485563, + "grad_norm": 3.1227049040946797, + "learning_rate": 6.983347684852821e-06, + "loss": 0.0508, + "step": 19488 + }, + { + "epoch": 7.925579503863359, + "grad_norm": 1.2442162301536976, + "learning_rate": 6.982381712612012e-06, + "loss": 0.0187, + "step": 19489 + }, + { + "epoch": 7.925986173241155, + "grad_norm": 4.949957053372691, + "learning_rate": 6.981415771348574e-06, + "loss": 0.1578, + "step": 19490 + }, + { + "epoch": 7.9263928426189505, + "grad_norm": 0.01109948945985943, + "learning_rate": 6.980449861072423e-06, + "loss": 0.0002, + "step": 19491 + }, + { + "epoch": 7.926799511996746, + "grad_norm": 0.1573462624580336, + "learning_rate": 6.97948398179347e-06, + "loss": 0.0026, + "step": 19492 + }, + { + "epoch": 7.927206181374542, + "grad_norm": 1.9326415041145015, + "learning_rate": 6.978518133521638e-06, + "loss": 0.0268, + "step": 19493 + }, + { + "epoch": 7.927612850752339, + "grad_norm": 3.4699487427132194, + "learning_rate": 6.9775523162668376e-06, + "loss": 0.04, + "step": 19494 + }, + { + "epoch": 7.9280195201301344, + "grad_norm": 3.21004832952182, + "learning_rate": 6.976586530038982e-06, + "loss": 0.0495, + "step": 19495 + }, + { + "epoch": 7.92842618950793, + "grad_norm": 0.8265053798670647, + "learning_rate": 6.975620774847988e-06, + "loss": 0.015, + "step": 19496 + }, + { + "epoch": 7.928832858885726, + "grad_norm": 0.03498373608864686, + "learning_rate": 6.974655050703771e-06, + "loss": 0.0007, + "step": 19497 + }, + { + "epoch": 7.929239528263522, + "grad_norm": 0.20748614871482468, + "learning_rate": 6.973689357616241e-06, + "loss": 0.0028, + "step": 19498 + }, + { + "epoch": 7.9296461976413175, + "grad_norm": 0.4582344293419296, + "learning_rate": 6.972723695595311e-06, + "loss": 0.0075, + "step": 19499 + }, + { + "epoch": 7.930052867019113, + "grad_norm": 1.8430408840648838, + "learning_rate": 6.9717580646508985e-06, + "loss": 0.0299, + "step": 19500 + }, + { + "epoch": 7.930459536396909, + "grad_norm": 2.9455565440718963, + "learning_rate": 6.9707924647929135e-06, + "loss": 0.0373, + "step": 19501 + }, + { + "epoch": 7.930866205774705, + "grad_norm": 3.4125471104546325, + "learning_rate": 6.9698268960312664e-06, + "loss": 0.0238, + "step": 19502 + }, + { + "epoch": 7.9312728751525015, + "grad_norm": 0.017107833445281762, + "learning_rate": 6.9688613583758735e-06, + "loss": 0.0002, + "step": 19503 + }, + { + "epoch": 7.931679544530297, + "grad_norm": 0.09615788705069435, + "learning_rate": 6.967895851836643e-06, + "loss": 0.0008, + "step": 19504 + }, + { + "epoch": 7.932086213908093, + "grad_norm": 10.439781759029936, + "learning_rate": 6.966930376423486e-06, + "loss": 0.4714, + "step": 19505 + }, + { + "epoch": 7.932492883285889, + "grad_norm": 0.5651500844875589, + "learning_rate": 6.965964932146318e-06, + "loss": 0.0067, + "step": 19506 + }, + { + "epoch": 
7.9328995526636845, + "grad_norm": 9.06625470861089, + "learning_rate": 6.964999519015047e-06, + "loss": 0.1772, + "step": 19507 + }, + { + "epoch": 7.93330622204148, + "grad_norm": 0.03962149294017929, + "learning_rate": 6.964034137039581e-06, + "loss": 0.0005, + "step": 19508 + }, + { + "epoch": 7.933712891419276, + "grad_norm": 1.750007197977807, + "learning_rate": 6.963068786229835e-06, + "loss": 0.0151, + "step": 19509 + }, + { + "epoch": 7.934119560797072, + "grad_norm": 13.497123410547108, + "learning_rate": 6.962103466595715e-06, + "loss": 0.0612, + "step": 19510 + }, + { + "epoch": 7.934526230174868, + "grad_norm": 0.09447155552650251, + "learning_rate": 6.961138178147133e-06, + "loss": 0.0017, + "step": 19511 + }, + { + "epoch": 7.934932899552663, + "grad_norm": 0.1079753194543968, + "learning_rate": 6.960172920893995e-06, + "loss": 0.0016, + "step": 19512 + }, + { + "epoch": 7.935339568930459, + "grad_norm": 0.0032865066956430772, + "learning_rate": 6.959207694846215e-06, + "loss": 0.0, + "step": 19513 + }, + { + "epoch": 7.935746238308256, + "grad_norm": 0.10749169324920047, + "learning_rate": 6.958242500013697e-06, + "loss": 0.0013, + "step": 19514 + }, + { + "epoch": 7.9361529076860515, + "grad_norm": 0.7799454181734826, + "learning_rate": 6.957277336406348e-06, + "loss": 0.0094, + "step": 19515 + }, + { + "epoch": 7.936559577063847, + "grad_norm": 5.110879123646499, + "learning_rate": 6.956312204034082e-06, + "loss": 0.0774, + "step": 19516 + }, + { + "epoch": 7.936966246441643, + "grad_norm": 0.003739307810078956, + "learning_rate": 6.955347102906802e-06, + "loss": 0.0, + "step": 19517 + }, + { + "epoch": 7.937372915819439, + "grad_norm": 1.2624018310134693, + "learning_rate": 6.954382033034414e-06, + "loss": 0.0158, + "step": 19518 + }, + { + "epoch": 7.937779585197235, + "grad_norm": 1.341034845490283, + "learning_rate": 6.9534169944268316e-06, + "loss": 0.0083, + "step": 19519 + }, + { + "epoch": 7.93818625457503, + "grad_norm": 0.3557649492228184, + "learning_rate": 6.952451987093954e-06, + "loss": 0.0041, + "step": 19520 + }, + { + "epoch": 7.938592923952826, + "grad_norm": 0.006035797349158785, + "learning_rate": 6.951487011045692e-06, + "loss": 0.0001, + "step": 19521 + }, + { + "epoch": 7.938999593330622, + "grad_norm": 0.06714342955829711, + "learning_rate": 6.950522066291949e-06, + "loss": 0.0008, + "step": 19522 + }, + { + "epoch": 7.9394062627084185, + "grad_norm": 5.699825399769856, + "learning_rate": 6.949557152842632e-06, + "loss": 0.1796, + "step": 19523 + }, + { + "epoch": 7.939812932086214, + "grad_norm": 1.2566787006150724, + "learning_rate": 6.948592270707648e-06, + "loss": 0.0181, + "step": 19524 + }, + { + "epoch": 7.94021960146401, + "grad_norm": 0.22249102196196166, + "learning_rate": 6.947627419896896e-06, + "loss": 0.0032, + "step": 19525 + }, + { + "epoch": 7.940626270841806, + "grad_norm": 4.59036420321504, + "learning_rate": 6.946662600420288e-06, + "loss": 0.1202, + "step": 19526 + }, + { + "epoch": 7.941032940219602, + "grad_norm": 2.191036455940081, + "learning_rate": 6.945697812287725e-06, + "loss": 0.0495, + "step": 19527 + }, + { + "epoch": 7.941439609597397, + "grad_norm": 26.876648356653885, + "learning_rate": 6.94473305550911e-06, + "loss": 0.4248, + "step": 19528 + }, + { + "epoch": 7.941846278975193, + "grad_norm": 6.326494454253213, + "learning_rate": 6.943768330094348e-06, + "loss": 0.1294, + "step": 19529 + }, + { + "epoch": 7.942252948352989, + "grad_norm": 1.6752103330977954, + "learning_rate": 6.9428036360533435e-06, + "loss": 
0.0216, + "step": 19530 + }, + { + "epoch": 7.942659617730785, + "grad_norm": 16.054089113671978, + "learning_rate": 6.9418389733959955e-06, + "loss": 0.3625, + "step": 19531 + }, + { + "epoch": 7.94306628710858, + "grad_norm": 7.090611405231394, + "learning_rate": 6.940874342132214e-06, + "loss": 0.1273, + "step": 19532 + }, + { + "epoch": 7.943472956486376, + "grad_norm": 1.9217175653883607, + "learning_rate": 6.939909742271893e-06, + "loss": 0.0322, + "step": 19533 + }, + { + "epoch": 7.943879625864172, + "grad_norm": 5.2379729324566275, + "learning_rate": 6.9389451738249415e-06, + "loss": 0.0635, + "step": 19534 + }, + { + "epoch": 7.944286295241969, + "grad_norm": 1.0525504095926197, + "learning_rate": 6.937980636801256e-06, + "loss": 0.0106, + "step": 19535 + }, + { + "epoch": 7.944692964619764, + "grad_norm": 0.003913766969755253, + "learning_rate": 6.937016131210741e-06, + "loss": 0.0001, + "step": 19536 + }, + { + "epoch": 7.94509963399756, + "grad_norm": 0.070389729402049, + "learning_rate": 6.936051657063299e-06, + "loss": 0.0007, + "step": 19537 + }, + { + "epoch": 7.945506303375356, + "grad_norm": 0.22933500991260702, + "learning_rate": 6.9350872143688256e-06, + "loss": 0.0031, + "step": 19538 + }, + { + "epoch": 7.945912972753152, + "grad_norm": 1.1732783086253065, + "learning_rate": 6.934122803137226e-06, + "loss": 0.0169, + "step": 19539 + }, + { + "epoch": 7.946319642130947, + "grad_norm": 2.938031644661236, + "learning_rate": 6.933158423378399e-06, + "loss": 0.0341, + "step": 19540 + }, + { + "epoch": 7.946726311508743, + "grad_norm": 0.20138622546790813, + "learning_rate": 6.932194075102243e-06, + "loss": 0.0025, + "step": 19541 + }, + { + "epoch": 7.947132980886539, + "grad_norm": 2.410002628115633, + "learning_rate": 6.93122975831866e-06, + "loss": 0.024, + "step": 19542 + }, + { + "epoch": 7.947539650264335, + "grad_norm": 1.119898382252811, + "learning_rate": 6.93026547303755e-06, + "loss": 0.0231, + "step": 19543 + }, + { + "epoch": 7.947946319642131, + "grad_norm": 4.374673263555218, + "learning_rate": 6.929301219268806e-06, + "loss": 0.0963, + "step": 19544 + }, + { + "epoch": 7.948352989019927, + "grad_norm": 0.10675610746433338, + "learning_rate": 6.928336997022333e-06, + "loss": 0.0009, + "step": 19545 + }, + { + "epoch": 7.948759658397723, + "grad_norm": 0.0852593899281761, + "learning_rate": 6.927372806308026e-06, + "loss": 0.0016, + "step": 19546 + }, + { + "epoch": 7.949166327775519, + "grad_norm": 0.17761104806966738, + "learning_rate": 6.926408647135785e-06, + "loss": 0.002, + "step": 19547 + }, + { + "epoch": 7.9495729971533144, + "grad_norm": 5.968392688715795, + "learning_rate": 6.925444519515503e-06, + "loss": 0.1083, + "step": 19548 + }, + { + "epoch": 7.94997966653111, + "grad_norm": 0.23100932344411212, + "learning_rate": 6.924480423457083e-06, + "loss": 0.0029, + "step": 19549 + }, + { + "epoch": 7.950386335908906, + "grad_norm": 0.041972627092078024, + "learning_rate": 6.923516358970419e-06, + "loss": 0.0007, + "step": 19550 + }, + { + "epoch": 7.950793005286702, + "grad_norm": 0.02609948884381766, + "learning_rate": 6.922552326065406e-06, + "loss": 0.0005, + "step": 19551 + }, + { + "epoch": 7.9511996746644975, + "grad_norm": 4.511755147700446, + "learning_rate": 6.921588324751944e-06, + "loss": 0.1789, + "step": 19552 + }, + { + "epoch": 7.951606344042293, + "grad_norm": 3.082454312334167, + "learning_rate": 6.920624355039926e-06, + "loss": 0.0649, + "step": 19553 + }, + { + "epoch": 7.952013013420089, + "grad_norm": 3.639890198320126, + 
"learning_rate": 6.919660416939249e-06, + "loss": 0.1119, + "step": 19554 + }, + { + "epoch": 7.952419682797886, + "grad_norm": 0.18415510919316552, + "learning_rate": 6.918696510459813e-06, + "loss": 0.0015, + "step": 19555 + }, + { + "epoch": 7.9528263521756815, + "grad_norm": 0.8981794552324356, + "learning_rate": 6.9177326356115025e-06, + "loss": 0.0133, + "step": 19556 + }, + { + "epoch": 7.953233021553477, + "grad_norm": 10.537107709119098, + "learning_rate": 6.91676879240422e-06, + "loss": 0.1919, + "step": 19557 + }, + { + "epoch": 7.953639690931273, + "grad_norm": 3.9418388729213247, + "learning_rate": 6.915804980847858e-06, + "loss": 0.1516, + "step": 19558 + }, + { + "epoch": 7.954046360309069, + "grad_norm": 0.03418593898405491, + "learning_rate": 6.914841200952307e-06, + "loss": 0.0005, + "step": 19559 + }, + { + "epoch": 7.9544530296868645, + "grad_norm": 0.49531291918693826, + "learning_rate": 6.913877452727466e-06, + "loss": 0.008, + "step": 19560 + }, + { + "epoch": 7.95485969906466, + "grad_norm": 0.37743101000231927, + "learning_rate": 6.912913736183227e-06, + "loss": 0.0058, + "step": 19561 + }, + { + "epoch": 7.955266368442456, + "grad_norm": 1.7072648132353319, + "learning_rate": 6.911950051329482e-06, + "loss": 0.0288, + "step": 19562 + }, + { + "epoch": 7.955673037820252, + "grad_norm": 4.549453644024931, + "learning_rate": 6.910986398176121e-06, + "loss": 0.1105, + "step": 19563 + }, + { + "epoch": 7.9560797071980485, + "grad_norm": 11.003105347201947, + "learning_rate": 6.9100227767330405e-06, + "loss": 0.2795, + "step": 19564 + }, + { + "epoch": 7.956486376575844, + "grad_norm": 0.06971185974447683, + "learning_rate": 6.909059187010132e-06, + "loss": 0.001, + "step": 19565 + }, + { + "epoch": 7.95689304595364, + "grad_norm": 7.308731103806283, + "learning_rate": 6.908095629017286e-06, + "loss": 0.2165, + "step": 19566 + }, + { + "epoch": 7.957299715331436, + "grad_norm": 1.221504176020015, + "learning_rate": 6.907132102764396e-06, + "loss": 0.016, + "step": 19567 + }, + { + "epoch": 7.9577063847092315, + "grad_norm": 0.6567612710459929, + "learning_rate": 6.90616860826135e-06, + "loss": 0.0063, + "step": 19568 + }, + { + "epoch": 7.958113054087027, + "grad_norm": 0.3921181480871634, + "learning_rate": 6.90520514551804e-06, + "loss": 0.0028, + "step": 19569 + }, + { + "epoch": 7.958519723464823, + "grad_norm": 3.5262457217391177, + "learning_rate": 6.904241714544359e-06, + "loss": 0.0609, + "step": 19570 + }, + { + "epoch": 7.958926392842619, + "grad_norm": 5.891923841463255, + "learning_rate": 6.903278315350194e-06, + "loss": 0.1819, + "step": 19571 + }, + { + "epoch": 7.959333062220415, + "grad_norm": 0.036486041202649644, + "learning_rate": 6.902314947945433e-06, + "loss": 0.0006, + "step": 19572 + }, + { + "epoch": 7.95973973159821, + "grad_norm": 4.350775704489057, + "learning_rate": 6.90135161233997e-06, + "loss": 0.0639, + "step": 19573 + }, + { + "epoch": 7.960146400976006, + "grad_norm": 0.13212136041208558, + "learning_rate": 6.900388308543695e-06, + "loss": 0.002, + "step": 19574 + }, + { + "epoch": 7.960553070353802, + "grad_norm": 24.833280497426482, + "learning_rate": 6.8994250365664915e-06, + "loss": 0.6165, + "step": 19575 + }, + { + "epoch": 7.9609597397315985, + "grad_norm": 13.35351686504158, + "learning_rate": 6.89846179641825e-06, + "loss": 0.2081, + "step": 19576 + }, + { + "epoch": 7.961366409109394, + "grad_norm": 0.06864086289817524, + "learning_rate": 6.89749858810886e-06, + "loss": 0.0008, + "step": 19577 + }, + { + "epoch": 
7.96177307848719, + "grad_norm": 0.09155959340071737, + "learning_rate": 6.896535411648209e-06, + "loss": 0.0013, + "step": 19578 + }, + { + "epoch": 7.962179747864986, + "grad_norm": 0.1719210798443362, + "learning_rate": 6.89557226704618e-06, + "loss": 0.0024, + "step": 19579 + }, + { + "epoch": 7.962586417242782, + "grad_norm": 1.9772120288653998, + "learning_rate": 6.894609154312668e-06, + "loss": 0.0468, + "step": 19580 + }, + { + "epoch": 7.962993086620577, + "grad_norm": 7.548295731418375, + "learning_rate": 6.893646073457556e-06, + "loss": 0.0701, + "step": 19581 + }, + { + "epoch": 7.963399755998373, + "grad_norm": 0.013026745726905554, + "learning_rate": 6.892683024490728e-06, + "loss": 0.0002, + "step": 19582 + }, + { + "epoch": 7.963806425376169, + "grad_norm": 29.636698479354482, + "learning_rate": 6.891720007422075e-06, + "loss": 0.6866, + "step": 19583 + }, + { + "epoch": 7.964213094753965, + "grad_norm": 3.6783226735888266, + "learning_rate": 6.89075702226148e-06, + "loss": 0.0438, + "step": 19584 + }, + { + "epoch": 7.964619764131761, + "grad_norm": 6.01089743639146, + "learning_rate": 6.889794069018827e-06, + "loss": 0.0961, + "step": 19585 + }, + { + "epoch": 7.965026433509557, + "grad_norm": 0.09454388660987618, + "learning_rate": 6.888831147704006e-06, + "loss": 0.0018, + "step": 19586 + }, + { + "epoch": 7.965433102887353, + "grad_norm": 0.5538140637280011, + "learning_rate": 6.887868258326898e-06, + "loss": 0.008, + "step": 19587 + }, + { + "epoch": 7.965839772265149, + "grad_norm": 4.820598863700288, + "learning_rate": 6.88690540089739e-06, + "loss": 0.0973, + "step": 19588 + }, + { + "epoch": 7.966246441642944, + "grad_norm": 2.9477807713242177, + "learning_rate": 6.885942575425362e-06, + "loss": 0.057, + "step": 19589 + }, + { + "epoch": 7.96665311102074, + "grad_norm": 6.570053335168885, + "learning_rate": 6.884979781920702e-06, + "loss": 0.1509, + "step": 19590 + }, + { + "epoch": 7.967059780398536, + "grad_norm": 5.840360091978948, + "learning_rate": 6.884017020393294e-06, + "loss": 0.0944, + "step": 19591 + }, + { + "epoch": 7.967466449776332, + "grad_norm": 12.550920042237582, + "learning_rate": 6.883054290853017e-06, + "loss": 0.559, + "step": 19592 + }, + { + "epoch": 7.967873119154127, + "grad_norm": 7.943712317169515, + "learning_rate": 6.882091593309757e-06, + "loss": 0.1157, + "step": 19593 + }, + { + "epoch": 7.968279788531923, + "grad_norm": 3.1099347600330485, + "learning_rate": 6.8811289277733975e-06, + "loss": 0.0516, + "step": 19594 + }, + { + "epoch": 7.968686457909719, + "grad_norm": 0.003886099012847027, + "learning_rate": 6.8801662942538165e-06, + "loss": 0.0, + "step": 19595 + }, + { + "epoch": 7.969093127287516, + "grad_norm": 9.544773807473886, + "learning_rate": 6.879203692760901e-06, + "loss": 0.4658, + "step": 19596 + }, + { + "epoch": 7.969499796665311, + "grad_norm": 0.8986431392478643, + "learning_rate": 6.87824112330453e-06, + "loss": 0.0115, + "step": 19597 + }, + { + "epoch": 7.969906466043107, + "grad_norm": 8.628058631507697, + "learning_rate": 6.877278585894584e-06, + "loss": 0.1607, + "step": 19598 + }, + { + "epoch": 7.970313135420903, + "grad_norm": 1.4737895173795008, + "learning_rate": 6.876316080540945e-06, + "loss": 0.0223, + "step": 19599 + }, + { + "epoch": 7.970719804798699, + "grad_norm": 7.940861934278155, + "learning_rate": 6.875353607253495e-06, + "loss": 0.1073, + "step": 19600 + }, + { + "epoch": 7.9711264741764944, + "grad_norm": 3.331164924946963, + "learning_rate": 6.874391166042114e-06, + "loss": 0.0458, + 
"step": 19601 + }, + { + "epoch": 7.97153314355429, + "grad_norm": 0.8363725457533834, + "learning_rate": 6.873428756916676e-06, + "loss": 0.0083, + "step": 19602 + }, + { + "epoch": 7.971939812932086, + "grad_norm": 2.231575207508484, + "learning_rate": 6.872466379887068e-06, + "loss": 0.0286, + "step": 19603 + }, + { + "epoch": 7.972346482309882, + "grad_norm": 10.624004299126593, + "learning_rate": 6.871504034963168e-06, + "loss": 0.3953, + "step": 19604 + }, + { + "epoch": 7.972753151687678, + "grad_norm": 1.3004998369499141, + "learning_rate": 6.870541722154851e-06, + "loss": 0.0224, + "step": 19605 + }, + { + "epoch": 7.973159821065474, + "grad_norm": 0.031162117697281073, + "learning_rate": 6.869579441472e-06, + "loss": 0.0004, + "step": 19606 + }, + { + "epoch": 7.97356649044327, + "grad_norm": 3.2915293994318673, + "learning_rate": 6.868617192924493e-06, + "loss": 0.0435, + "step": 19607 + }, + { + "epoch": 7.973973159821066, + "grad_norm": 0.10877459608811536, + "learning_rate": 6.867654976522202e-06, + "loss": 0.0015, + "step": 19608 + }, + { + "epoch": 7.9743798291988615, + "grad_norm": 0.04339688973080272, + "learning_rate": 6.866692792275014e-06, + "loss": 0.0005, + "step": 19609 + }, + { + "epoch": 7.974786498576657, + "grad_norm": 6.687542206361421, + "learning_rate": 6.8657306401927995e-06, + "loss": 0.1007, + "step": 19610 + }, + { + "epoch": 7.975193167954453, + "grad_norm": 16.924653188907715, + "learning_rate": 6.864768520285435e-06, + "loss": 0.4519, + "step": 19611 + }, + { + "epoch": 7.975599837332249, + "grad_norm": 0.5537316251091441, + "learning_rate": 6.863806432562803e-06, + "loss": 0.0089, + "step": 19612 + }, + { + "epoch": 7.9760065067100445, + "grad_norm": 3.1273759319887597, + "learning_rate": 6.862844377034775e-06, + "loss": 0.0492, + "step": 19613 + }, + { + "epoch": 7.97641317608784, + "grad_norm": 5.751376723049928, + "learning_rate": 6.861882353711229e-06, + "loss": 0.113, + "step": 19614 + }, + { + "epoch": 7.976819845465636, + "grad_norm": 6.001655026874411, + "learning_rate": 6.8609203626020395e-06, + "loss": 0.082, + "step": 19615 + }, + { + "epoch": 7.977226514843432, + "grad_norm": 0.1604470659966344, + "learning_rate": 6.8599584037170815e-06, + "loss": 0.0028, + "step": 19616 + }, + { + "epoch": 7.9776331842212285, + "grad_norm": 7.1350632051240215, + "learning_rate": 6.8589964770662335e-06, + "loss": 0.1537, + "step": 19617 + }, + { + "epoch": 7.978039853599024, + "grad_norm": 3.311894720612913, + "learning_rate": 6.858034582659363e-06, + "loss": 0.0456, + "step": 19618 + }, + { + "epoch": 7.97844652297682, + "grad_norm": 1.326849793487774, + "learning_rate": 6.857072720506352e-06, + "loss": 0.0069, + "step": 19619 + }, + { + "epoch": 7.978853192354616, + "grad_norm": 0.15978448264276318, + "learning_rate": 6.856110890617073e-06, + "loss": 0.0028, + "step": 19620 + }, + { + "epoch": 7.9792598617324115, + "grad_norm": 1.3765307145092345, + "learning_rate": 6.855149093001394e-06, + "loss": 0.0223, + "step": 19621 + }, + { + "epoch": 7.979666531110207, + "grad_norm": 0.2759585572660259, + "learning_rate": 6.854187327669195e-06, + "loss": 0.0045, + "step": 19622 + }, + { + "epoch": 7.980073200488003, + "grad_norm": 0.41289653189998876, + "learning_rate": 6.853225594630346e-06, + "loss": 0.0045, + "step": 19623 + }, + { + "epoch": 7.980479869865799, + "grad_norm": 0.983259338122522, + "learning_rate": 6.852263893894717e-06, + "loss": 0.0114, + "step": 19624 + }, + { + "epoch": 7.980886539243595, + "grad_norm": 5.514484039296189, + "learning_rate": 
6.851302225472187e-06, + "loss": 0.0996, + "step": 19625 + }, + { + "epoch": 7.981293208621391, + "grad_norm": 0.10837014548878723, + "learning_rate": 6.850340589372623e-06, + "loss": 0.0018, + "step": 19626 + }, + { + "epoch": 7.981699877999187, + "grad_norm": 6.109595747225522, + "learning_rate": 6.8493789856059e-06, + "loss": 0.1424, + "step": 19627 + }, + { + "epoch": 7.982106547376983, + "grad_norm": 0.22286752442540989, + "learning_rate": 6.848417414181885e-06, + "loss": 0.0035, + "step": 19628 + }, + { + "epoch": 7.9825132167547785, + "grad_norm": 8.202671671557363, + "learning_rate": 6.847455875110452e-06, + "loss": 0.1885, + "step": 19629 + }, + { + "epoch": 7.982919886132574, + "grad_norm": 0.5788518206201893, + "learning_rate": 6.846494368401472e-06, + "loss": 0.0089, + "step": 19630 + }, + { + "epoch": 7.98332655551037, + "grad_norm": 2.319517428675595, + "learning_rate": 6.8455328940648125e-06, + "loss": 0.0283, + "step": 19631 + }, + { + "epoch": 7.983733224888166, + "grad_norm": 1.0300379724435582, + "learning_rate": 6.844571452110347e-06, + "loss": 0.0162, + "step": 19632 + }, + { + "epoch": 7.984139894265962, + "grad_norm": 0.6543652543902122, + "learning_rate": 6.843610042547944e-06, + "loss": 0.0114, + "step": 19633 + }, + { + "epoch": 7.984546563643757, + "grad_norm": 0.24400040475598297, + "learning_rate": 6.84264866538747e-06, + "loss": 0.0027, + "step": 19634 + }, + { + "epoch": 7.984953233021553, + "grad_norm": 2.9265097068880053, + "learning_rate": 6.841687320638799e-06, + "loss": 0.0381, + "step": 19635 + }, + { + "epoch": 7.985359902399349, + "grad_norm": 3.769490690457835, + "learning_rate": 6.840726008311797e-06, + "loss": 0.0549, + "step": 19636 + }, + { + "epoch": 7.9857665717771456, + "grad_norm": 0.3344724518593021, + "learning_rate": 6.839764728416331e-06, + "loss": 0.0064, + "step": 19637 + }, + { + "epoch": 7.986173241154941, + "grad_norm": 4.9541193373180965, + "learning_rate": 6.8388034809622705e-06, + "loss": 0.0934, + "step": 19638 + }, + { + "epoch": 7.986579910532737, + "grad_norm": 0.15499710541126627, + "learning_rate": 6.837842265959485e-06, + "loss": 0.0021, + "step": 19639 + }, + { + "epoch": 7.986986579910533, + "grad_norm": 0.4073976903397717, + "learning_rate": 6.836881083417841e-06, + "loss": 0.0061, + "step": 19640 + }, + { + "epoch": 7.987393249288329, + "grad_norm": 11.228590561090353, + "learning_rate": 6.835919933347202e-06, + "loss": 0.3553, + "step": 19641 + }, + { + "epoch": 7.987799918666124, + "grad_norm": 3.7859713593998623, + "learning_rate": 6.834958815757437e-06, + "loss": 0.0293, + "step": 19642 + }, + { + "epoch": 7.98820658804392, + "grad_norm": 4.100614910450362, + "learning_rate": 6.833997730658416e-06, + "loss": 0.1172, + "step": 19643 + }, + { + "epoch": 7.988613257421716, + "grad_norm": 0.013385057543831716, + "learning_rate": 6.833036678059999e-06, + "loss": 0.0002, + "step": 19644 + }, + { + "epoch": 7.989019926799512, + "grad_norm": 5.436185539520854, + "learning_rate": 6.8320756579720545e-06, + "loss": 0.0862, + "step": 19645 + }, + { + "epoch": 7.989426596177308, + "grad_norm": 0.1807862110888563, + "learning_rate": 6.8311146704044495e-06, + "loss": 0.0022, + "step": 19646 + }, + { + "epoch": 7.989833265555104, + "grad_norm": 0.2895125495393711, + "learning_rate": 6.830153715367045e-06, + "loss": 0.0035, + "step": 19647 + }, + { + "epoch": 7.9902399349329, + "grad_norm": 5.108841769984959, + "learning_rate": 6.82919279286971e-06, + "loss": 0.0696, + "step": 19648 + }, + { + "epoch": 7.990646604310696, + 
"grad_norm": 0.35537841259432024, + "learning_rate": 6.828231902922306e-06, + "loss": 0.0077, + "step": 19649 + }, + { + "epoch": 7.991053273688491, + "grad_norm": 0.652658409701522, + "learning_rate": 6.8272710455346994e-06, + "loss": 0.0096, + "step": 19650 + }, + { + "epoch": 7.991459943066287, + "grad_norm": 3.9897083826445554, + "learning_rate": 6.826310220716749e-06, + "loss": 0.1051, + "step": 19651 + }, + { + "epoch": 7.991866612444083, + "grad_norm": 0.4573327370508394, + "learning_rate": 6.825349428478323e-06, + "loss": 0.0053, + "step": 19652 + }, + { + "epoch": 7.992273281821879, + "grad_norm": 2.811801430916673, + "learning_rate": 6.8243886688292844e-06, + "loss": 0.031, + "step": 19653 + }, + { + "epoch": 7.9926799511996744, + "grad_norm": 0.8364743281548735, + "learning_rate": 6.823427941779496e-06, + "loss": 0.0109, + "step": 19654 + }, + { + "epoch": 7.99308662057747, + "grad_norm": 1.231609239296121, + "learning_rate": 6.8224672473388155e-06, + "loss": 0.0345, + "step": 19655 + }, + { + "epoch": 7.993493289955266, + "grad_norm": 0.030479631013746115, + "learning_rate": 6.821506585517107e-06, + "loss": 0.0004, + "step": 19656 + }, + { + "epoch": 7.993899959333062, + "grad_norm": 2.245772961431227, + "learning_rate": 6.8205459563242345e-06, + "loss": 0.0311, + "step": 19657 + }, + { + "epoch": 7.994306628710858, + "grad_norm": 0.0059291661976540785, + "learning_rate": 6.819585359770058e-06, + "loss": 0.0001, + "step": 19658 + }, + { + "epoch": 7.994713298088654, + "grad_norm": 1.524948094924818, + "learning_rate": 6.818624795864437e-06, + "loss": 0.0125, + "step": 19659 + }, + { + "epoch": 7.99511996746645, + "grad_norm": 3.2858125006935697, + "learning_rate": 6.817664264617235e-06, + "loss": 0.052, + "step": 19660 + }, + { + "epoch": 7.995526636844246, + "grad_norm": 0.09748149981500515, + "learning_rate": 6.81670376603831e-06, + "loss": 0.0018, + "step": 19661 + }, + { + "epoch": 7.9959333062220415, + "grad_norm": 0.09812214877159557, + "learning_rate": 6.815743300137523e-06, + "loss": 0.0012, + "step": 19662 + }, + { + "epoch": 7.996339975599837, + "grad_norm": 9.368625087124801, + "learning_rate": 6.814782866924735e-06, + "loss": 0.1801, + "step": 19663 + }, + { + "epoch": 7.996746644977633, + "grad_norm": 0.06050638244028163, + "learning_rate": 6.813822466409804e-06, + "loss": 0.001, + "step": 19664 + }, + { + "epoch": 7.997153314355429, + "grad_norm": 0.9523571689157501, + "learning_rate": 6.812862098602586e-06, + "loss": 0.011, + "step": 19665 + }, + { + "epoch": 7.9975599837332245, + "grad_norm": 7.435098614017861, + "learning_rate": 6.811901763512946e-06, + "loss": 0.0624, + "step": 19666 + }, + { + "epoch": 7.997966653111021, + "grad_norm": 0.4623826254270306, + "learning_rate": 6.810941461150738e-06, + "loss": 0.0041, + "step": 19667 + }, + { + "epoch": 7.998373322488817, + "grad_norm": 9.518732510924474, + "learning_rate": 6.80998119152582e-06, + "loss": 0.2665, + "step": 19668 + }, + { + "epoch": 7.998779991866613, + "grad_norm": 0.02743302291945347, + "learning_rate": 6.809020954648051e-06, + "loss": 0.0003, + "step": 19669 + }, + { + "epoch": 7.9991866612444085, + "grad_norm": 1.2081031449461863, + "learning_rate": 6.808060750527289e-06, + "loss": 0.0111, + "step": 19670 + }, + { + "epoch": 7.999593330622204, + "grad_norm": 1.206172981233131, + "learning_rate": 6.8071005791733894e-06, + "loss": 0.0097, + "step": 19671 + }, + { + "epoch": 8.0, + "grad_norm": 4.797640490882633, + "learning_rate": 6.806140440596207e-06, + "loss": 0.0456, + "step": 19672 + }, + { 
+ "epoch": 8.000406669377796, + "grad_norm": 0.007163501319228077, + "learning_rate": 6.805180334805603e-06, + "loss": 0.0001, + "step": 19673 + }, + { + "epoch": 8.000813338755592, + "grad_norm": 0.011912670221556202, + "learning_rate": 6.80422026181143e-06, + "loss": 0.0002, + "step": 19674 + }, + { + "epoch": 8.001220008133387, + "grad_norm": 7.384070248163258, + "learning_rate": 6.8032602216235435e-06, + "loss": 0.1664, + "step": 19675 + }, + { + "epoch": 8.001626677511183, + "grad_norm": 0.0706154174078318, + "learning_rate": 6.802300214251801e-06, + "loss": 0.001, + "step": 19676 + }, + { + "epoch": 8.002033346888979, + "grad_norm": 0.037010206662959066, + "learning_rate": 6.801340239706056e-06, + "loss": 0.0005, + "step": 19677 + }, + { + "epoch": 8.002440016266775, + "grad_norm": 1.2994001289942925, + "learning_rate": 6.800380297996164e-06, + "loss": 0.0171, + "step": 19678 + }, + { + "epoch": 8.00284668564457, + "grad_norm": 1.002379711177966, + "learning_rate": 6.799420389131977e-06, + "loss": 0.0166, + "step": 19679 + }, + { + "epoch": 8.003253355022366, + "grad_norm": 2.6779326240133106, + "learning_rate": 6.7984605131233505e-06, + "loss": 0.038, + "step": 19680 + }, + { + "epoch": 8.003660024400162, + "grad_norm": 0.2543603014088702, + "learning_rate": 6.7975006699801394e-06, + "loss": 0.0025, + "step": 19681 + }, + { + "epoch": 8.004066693777958, + "grad_norm": 0.07121499749102055, + "learning_rate": 6.796540859712195e-06, + "loss": 0.0008, + "step": 19682 + }, + { + "epoch": 8.004473363155755, + "grad_norm": 0.3608007689673589, + "learning_rate": 6.795581082329371e-06, + "loss": 0.005, + "step": 19683 + }, + { + "epoch": 8.004880032533551, + "grad_norm": 0.19144882202659222, + "learning_rate": 6.79462133784152e-06, + "loss": 0.0019, + "step": 19684 + }, + { + "epoch": 8.005286701911347, + "grad_norm": 1.4594680968564675, + "learning_rate": 6.7936616262584935e-06, + "loss": 0.0255, + "step": 19685 + }, + { + "epoch": 8.005693371289142, + "grad_norm": 0.4814873041056489, + "learning_rate": 6.792701947590145e-06, + "loss": 0.0075, + "step": 19686 + }, + { + "epoch": 8.006100040666938, + "grad_norm": 0.26952493933834687, + "learning_rate": 6.791742301846325e-06, + "loss": 0.0041, + "step": 19687 + }, + { + "epoch": 8.006506710044734, + "grad_norm": 1.8931273236151542, + "learning_rate": 6.7907826890368845e-06, + "loss": 0.0242, + "step": 19688 + }, + { + "epoch": 8.00691337942253, + "grad_norm": 8.367081133026579, + "learning_rate": 6.789823109171676e-06, + "loss": 0.0992, + "step": 19689 + }, + { + "epoch": 8.007320048800326, + "grad_norm": 0.009611512556823404, + "learning_rate": 6.788863562260548e-06, + "loss": 0.0001, + "step": 19690 + }, + { + "epoch": 8.007726718178121, + "grad_norm": 5.244010788182244, + "learning_rate": 6.787904048313354e-06, + "loss": 0.0806, + "step": 19691 + }, + { + "epoch": 8.008133387555917, + "grad_norm": 4.546602730325272, + "learning_rate": 6.786944567339937e-06, + "loss": 0.0539, + "step": 19692 + }, + { + "epoch": 8.008540056933713, + "grad_norm": 0.026152135584052005, + "learning_rate": 6.785985119350155e-06, + "loss": 0.0004, + "step": 19693 + }, + { + "epoch": 8.008946726311509, + "grad_norm": 1.9951783658868134, + "learning_rate": 6.785025704353852e-06, + "loss": 0.0233, + "step": 19694 + }, + { + "epoch": 8.009353395689304, + "grad_norm": 0.06538952898133572, + "learning_rate": 6.784066322360876e-06, + "loss": 0.0011, + "step": 19695 + }, + { + "epoch": 8.0097600650671, + "grad_norm": 0.3558537320817987, + "learning_rate": 
6.783106973381082e-06, + "loss": 0.0044, + "step": 19696 + }, + { + "epoch": 8.010166734444896, + "grad_norm": 0.02192335416418906, + "learning_rate": 6.782147657424312e-06, + "loss": 0.0002, + "step": 19697 + }, + { + "epoch": 8.010573403822692, + "grad_norm": 0.8851220450498013, + "learning_rate": 6.7811883745004135e-06, + "loss": 0.0121, + "step": 19698 + }, + { + "epoch": 8.010980073200487, + "grad_norm": 1.2375002936356247, + "learning_rate": 6.78022912461924e-06, + "loss": 0.0127, + "step": 19699 + }, + { + "epoch": 8.011386742578283, + "grad_norm": 1.079599384117329, + "learning_rate": 6.779269907790633e-06, + "loss": 0.0091, + "step": 19700 + }, + { + "epoch": 8.011793411956079, + "grad_norm": 0.8415600035165063, + "learning_rate": 6.7783107240244415e-06, + "loss": 0.0088, + "step": 19701 + }, + { + "epoch": 8.012200081333875, + "grad_norm": 0.21516005990288573, + "learning_rate": 6.777351573330513e-06, + "loss": 0.0034, + "step": 19702 + }, + { + "epoch": 8.012606750711672, + "grad_norm": 0.028784962213784095, + "learning_rate": 6.7763924557186925e-06, + "loss": 0.0003, + "step": 19703 + }, + { + "epoch": 8.013013420089468, + "grad_norm": 1.4001768914245103, + "learning_rate": 6.775433371198825e-06, + "loss": 0.019, + "step": 19704 + }, + { + "epoch": 8.013420089467264, + "grad_norm": 5.9278579479075875, + "learning_rate": 6.774474319780755e-06, + "loss": 0.2159, + "step": 19705 + }, + { + "epoch": 8.01382675884506, + "grad_norm": 2.0388835590261776, + "learning_rate": 6.773515301474332e-06, + "loss": 0.0249, + "step": 19706 + }, + { + "epoch": 8.014233428222855, + "grad_norm": 0.032690274563207, + "learning_rate": 6.772556316289398e-06, + "loss": 0.0005, + "step": 19707 + }, + { + "epoch": 8.014640097600651, + "grad_norm": 0.3186252562041038, + "learning_rate": 6.771597364235796e-06, + "loss": 0.0054, + "step": 19708 + }, + { + "epoch": 8.015046766978447, + "grad_norm": 1.0323924734342456, + "learning_rate": 6.770638445323374e-06, + "loss": 0.0122, + "step": 19709 + }, + { + "epoch": 8.015453436356243, + "grad_norm": 2.083112767459672, + "learning_rate": 6.769679559561975e-06, + "loss": 0.015, + "step": 19710 + }, + { + "epoch": 8.015860105734038, + "grad_norm": 0.12165003585838283, + "learning_rate": 6.768720706961437e-06, + "loss": 0.0015, + "step": 19711 + }, + { + "epoch": 8.016266775111834, + "grad_norm": 0.5513784904234935, + "learning_rate": 6.76776188753161e-06, + "loss": 0.0081, + "step": 19712 + }, + { + "epoch": 8.01667344448963, + "grad_norm": 7.685517806176321, + "learning_rate": 6.766803101282333e-06, + "loss": 0.0721, + "step": 19713 + }, + { + "epoch": 8.017080113867426, + "grad_norm": 8.351557182081937, + "learning_rate": 6.765844348223449e-06, + "loss": 0.1571, + "step": 19714 + }, + { + "epoch": 8.017486783245221, + "grad_norm": 0.23758445261401429, + "learning_rate": 6.764885628364801e-06, + "loss": 0.0032, + "step": 19715 + }, + { + "epoch": 8.017893452623017, + "grad_norm": 0.06474554163742932, + "learning_rate": 6.7639269417162315e-06, + "loss": 0.0005, + "step": 19716 + }, + { + "epoch": 8.018300122000813, + "grad_norm": 1.4557048862260114, + "learning_rate": 6.76296828828758e-06, + "loss": 0.0201, + "step": 19717 + }, + { + "epoch": 8.018706791378609, + "grad_norm": 3.4184779203452607, + "learning_rate": 6.7620096680886875e-06, + "loss": 0.0376, + "step": 19718 + }, + { + "epoch": 8.019113460756405, + "grad_norm": 0.02985482550861563, + "learning_rate": 6.761051081129395e-06, + "loss": 0.0004, + "step": 19719 + }, + { + "epoch": 8.0195201301342, + 
"grad_norm": 0.18848784411354974, + "learning_rate": 6.760092527419544e-06, + "loss": 0.0031, + "step": 19720 + }, + { + "epoch": 8.019926799511996, + "grad_norm": 0.03132444774282679, + "learning_rate": 6.759134006968973e-06, + "loss": 0.0004, + "step": 19721 + }, + { + "epoch": 8.020333468889792, + "grad_norm": 2.9037971103868356, + "learning_rate": 6.758175519787524e-06, + "loss": 0.0578, + "step": 19722 + }, + { + "epoch": 8.020740138267588, + "grad_norm": 3.908804740549328, + "learning_rate": 6.757217065885034e-06, + "loss": 0.1622, + "step": 19723 + }, + { + "epoch": 8.021146807645385, + "grad_norm": 0.012904943786781094, + "learning_rate": 6.756258645271342e-06, + "loss": 0.0001, + "step": 19724 + }, + { + "epoch": 8.021553477023181, + "grad_norm": 8.461759187125214, + "learning_rate": 6.755300257956289e-06, + "loss": 0.2856, + "step": 19725 + }, + { + "epoch": 8.021960146400977, + "grad_norm": 1.674384464036713, + "learning_rate": 6.754341903949712e-06, + "loss": 0.0434, + "step": 19726 + }, + { + "epoch": 8.022366815778772, + "grad_norm": 2.1374452926907814, + "learning_rate": 6.753383583261446e-06, + "loss": 0.0798, + "step": 19727 + }, + { + "epoch": 8.022773485156568, + "grad_norm": 4.7053180053367845, + "learning_rate": 6.752425295901334e-06, + "loss": 0.0849, + "step": 19728 + }, + { + "epoch": 8.023180154534364, + "grad_norm": 1.4958733163484756, + "learning_rate": 6.7514670418792116e-06, + "loss": 0.0238, + "step": 19729 + }, + { + "epoch": 8.02358682391216, + "grad_norm": 0.02161643418606295, + "learning_rate": 6.750508821204916e-06, + "loss": 0.0002, + "step": 19730 + }, + { + "epoch": 8.023993493289955, + "grad_norm": 0.04386478651925388, + "learning_rate": 6.749550633888278e-06, + "loss": 0.0004, + "step": 19731 + }, + { + "epoch": 8.024400162667751, + "grad_norm": 3.6854233142374304, + "learning_rate": 6.748592479939142e-06, + "loss": 0.0526, + "step": 19732 + }, + { + "epoch": 8.024806832045547, + "grad_norm": 0.009768969509694699, + "learning_rate": 6.747634359367341e-06, + "loss": 0.0001, + "step": 19733 + }, + { + "epoch": 8.025213501423343, + "grad_norm": 0.19431875399909218, + "learning_rate": 6.746676272182707e-06, + "loss": 0.0024, + "step": 19734 + }, + { + "epoch": 8.025620170801139, + "grad_norm": 0.10103929526915466, + "learning_rate": 6.745718218395081e-06, + "loss": 0.0013, + "step": 19735 + }, + { + "epoch": 8.026026840178934, + "grad_norm": 2.7476248831717474, + "learning_rate": 6.744760198014296e-06, + "loss": 0.0261, + "step": 19736 + }, + { + "epoch": 8.02643350955673, + "grad_norm": 4.411780259412433, + "learning_rate": 6.7438022110501835e-06, + "loss": 0.0632, + "step": 19737 + }, + { + "epoch": 8.026840178934526, + "grad_norm": 0.08971751747362486, + "learning_rate": 6.742844257512582e-06, + "loss": 0.0012, + "step": 19738 + }, + { + "epoch": 8.027246848312322, + "grad_norm": 3.8418890056462254, + "learning_rate": 6.741886337411323e-06, + "loss": 0.1205, + "step": 19739 + }, + { + "epoch": 8.027653517690117, + "grad_norm": 2.1496647474545663, + "learning_rate": 6.740928450756241e-06, + "loss": 0.0255, + "step": 19740 + }, + { + "epoch": 8.028060187067913, + "grad_norm": 0.18204686103255827, + "learning_rate": 6.739970597557167e-06, + "loss": 0.0017, + "step": 19741 + }, + { + "epoch": 8.028466856445709, + "grad_norm": 0.19653059043891358, + "learning_rate": 6.739012777823936e-06, + "loss": 0.003, + "step": 19742 + }, + { + "epoch": 8.028873525823505, + "grad_norm": 3.5302719363529924, + "learning_rate": 6.738054991566383e-06, + "loss": 0.1124, + 
"step": 19743 + }, + { + "epoch": 8.029280195201302, + "grad_norm": 0.24659589350650563, + "learning_rate": 6.737097238794334e-06, + "loss": 0.0032, + "step": 19744 + }, + { + "epoch": 8.029686864579098, + "grad_norm": 0.35327028763569346, + "learning_rate": 6.736139519517625e-06, + "loss": 0.0014, + "step": 19745 + }, + { + "epoch": 8.030093533956894, + "grad_norm": 0.2056088303842754, + "learning_rate": 6.735181833746087e-06, + "loss": 0.0036, + "step": 19746 + }, + { + "epoch": 8.03050020333469, + "grad_norm": 0.04347562189000148, + "learning_rate": 6.734224181489548e-06, + "loss": 0.0007, + "step": 19747 + }, + { + "epoch": 8.030906872712485, + "grad_norm": 0.5709627606323312, + "learning_rate": 6.7332665627578455e-06, + "loss": 0.0078, + "step": 19748 + }, + { + "epoch": 8.031313542090281, + "grad_norm": 0.1216776070108797, + "learning_rate": 6.732308977560804e-06, + "loss": 0.001, + "step": 19749 + }, + { + "epoch": 8.031720211468077, + "grad_norm": 0.09044878626670207, + "learning_rate": 6.731351425908254e-06, + "loss": 0.001, + "step": 19750 + }, + { + "epoch": 8.032126880845873, + "grad_norm": 0.10068388661321669, + "learning_rate": 6.730393907810028e-06, + "loss": 0.0016, + "step": 19751 + }, + { + "epoch": 8.032533550223668, + "grad_norm": 0.7682894889207503, + "learning_rate": 6.729436423275955e-06, + "loss": 0.0095, + "step": 19752 + }, + { + "epoch": 8.032940219601464, + "grad_norm": 0.6528586976819022, + "learning_rate": 6.728478972315862e-06, + "loss": 0.0052, + "step": 19753 + }, + { + "epoch": 8.03334688897926, + "grad_norm": 0.3824823336771468, + "learning_rate": 6.727521554939582e-06, + "loss": 0.0034, + "step": 19754 + }, + { + "epoch": 8.033753558357056, + "grad_norm": 1.4943750197615198, + "learning_rate": 6.726564171156936e-06, + "loss": 0.0135, + "step": 19755 + }, + { + "epoch": 8.034160227734851, + "grad_norm": 5.670297822436596, + "learning_rate": 6.725606820977759e-06, + "loss": 0.0953, + "step": 19756 + }, + { + "epoch": 8.034566897112647, + "grad_norm": 0.5619032674870388, + "learning_rate": 6.724649504411876e-06, + "loss": 0.0089, + "step": 19757 + }, + { + "epoch": 8.034973566490443, + "grad_norm": 0.013615839979280377, + "learning_rate": 6.7236922214691116e-06, + "loss": 0.0002, + "step": 19758 + }, + { + "epoch": 8.035380235868239, + "grad_norm": 1.7202513394746508, + "learning_rate": 6.722734972159295e-06, + "loss": 0.0181, + "step": 19759 + }, + { + "epoch": 8.035786905246034, + "grad_norm": 1.162719653973114, + "learning_rate": 6.721777756492256e-06, + "loss": 0.0194, + "step": 19760 + }, + { + "epoch": 8.03619357462383, + "grad_norm": 0.08136835739402358, + "learning_rate": 6.720820574477818e-06, + "loss": 0.0012, + "step": 19761 + }, + { + "epoch": 8.036600244001626, + "grad_norm": 11.30244107786243, + "learning_rate": 6.7198634261258055e-06, + "loss": 0.1385, + "step": 19762 + }, + { + "epoch": 8.037006913379422, + "grad_norm": 1.702525934811169, + "learning_rate": 6.718906311446047e-06, + "loss": 0.0137, + "step": 19763 + }, + { + "epoch": 8.037413582757218, + "grad_norm": 0.0011506145985798995, + "learning_rate": 6.717949230448367e-06, + "loss": 0.0, + "step": 19764 + }, + { + "epoch": 8.037820252135015, + "grad_norm": 6.179166094247694, + "learning_rate": 6.716992183142587e-06, + "loss": 0.1391, + "step": 19765 + }, + { + "epoch": 8.03822692151281, + "grad_norm": 0.17477225365118165, + "learning_rate": 6.716035169538538e-06, + "loss": 0.0021, + "step": 19766 + }, + { + "epoch": 8.038633590890607, + "grad_norm": 0.003649760481457678, + 
"learning_rate": 6.71507818964604e-06, + "loss": 0.0, + "step": 19767 + }, + { + "epoch": 8.039040260268402, + "grad_norm": 0.022021085849558853, + "learning_rate": 6.714121243474916e-06, + "loss": 0.0003, + "step": 19768 + }, + { + "epoch": 8.039446929646198, + "grad_norm": 0.045276240189267083, + "learning_rate": 6.713164331034993e-06, + "loss": 0.0007, + "step": 19769 + }, + { + "epoch": 8.039853599023994, + "grad_norm": 0.11413370400819564, + "learning_rate": 6.712207452336093e-06, + "loss": 0.0014, + "step": 19770 + }, + { + "epoch": 8.04026026840179, + "grad_norm": 0.008010208503600817, + "learning_rate": 6.711250607388038e-06, + "loss": 0.0001, + "step": 19771 + }, + { + "epoch": 8.040666937779585, + "grad_norm": 1.8587658488128889, + "learning_rate": 6.710293796200648e-06, + "loss": 0.0213, + "step": 19772 + }, + { + "epoch": 8.041073607157381, + "grad_norm": 0.24531463940501722, + "learning_rate": 6.70933701878375e-06, + "loss": 0.0026, + "step": 19773 + }, + { + "epoch": 8.041480276535177, + "grad_norm": 1.409898445756768, + "learning_rate": 6.708380275147164e-06, + "loss": 0.0119, + "step": 19774 + }, + { + "epoch": 8.041886945912973, + "grad_norm": 3.834788961903926, + "learning_rate": 6.707423565300708e-06, + "loss": 0.0551, + "step": 19775 + }, + { + "epoch": 8.042293615290768, + "grad_norm": 0.06926396760438387, + "learning_rate": 6.7064668892542096e-06, + "loss": 0.001, + "step": 19776 + }, + { + "epoch": 8.042700284668564, + "grad_norm": 0.015105148767987934, + "learning_rate": 6.705510247017486e-06, + "loss": 0.0002, + "step": 19777 + }, + { + "epoch": 8.04310695404636, + "grad_norm": 0.494654218218106, + "learning_rate": 6.704553638600355e-06, + "loss": 0.0079, + "step": 19778 + }, + { + "epoch": 8.043513623424156, + "grad_norm": 0.2126210171247687, + "learning_rate": 6.703597064012643e-06, + "loss": 0.0039, + "step": 19779 + }, + { + "epoch": 8.043920292801952, + "grad_norm": 20.337610636089565, + "learning_rate": 6.702640523264165e-06, + "loss": 0.404, + "step": 19780 + }, + { + "epoch": 8.044326962179747, + "grad_norm": 1.308278766630467, + "learning_rate": 6.701684016364741e-06, + "loss": 0.0202, + "step": 19781 + }, + { + "epoch": 8.044733631557543, + "grad_norm": 0.07007121840558433, + "learning_rate": 6.7007275433241906e-06, + "loss": 0.0012, + "step": 19782 + }, + { + "epoch": 8.045140300935339, + "grad_norm": 1.4305445701738, + "learning_rate": 6.699771104152333e-06, + "loss": 0.0151, + "step": 19783 + }, + { + "epoch": 8.045546970313135, + "grad_norm": 0.03388812597448713, + "learning_rate": 6.698814698858985e-06, + "loss": 0.0004, + "step": 19784 + }, + { + "epoch": 8.045953639690932, + "grad_norm": 2.2673053615394507, + "learning_rate": 6.697858327453966e-06, + "loss": 0.0361, + "step": 19785 + }, + { + "epoch": 8.046360309068728, + "grad_norm": 1.6669888805737627, + "learning_rate": 6.6969019899470935e-06, + "loss": 0.0177, + "step": 19786 + }, + { + "epoch": 8.046766978446524, + "grad_norm": 0.007817241299441079, + "learning_rate": 6.695945686348184e-06, + "loss": 0.0001, + "step": 19787 + }, + { + "epoch": 8.04717364782432, + "grad_norm": 0.1231921932773548, + "learning_rate": 6.6949894166670545e-06, + "loss": 0.0021, + "step": 19788 + }, + { + "epoch": 8.047580317202115, + "grad_norm": 1.3268731103055462, + "learning_rate": 6.694033180913523e-06, + "loss": 0.012, + "step": 19789 + }, + { + "epoch": 8.047986986579911, + "grad_norm": 0.14285045308945005, + "learning_rate": 6.6930769790974045e-06, + "loss": 0.0023, + "step": 19790 + }, + { + "epoch": 
8.048393655957707, + "grad_norm": 0.03471971850006125, + "learning_rate": 6.692120811228514e-06, + "loss": 0.0005, + "step": 19791 + }, + { + "epoch": 8.048800325335502, + "grad_norm": 1.95117338134423, + "learning_rate": 6.69116467731667e-06, + "loss": 0.0329, + "step": 19792 + }, + { + "epoch": 8.049206994713298, + "grad_norm": 0.7794108423246474, + "learning_rate": 6.6902085773716865e-06, + "loss": 0.0103, + "step": 19793 + }, + { + "epoch": 8.049613664091094, + "grad_norm": 4.566523856990112, + "learning_rate": 6.689252511403376e-06, + "loss": 0.0868, + "step": 19794 + }, + { + "epoch": 8.05002033346889, + "grad_norm": 2.915052238830742, + "learning_rate": 6.688296479421554e-06, + "loss": 0.0667, + "step": 19795 + }, + { + "epoch": 8.050427002846686, + "grad_norm": 0.006054764342788676, + "learning_rate": 6.6873404814360374e-06, + "loss": 0.0, + "step": 19796 + }, + { + "epoch": 8.050833672224481, + "grad_norm": 9.82136157448013, + "learning_rate": 6.6863845174566385e-06, + "loss": 0.0547, + "step": 19797 + }, + { + "epoch": 8.051240341602277, + "grad_norm": 0.06268086483888863, + "learning_rate": 6.685428587493167e-06, + "loss": 0.0008, + "step": 19798 + }, + { + "epoch": 8.051647010980073, + "grad_norm": 2.5522310300484294, + "learning_rate": 6.684472691555441e-06, + "loss": 0.0365, + "step": 19799 + }, + { + "epoch": 8.052053680357869, + "grad_norm": 0.0022384888010083853, + "learning_rate": 6.683516829653274e-06, + "loss": 0.0, + "step": 19800 + }, + { + "epoch": 8.052460349735664, + "grad_norm": 0.0272280260075193, + "learning_rate": 6.6825610017964715e-06, + "loss": 0.0003, + "step": 19801 + }, + { + "epoch": 8.05286701911346, + "grad_norm": 0.5532558058096345, + "learning_rate": 6.6816052079948525e-06, + "loss": 0.0056, + "step": 19802 + }, + { + "epoch": 8.053273688491256, + "grad_norm": 3.487828848909391, + "learning_rate": 6.680649448258226e-06, + "loss": 0.0552, + "step": 19803 + }, + { + "epoch": 8.053680357869052, + "grad_norm": 1.4595096317991885, + "learning_rate": 6.679693722596402e-06, + "loss": 0.0158, + "step": 19804 + }, + { + "epoch": 8.054087027246847, + "grad_norm": 0.9459580475696759, + "learning_rate": 6.678738031019194e-06, + "loss": 0.0126, + "step": 19805 + }, + { + "epoch": 8.054493696624645, + "grad_norm": 2.3898507384765533, + "learning_rate": 6.6777823735364126e-06, + "loss": 0.0407, + "step": 19806 + }, + { + "epoch": 8.05490036600244, + "grad_norm": 0.23752192008116005, + "learning_rate": 6.6768267501578675e-06, + "loss": 0.0022, + "step": 19807 + }, + { + "epoch": 8.055307035380237, + "grad_norm": 8.463371446976279, + "learning_rate": 6.675871160893366e-06, + "loss": 0.1829, + "step": 19808 + }, + { + "epoch": 8.055713704758032, + "grad_norm": 1.887619921232841, + "learning_rate": 6.674915605752721e-06, + "loss": 0.0213, + "step": 19809 + }, + { + "epoch": 8.056120374135828, + "grad_norm": 0.26547517364898426, + "learning_rate": 6.6739600847457435e-06, + "loss": 0.0042, + "step": 19810 + }, + { + "epoch": 8.056527043513624, + "grad_norm": 1.6383250470354631, + "learning_rate": 6.673004597882235e-06, + "loss": 0.0274, + "step": 19811 + }, + { + "epoch": 8.05693371289142, + "grad_norm": 0.8691062618746379, + "learning_rate": 6.67204914517201e-06, + "loss": 0.0109, + "step": 19812 + }, + { + "epoch": 8.057340382269215, + "grad_norm": 0.6435701743686112, + "learning_rate": 6.671093726624877e-06, + "loss": 0.0072, + "step": 19813 + }, + { + "epoch": 8.057747051647011, + "grad_norm": 0.38704673655199506, + "learning_rate": 6.6701383422506416e-06, + "loss": 
0.0042, + "step": 19814 + }, + { + "epoch": 8.058153721024807, + "grad_norm": 0.011255597657627915, + "learning_rate": 6.669182992059113e-06, + "loss": 0.0002, + "step": 19815 + }, + { + "epoch": 8.058560390402603, + "grad_norm": 14.285875110934334, + "learning_rate": 6.668227676060097e-06, + "loss": 0.2783, + "step": 19816 + }, + { + "epoch": 8.058967059780398, + "grad_norm": 2.047987411106153, + "learning_rate": 6.6672723942633986e-06, + "loss": 0.0271, + "step": 19817 + }, + { + "epoch": 8.059373729158194, + "grad_norm": 0.009183058374822561, + "learning_rate": 6.666317146678829e-06, + "loss": 0.0001, + "step": 19818 + }, + { + "epoch": 8.05978039853599, + "grad_norm": 3.732020959925785, + "learning_rate": 6.665361933316191e-06, + "loss": 0.0673, + "step": 19819 + }, + { + "epoch": 8.060187067913786, + "grad_norm": 1.7736545085472653, + "learning_rate": 6.664406754185291e-06, + "loss": 0.0234, + "step": 19820 + }, + { + "epoch": 8.060593737291581, + "grad_norm": 0.006112382331190907, + "learning_rate": 6.663451609295934e-06, + "loss": 0.0001, + "step": 19821 + }, + { + "epoch": 8.061000406669377, + "grad_norm": 0.026784216396301564, + "learning_rate": 6.662496498657925e-06, + "loss": 0.0003, + "step": 19822 + }, + { + "epoch": 8.061407076047173, + "grad_norm": 8.839790559845492, + "learning_rate": 6.66154142228107e-06, + "loss": 0.125, + "step": 19823 + }, + { + "epoch": 8.061813745424969, + "grad_norm": 0.1266229729735368, + "learning_rate": 6.660586380175171e-06, + "loss": 0.0014, + "step": 19824 + }, + { + "epoch": 8.062220414802765, + "grad_norm": 5.0898576649895, + "learning_rate": 6.659631372350037e-06, + "loss": 0.0469, + "step": 19825 + }, + { + "epoch": 8.062627084180562, + "grad_norm": 3.6362385304713665, + "learning_rate": 6.658676398815467e-06, + "loss": 0.0651, + "step": 19826 + }, + { + "epoch": 8.063033753558358, + "grad_norm": 0.011146694184359004, + "learning_rate": 6.657721459581263e-06, + "loss": 0.0002, + "step": 19827 + }, + { + "epoch": 8.063440422936154, + "grad_norm": 0.7360022466760137, + "learning_rate": 6.6567665546572315e-06, + "loss": 0.0089, + "step": 19828 + }, + { + "epoch": 8.06384709231395, + "grad_norm": 0.6708281078810594, + "learning_rate": 6.655811684053175e-06, + "loss": 0.0073, + "step": 19829 + }, + { + "epoch": 8.064253761691745, + "grad_norm": 5.835454532064515, + "learning_rate": 6.654856847778892e-06, + "loss": 0.0839, + "step": 19830 + }, + { + "epoch": 8.064660431069541, + "grad_norm": 0.0903988477863866, + "learning_rate": 6.653902045844189e-06, + "loss": 0.0011, + "step": 19831 + }, + { + "epoch": 8.065067100447337, + "grad_norm": 2.216200589498071, + "learning_rate": 6.652947278258867e-06, + "loss": 0.0284, + "step": 19832 + }, + { + "epoch": 8.065473769825132, + "grad_norm": 1.3231037688615694, + "learning_rate": 6.651992545032724e-06, + "loss": 0.0148, + "step": 19833 + }, + { + "epoch": 8.065880439202928, + "grad_norm": 0.010504823197525067, + "learning_rate": 6.651037846175562e-06, + "loss": 0.0001, + "step": 19834 + }, + { + "epoch": 8.066287108580724, + "grad_norm": 0.5082520402136832, + "learning_rate": 6.6500831816971834e-06, + "loss": 0.0064, + "step": 19835 + }, + { + "epoch": 8.06669377795852, + "grad_norm": 6.172569665867651, + "learning_rate": 6.6491285516073865e-06, + "loss": 0.1637, + "step": 19836 + }, + { + "epoch": 8.067100447336315, + "grad_norm": 2.714164840858784, + "learning_rate": 6.648173955915971e-06, + "loss": 0.0362, + "step": 19837 + }, + { + "epoch": 8.067507116714111, + "grad_norm": 0.003902309796066088, + 
"learning_rate": 6.6472193946327365e-06, + "loss": 0.0001, + "step": 19838 + }, + { + "epoch": 8.067913786091907, + "grad_norm": 0.40324231278509837, + "learning_rate": 6.646264867767484e-06, + "loss": 0.0031, + "step": 19839 + }, + { + "epoch": 8.068320455469703, + "grad_norm": 0.1120401550144911, + "learning_rate": 6.645310375330007e-06, + "loss": 0.0012, + "step": 19840 + }, + { + "epoch": 8.068727124847499, + "grad_norm": 0.5407951586064158, + "learning_rate": 6.644355917330111e-06, + "loss": 0.004, + "step": 19841 + }, + { + "epoch": 8.069133794225294, + "grad_norm": 5.898147413718351, + "learning_rate": 6.643401493777591e-06, + "loss": 0.2188, + "step": 19842 + }, + { + "epoch": 8.06954046360309, + "grad_norm": 0.6746265833455396, + "learning_rate": 6.642447104682239e-06, + "loss": 0.0084, + "step": 19843 + }, + { + "epoch": 8.069947132980886, + "grad_norm": 0.21446001569209797, + "learning_rate": 6.641492750053862e-06, + "loss": 0.0026, + "step": 19844 + }, + { + "epoch": 8.070353802358682, + "grad_norm": 0.11926428178945908, + "learning_rate": 6.64053842990225e-06, + "loss": 0.0014, + "step": 19845 + }, + { + "epoch": 8.070760471736477, + "grad_norm": 0.5086802561435593, + "learning_rate": 6.639584144237203e-06, + "loss": 0.0057, + "step": 19846 + }, + { + "epoch": 8.071167141114275, + "grad_norm": 1.7141228776511968, + "learning_rate": 6.638629893068516e-06, + "loss": 0.0212, + "step": 19847 + }, + { + "epoch": 8.07157381049207, + "grad_norm": 4.765923275075126, + "learning_rate": 6.637675676405983e-06, + "loss": 0.0595, + "step": 19848 + }, + { + "epoch": 8.071980479869866, + "grad_norm": 7.738330609589257, + "learning_rate": 6.636721494259404e-06, + "loss": 0.2216, + "step": 19849 + }, + { + "epoch": 8.072387149247662, + "grad_norm": 0.013835428786428178, + "learning_rate": 6.635767346638569e-06, + "loss": 0.0002, + "step": 19850 + }, + { + "epoch": 8.072793818625458, + "grad_norm": 5.692084463647967, + "learning_rate": 6.6348132335532764e-06, + "loss": 0.056, + "step": 19851 + }, + { + "epoch": 8.073200488003254, + "grad_norm": 0.21088163316730807, + "learning_rate": 6.633859155013322e-06, + "loss": 0.002, + "step": 19852 + }, + { + "epoch": 8.07360715738105, + "grad_norm": 0.9956666692047179, + "learning_rate": 6.632905111028493e-06, + "loss": 0.0145, + "step": 19853 + }, + { + "epoch": 8.074013826758845, + "grad_norm": 1.4247302313460404, + "learning_rate": 6.6319511016085955e-06, + "loss": 0.017, + "step": 19854 + }, + { + "epoch": 8.074420496136641, + "grad_norm": 0.04405700107462754, + "learning_rate": 6.630997126763408e-06, + "loss": 0.0007, + "step": 19855 + }, + { + "epoch": 8.074827165514437, + "grad_norm": 6.626093106239333, + "learning_rate": 6.6300431865027325e-06, + "loss": 0.1447, + "step": 19856 + }, + { + "epoch": 8.075233834892233, + "grad_norm": 1.9598362666338236, + "learning_rate": 6.62908928083636e-06, + "loss": 0.015, + "step": 19857 + }, + { + "epoch": 8.075640504270028, + "grad_norm": 1.4252258832563849, + "learning_rate": 6.6281354097740815e-06, + "loss": 0.0183, + "step": 19858 + }, + { + "epoch": 8.076047173647824, + "grad_norm": 0.027877023946907737, + "learning_rate": 6.627181573325691e-06, + "loss": 0.0004, + "step": 19859 + }, + { + "epoch": 8.07645384302562, + "grad_norm": 0.6824075714536351, + "learning_rate": 6.6262277715009796e-06, + "loss": 0.0088, + "step": 19860 + }, + { + "epoch": 8.076860512403416, + "grad_norm": 3.5757696799227445, + "learning_rate": 6.625274004309737e-06, + "loss": 0.0268, + "step": 19861 + }, + { + "epoch": 
8.077267181781211, + "grad_norm": 0.07046934345857618, + "learning_rate": 6.624320271761755e-06, + "loss": 0.001, + "step": 19862 + }, + { + "epoch": 8.077673851159007, + "grad_norm": 0.0611498597699272, + "learning_rate": 6.623366573866825e-06, + "loss": 0.0007, + "step": 19863 + }, + { + "epoch": 8.078080520536803, + "grad_norm": 0.7063702991106614, + "learning_rate": 6.622412910634737e-06, + "loss": 0.0092, + "step": 19864 + }, + { + "epoch": 8.078487189914599, + "grad_norm": 0.1691029450966401, + "learning_rate": 6.621459282075278e-06, + "loss": 0.0022, + "step": 19865 + }, + { + "epoch": 8.078893859292394, + "grad_norm": 0.18466563123236523, + "learning_rate": 6.620505688198242e-06, + "loss": 0.0015, + "step": 19866 + }, + { + "epoch": 8.079300528670192, + "grad_norm": 0.03901941204737955, + "learning_rate": 6.619552129013416e-06, + "loss": 0.0006, + "step": 19867 + }, + { + "epoch": 8.079707198047988, + "grad_norm": 16.832067590950036, + "learning_rate": 6.6185986045305874e-06, + "loss": 0.3494, + "step": 19868 + }, + { + "epoch": 8.080113867425784, + "grad_norm": 0.015091708332403843, + "learning_rate": 6.617645114759548e-06, + "loss": 0.0003, + "step": 19869 + }, + { + "epoch": 8.08052053680358, + "grad_norm": 4.564236178320947, + "learning_rate": 6.616691659710083e-06, + "loss": 0.0542, + "step": 19870 + }, + { + "epoch": 8.080927206181375, + "grad_norm": 2.4351389733804023, + "learning_rate": 6.615738239391979e-06, + "loss": 0.0146, + "step": 19871 + }, + { + "epoch": 8.08133387555917, + "grad_norm": 0.06582609591621227, + "learning_rate": 6.6147848538150276e-06, + "loss": 0.0008, + "step": 19872 + }, + { + "epoch": 8.081740544936967, + "grad_norm": 4.391300231689287, + "learning_rate": 6.613831502989012e-06, + "loss": 0.0729, + "step": 19873 + }, + { + "epoch": 8.082147214314762, + "grad_norm": 1.5308136302261957, + "learning_rate": 6.6128781869237226e-06, + "loss": 0.0151, + "step": 19874 + }, + { + "epoch": 8.082553883692558, + "grad_norm": 0.4318271294618423, + "learning_rate": 6.611924905628939e-06, + "loss": 0.0077, + "step": 19875 + }, + { + "epoch": 8.082960553070354, + "grad_norm": 0.015280964476634273, + "learning_rate": 6.610971659114456e-06, + "loss": 0.0002, + "step": 19876 + }, + { + "epoch": 8.08336722244815, + "grad_norm": 5.167460916181734, + "learning_rate": 6.610018447390052e-06, + "loss": 0.0199, + "step": 19877 + }, + { + "epoch": 8.083773891825945, + "grad_norm": 0.1189848263645624, + "learning_rate": 6.6090652704655145e-06, + "loss": 0.0012, + "step": 19878 + }, + { + "epoch": 8.084180561203741, + "grad_norm": 2.9602600684809843, + "learning_rate": 6.60811212835063e-06, + "loss": 0.0511, + "step": 19879 + }, + { + "epoch": 8.084587230581537, + "grad_norm": 1.1293685949045398, + "learning_rate": 6.607159021055182e-06, + "loss": 0.0145, + "step": 19880 + }, + { + "epoch": 8.084993899959333, + "grad_norm": 0.8289949434527293, + "learning_rate": 6.606205948588952e-06, + "loss": 0.0087, + "step": 19881 + }, + { + "epoch": 8.085400569337128, + "grad_norm": 0.18388318968453668, + "learning_rate": 6.605252910961728e-06, + "loss": 0.0026, + "step": 19882 + }, + { + "epoch": 8.085807238714924, + "grad_norm": 0.002498876575469679, + "learning_rate": 6.604299908183292e-06, + "loss": 0.0, + "step": 19883 + }, + { + "epoch": 8.08621390809272, + "grad_norm": 0.09345125728288611, + "learning_rate": 6.603346940263422e-06, + "loss": 0.0007, + "step": 19884 + }, + { + "epoch": 8.086620577470516, + "grad_norm": 6.737929860227003, + "learning_rate": 6.6023940072119095e-06, + 
"loss": 0.1435, + "step": 19885 + }, + { + "epoch": 8.087027246848312, + "grad_norm": 0.07042872359956312, + "learning_rate": 6.601441109038532e-06, + "loss": 0.001, + "step": 19886 + }, + { + "epoch": 8.087433916226107, + "grad_norm": 0.38715660215946135, + "learning_rate": 6.600488245753072e-06, + "loss": 0.0048, + "step": 19887 + }, + { + "epoch": 8.087840585603905, + "grad_norm": 1.3603014413016923, + "learning_rate": 6.5995354173653085e-06, + "loss": 0.0192, + "step": 19888 + }, + { + "epoch": 8.0882472549817, + "grad_norm": 0.15059574825714248, + "learning_rate": 6.598582623885028e-06, + "loss": 0.0021, + "step": 19889 + }, + { + "epoch": 8.088653924359496, + "grad_norm": 0.00011531640520519178, + "learning_rate": 6.5976298653220084e-06, + "loss": 0.0, + "step": 19890 + }, + { + "epoch": 8.089060593737292, + "grad_norm": 0.5335364793893007, + "learning_rate": 6.59667714168603e-06, + "loss": 0.0059, + "step": 19891 + }, + { + "epoch": 8.089467263115088, + "grad_norm": 0.4347060438535364, + "learning_rate": 6.595724452986873e-06, + "loss": 0.0055, + "step": 19892 + }, + { + "epoch": 8.089873932492884, + "grad_norm": 8.998761668462642, + "learning_rate": 6.5947717992343205e-06, + "loss": 0.2395, + "step": 19893 + }, + { + "epoch": 8.09028060187068, + "grad_norm": 0.6558940095542694, + "learning_rate": 6.5938191804381456e-06, + "loss": 0.0087, + "step": 19894 + }, + { + "epoch": 8.090687271248475, + "grad_norm": 0.03549294711632391, + "learning_rate": 6.592866596608134e-06, + "loss": 0.0005, + "step": 19895 + }, + { + "epoch": 8.091093940626271, + "grad_norm": 2.4490231678487433, + "learning_rate": 6.59191404775406e-06, + "loss": 0.012, + "step": 19896 + }, + { + "epoch": 8.091500610004067, + "grad_norm": 0.005726777185580595, + "learning_rate": 6.5909615338857044e-06, + "loss": 0.0001, + "step": 19897 + }, + { + "epoch": 8.091907279381862, + "grad_norm": 4.371530029752828, + "learning_rate": 6.5900090550128435e-06, + "loss": 0.1663, + "step": 19898 + }, + { + "epoch": 8.092313948759658, + "grad_norm": 0.08452342266941676, + "learning_rate": 6.589056611145257e-06, + "loss": 0.0008, + "step": 19899 + }, + { + "epoch": 8.092720618137454, + "grad_norm": 0.23079436048898605, + "learning_rate": 6.588104202292721e-06, + "loss": 0.0018, + "step": 19900 + }, + { + "epoch": 8.09312728751525, + "grad_norm": 5.466534152345847, + "learning_rate": 6.5871518284650105e-06, + "loss": 0.1253, + "step": 19901 + }, + { + "epoch": 8.093533956893046, + "grad_norm": 0.29791046085687456, + "learning_rate": 6.586199489671907e-06, + "loss": 0.005, + "step": 19902 + }, + { + "epoch": 8.093940626270841, + "grad_norm": 3.013713390295548, + "learning_rate": 6.5852471859231825e-06, + "loss": 0.0284, + "step": 19903 + }, + { + "epoch": 8.094347295648637, + "grad_norm": 7.258634516603885, + "learning_rate": 6.584294917228613e-06, + "loss": 0.2106, + "step": 19904 + }, + { + "epoch": 8.094753965026433, + "grad_norm": 1.0050965935321994, + "learning_rate": 6.583342683597977e-06, + "loss": 0.0102, + "step": 19905 + }, + { + "epoch": 8.095160634404229, + "grad_norm": 0.061113426745640964, + "learning_rate": 6.582390485041048e-06, + "loss": 0.0004, + "step": 19906 + }, + { + "epoch": 8.095567303782024, + "grad_norm": 0.005201506115574306, + "learning_rate": 6.581438321567599e-06, + "loss": 0.0001, + "step": 19907 + }, + { + "epoch": 8.095973973159822, + "grad_norm": 5.100301294566317, + "learning_rate": 6.580486193187408e-06, + "loss": 0.0732, + "step": 19908 + }, + { + "epoch": 8.096380642537618, + "grad_norm": 
0.06290298931623649, + "learning_rate": 6.5795340999102474e-06, + "loss": 0.0006, + "step": 19909 + }, + { + "epoch": 8.096787311915413, + "grad_norm": 0.3709438640067692, + "learning_rate": 6.57858204174589e-06, + "loss": 0.0047, + "step": 19910 + }, + { + "epoch": 8.09719398129321, + "grad_norm": 0.07607191808298308, + "learning_rate": 6.577630018704109e-06, + "loss": 0.0007, + "step": 19911 + }, + { + "epoch": 8.097600650671005, + "grad_norm": 0.0043969637901143865, + "learning_rate": 6.576678030794679e-06, + "loss": 0.0001, + "step": 19912 + }, + { + "epoch": 8.0980073200488, + "grad_norm": 2.3356485958391366, + "learning_rate": 6.575726078027374e-06, + "loss": 0.0368, + "step": 19913 + }, + { + "epoch": 8.098413989426597, + "grad_norm": 0.06520159346099327, + "learning_rate": 6.57477416041196e-06, + "loss": 0.001, + "step": 19914 + }, + { + "epoch": 8.098820658804392, + "grad_norm": 0.06590227948192486, + "learning_rate": 6.573822277958216e-06, + "loss": 0.0008, + "step": 19915 + }, + { + "epoch": 8.099227328182188, + "grad_norm": 6.940607872958566, + "learning_rate": 6.572870430675911e-06, + "loss": 0.0903, + "step": 19916 + }, + { + "epoch": 8.099633997559984, + "grad_norm": 0.32820351294094996, + "learning_rate": 6.571918618574813e-06, + "loss": 0.004, + "step": 19917 + }, + { + "epoch": 8.10004066693778, + "grad_norm": 0.018071436016366047, + "learning_rate": 6.570966841664699e-06, + "loss": 0.0003, + "step": 19918 + }, + { + "epoch": 8.100447336315575, + "grad_norm": 0.005095680674151935, + "learning_rate": 6.570015099955335e-06, + "loss": 0.0001, + "step": 19919 + }, + { + "epoch": 8.100854005693371, + "grad_norm": 2.6364676096311337, + "learning_rate": 6.56906339345649e-06, + "loss": 0.0308, + "step": 19920 + }, + { + "epoch": 8.101260675071167, + "grad_norm": 1.462755796303159, + "learning_rate": 6.5681117221779386e-06, + "loss": 0.0164, + "step": 19921 + }, + { + "epoch": 8.101667344448963, + "grad_norm": 1.2444093554729525, + "learning_rate": 6.5671600861294485e-06, + "loss": 0.0125, + "step": 19922 + }, + { + "epoch": 8.102074013826758, + "grad_norm": 6.034711051927257, + "learning_rate": 6.566208485320787e-06, + "loss": 0.0269, + "step": 19923 + }, + { + "epoch": 8.102480683204554, + "grad_norm": 0.07670941626179359, + "learning_rate": 6.565256919761722e-06, + "loss": 0.0013, + "step": 19924 + }, + { + "epoch": 8.10288735258235, + "grad_norm": 0.08720105342605365, + "learning_rate": 6.564305389462024e-06, + "loss": 0.0005, + "step": 19925 + }, + { + "epoch": 8.103294021960146, + "grad_norm": 0.03543345535160576, + "learning_rate": 6.563353894431461e-06, + "loss": 0.0006, + "step": 19926 + }, + { + "epoch": 8.103700691337941, + "grad_norm": 1.7803904854367218, + "learning_rate": 6.562402434679798e-06, + "loss": 0.0369, + "step": 19927 + }, + { + "epoch": 8.104107360715737, + "grad_norm": 0.035525257119477786, + "learning_rate": 6.561451010216807e-06, + "loss": 0.0004, + "step": 19928 + }, + { + "epoch": 8.104514030093535, + "grad_norm": 0.08177636896365313, + "learning_rate": 6.560499621052252e-06, + "loss": 0.0011, + "step": 19929 + }, + { + "epoch": 8.10492069947133, + "grad_norm": 0.08427390724294948, + "learning_rate": 6.559548267195896e-06, + "loss": 0.0009, + "step": 19930 + }, + { + "epoch": 8.105327368849126, + "grad_norm": 0.7300931306962118, + "learning_rate": 6.5585969486575116e-06, + "loss": 0.0077, + "step": 19931 + }, + { + "epoch": 8.105734038226922, + "grad_norm": 0.2285650335010942, + "learning_rate": 6.557645665446861e-06, + "loss": 0.0033, + "step": 19932 
+ }, + { + "epoch": 8.106140707604718, + "grad_norm": 0.8527358972256368, + "learning_rate": 6.556694417573709e-06, + "loss": 0.0132, + "step": 19933 + }, + { + "epoch": 8.106547376982514, + "grad_norm": 0.7708739224069125, + "learning_rate": 6.555743205047823e-06, + "loss": 0.0068, + "step": 19934 + }, + { + "epoch": 8.10695404636031, + "grad_norm": 0.25939565418828336, + "learning_rate": 6.554792027878966e-06, + "loss": 0.0045, + "step": 19935 + }, + { + "epoch": 8.107360715738105, + "grad_norm": 6.644036348044811, + "learning_rate": 6.553840886076904e-06, + "loss": 0.0438, + "step": 19936 + }, + { + "epoch": 8.107767385115901, + "grad_norm": 1.4847636906029902, + "learning_rate": 6.552889779651397e-06, + "loss": 0.0172, + "step": 19937 + }, + { + "epoch": 8.108174054493697, + "grad_norm": 25.39498509692577, + "learning_rate": 6.5519387086122135e-06, + "loss": 0.2647, + "step": 19938 + }, + { + "epoch": 8.108580723871492, + "grad_norm": 0.1961006109233177, + "learning_rate": 6.550987672969116e-06, + "loss": 0.003, + "step": 19939 + }, + { + "epoch": 8.108987393249288, + "grad_norm": 1.389963872288166, + "learning_rate": 6.550036672731861e-06, + "loss": 0.0229, + "step": 19940 + }, + { + "epoch": 8.109394062627084, + "grad_norm": 2.4501786829733603, + "learning_rate": 6.549085707910219e-06, + "loss": 0.0338, + "step": 19941 + }, + { + "epoch": 8.10980073200488, + "grad_norm": 0.019789417187825245, + "learning_rate": 6.5481347785139506e-06, + "loss": 0.0002, + "step": 19942 + }, + { + "epoch": 8.110207401382675, + "grad_norm": 3.813907224830141, + "learning_rate": 6.547183884552812e-06, + "loss": 0.0364, + "step": 19943 + }, + { + "epoch": 8.110614070760471, + "grad_norm": 2.256458807236014, + "learning_rate": 6.546233026036572e-06, + "loss": 0.0147, + "step": 19944 + }, + { + "epoch": 8.111020740138267, + "grad_norm": 5.029686267185445, + "learning_rate": 6.545282202974988e-06, + "loss": 0.0537, + "step": 19945 + }, + { + "epoch": 8.111427409516063, + "grad_norm": 1.446125951490358, + "learning_rate": 6.544331415377817e-06, + "loss": 0.0198, + "step": 19946 + }, + { + "epoch": 8.111834078893859, + "grad_norm": 0.43786541830467823, + "learning_rate": 6.543380663254827e-06, + "loss": 0.0051, + "step": 19947 + }, + { + "epoch": 8.112240748271654, + "grad_norm": 0.01143808237324246, + "learning_rate": 6.542429946615774e-06, + "loss": 0.0001, + "step": 19948 + }, + { + "epoch": 8.112647417649452, + "grad_norm": 0.2622848818813411, + "learning_rate": 6.541479265470417e-06, + "loss": 0.0034, + "step": 19949 + }, + { + "epoch": 8.113054087027248, + "grad_norm": 6.951975138556997, + "learning_rate": 6.540528619828514e-06, + "loss": 0.1112, + "step": 19950 + }, + { + "epoch": 8.113460756405043, + "grad_norm": 11.549738021868485, + "learning_rate": 6.53957800969983e-06, + "loss": 0.1617, + "step": 19951 + }, + { + "epoch": 8.11386742578284, + "grad_norm": 2.1028343743321103, + "learning_rate": 6.538627435094117e-06, + "loss": 0.0244, + "step": 19952 + }, + { + "epoch": 8.114274095160635, + "grad_norm": 0.0860721443860833, + "learning_rate": 6.5376768960211345e-06, + "loss": 0.001, + "step": 19953 + }, + { + "epoch": 8.11468076453843, + "grad_norm": 9.04531533854386, + "learning_rate": 6.536726392490647e-06, + "loss": 0.4976, + "step": 19954 + }, + { + "epoch": 8.115087433916226, + "grad_norm": 0.5475398193384794, + "learning_rate": 6.5357759245123995e-06, + "loss": 0.0123, + "step": 19955 + }, + { + "epoch": 8.115494103294022, + "grad_norm": 0.00961938295302333, + "learning_rate": 
6.534825492096157e-06, + "loss": 0.0001, + "step": 19956 + }, + { + "epoch": 8.115900772671818, + "grad_norm": 0.7098766872797402, + "learning_rate": 6.533875095251677e-06, + "loss": 0.0068, + "step": 19957 + }, + { + "epoch": 8.116307442049614, + "grad_norm": 0.09577695288889738, + "learning_rate": 6.532924733988711e-06, + "loss": 0.0012, + "step": 19958 + }, + { + "epoch": 8.11671411142741, + "grad_norm": 4.348946987687424, + "learning_rate": 6.531974408317019e-06, + "loss": 0.0448, + "step": 19959 + }, + { + "epoch": 8.117120780805205, + "grad_norm": 0.5517122807958376, + "learning_rate": 6.5310241182463556e-06, + "loss": 0.006, + "step": 19960 + }, + { + "epoch": 8.117527450183001, + "grad_norm": 0.02336925038964918, + "learning_rate": 6.530073863786473e-06, + "loss": 0.0003, + "step": 19961 + }, + { + "epoch": 8.117934119560797, + "grad_norm": 0.000755263287512883, + "learning_rate": 6.529123644947131e-06, + "loss": 0.0, + "step": 19962 + }, + { + "epoch": 8.118340788938593, + "grad_norm": 0.8835643921048407, + "learning_rate": 6.528173461738081e-06, + "loss": 0.0122, + "step": 19963 + }, + { + "epoch": 8.118747458316388, + "grad_norm": 11.148165194588177, + "learning_rate": 6.5272233141690775e-06, + "loss": 0.2107, + "step": 19964 + }, + { + "epoch": 8.119154127694184, + "grad_norm": 11.707927406716486, + "learning_rate": 6.5262732022498735e-06, + "loss": 0.3671, + "step": 19965 + }, + { + "epoch": 8.11956079707198, + "grad_norm": 0.027314157568004597, + "learning_rate": 6.525323125990224e-06, + "loss": 0.0004, + "step": 19966 + }, + { + "epoch": 8.119967466449776, + "grad_norm": 0.22774287016032482, + "learning_rate": 6.524373085399883e-06, + "loss": 0.0019, + "step": 19967 + }, + { + "epoch": 8.120374135827571, + "grad_norm": 0.04514944994892858, + "learning_rate": 6.523423080488599e-06, + "loss": 0.0003, + "step": 19968 + }, + { + "epoch": 8.120780805205367, + "grad_norm": 0.06039331638767788, + "learning_rate": 6.522473111266128e-06, + "loss": 0.0005, + "step": 19969 + }, + { + "epoch": 8.121187474583165, + "grad_norm": 0.2032472263584741, + "learning_rate": 6.521523177742222e-06, + "loss": 0.0013, + "step": 19970 + }, + { + "epoch": 8.12159414396096, + "grad_norm": 0.5296055039788599, + "learning_rate": 6.520573279926629e-06, + "loss": 0.0062, + "step": 19971 + }, + { + "epoch": 8.122000813338756, + "grad_norm": 0.08877305401244895, + "learning_rate": 6.519623417829104e-06, + "loss": 0.0013, + "step": 19972 + }, + { + "epoch": 8.122407482716552, + "grad_norm": 4.206352034224587, + "learning_rate": 6.5186735914593955e-06, + "loss": 0.2037, + "step": 19973 + }, + { + "epoch": 8.122814152094348, + "grad_norm": 1.113155816252252, + "learning_rate": 6.517723800827253e-06, + "loss": 0.0079, + "step": 19974 + }, + { + "epoch": 8.123220821472144, + "grad_norm": 0.5208264775464475, + "learning_rate": 6.5167740459424315e-06, + "loss": 0.0057, + "step": 19975 + }, + { + "epoch": 8.12362749084994, + "grad_norm": 0.292663642207815, + "learning_rate": 6.515824326814677e-06, + "loss": 0.0039, + "step": 19976 + }, + { + "epoch": 8.124034160227735, + "grad_norm": 1.0372513725595125, + "learning_rate": 6.514874643453739e-06, + "loss": 0.0116, + "step": 19977 + }, + { + "epoch": 8.12444082960553, + "grad_norm": 4.014738242180252, + "learning_rate": 6.513924995869365e-06, + "loss": 0.0295, + "step": 19978 + }, + { + "epoch": 8.124847498983327, + "grad_norm": 0.6717024822691854, + "learning_rate": 6.512975384071307e-06, + "loss": 0.0131, + "step": 19979 + }, + { + "epoch": 8.125254168361122, + 
"grad_norm": 0.08927797139092158, + "learning_rate": 6.512025808069314e-06, + "loss": 0.0011, + "step": 19980 + }, + { + "epoch": 8.125660837738918, + "grad_norm": 0.015308177201837306, + "learning_rate": 6.5110762678731264e-06, + "loss": 0.0002, + "step": 19981 + }, + { + "epoch": 8.126067507116714, + "grad_norm": 0.04582736481414658, + "learning_rate": 6.510126763492499e-06, + "loss": 0.0005, + "step": 19982 + }, + { + "epoch": 8.12647417649451, + "grad_norm": 0.7211815033061164, + "learning_rate": 6.509177294937179e-06, + "loss": 0.0093, + "step": 19983 + }, + { + "epoch": 8.126880845872305, + "grad_norm": 0.5785853018011599, + "learning_rate": 6.508227862216907e-06, + "loss": 0.0068, + "step": 19984 + }, + { + "epoch": 8.127287515250101, + "grad_norm": 1.5674670678069074, + "learning_rate": 6.507278465341435e-06, + "loss": 0.0175, + "step": 19985 + }, + { + "epoch": 8.127694184627897, + "grad_norm": 0.6044962986417672, + "learning_rate": 6.50632910432051e-06, + "loss": 0.0069, + "step": 19986 + }, + { + "epoch": 8.128100854005693, + "grad_norm": 0.43431141650124444, + "learning_rate": 6.505379779163872e-06, + "loss": 0.0054, + "step": 19987 + }, + { + "epoch": 8.128507523383488, + "grad_norm": 0.011193986615139415, + "learning_rate": 6.504430489881271e-06, + "loss": 0.0002, + "step": 19988 + }, + { + "epoch": 8.128914192761284, + "grad_norm": 9.757735565896255, + "learning_rate": 6.503481236482449e-06, + "loss": 0.2055, + "step": 19989 + }, + { + "epoch": 8.129320862139082, + "grad_norm": 0.6658958447694079, + "learning_rate": 6.502532018977155e-06, + "loss": 0.0066, + "step": 19990 + }, + { + "epoch": 8.129727531516878, + "grad_norm": 0.016992284932470507, + "learning_rate": 6.501582837375125e-06, + "loss": 0.0002, + "step": 19991 + }, + { + "epoch": 8.130134200894673, + "grad_norm": 3.3921823732636285, + "learning_rate": 6.5006336916861115e-06, + "loss": 0.0826, + "step": 19992 + }, + { + "epoch": 8.130540870272469, + "grad_norm": 0.005909394478049784, + "learning_rate": 6.499684581919854e-06, + "loss": 0.0001, + "step": 19993 + }, + { + "epoch": 8.130947539650265, + "grad_norm": 0.08175578567845951, + "learning_rate": 6.498735508086094e-06, + "loss": 0.0013, + "step": 19994 + }, + { + "epoch": 8.13135420902806, + "grad_norm": 0.004544721989585767, + "learning_rate": 6.497786470194578e-06, + "loss": 0.0001, + "step": 19995 + }, + { + "epoch": 8.131760878405856, + "grad_norm": 0.1595359417451656, + "learning_rate": 6.496837468255046e-06, + "loss": 0.0021, + "step": 19996 + }, + { + "epoch": 8.132167547783652, + "grad_norm": 2.6447963132746404, + "learning_rate": 6.495888502277239e-06, + "loss": 0.0398, + "step": 19997 + }, + { + "epoch": 8.132574217161448, + "grad_norm": 4.347418193777534, + "learning_rate": 6.494939572270902e-06, + "loss": 0.045, + "step": 19998 + }, + { + "epoch": 8.132980886539244, + "grad_norm": 0.0779560904350261, + "learning_rate": 6.493990678245775e-06, + "loss": 0.0009, + "step": 19999 + }, + { + "epoch": 8.13338755591704, + "grad_norm": 0.0444533499272169, + "learning_rate": 6.4930418202115965e-06, + "loss": 0.0005, + "step": 20000 + }, + { + "epoch": 8.133794225294835, + "grad_norm": 0.004043590900347979, + "learning_rate": 6.49209299817811e-06, + "loss": 0.0001, + "step": 20001 + }, + { + "epoch": 8.134200894672631, + "grad_norm": 0.8526721529019804, + "learning_rate": 6.491144212155054e-06, + "loss": 0.0093, + "step": 20002 + }, + { + "epoch": 8.134607564050427, + "grad_norm": 0.03279643899194171, + "learning_rate": 6.490195462152169e-06, + "loss": 0.0003, 
+ "step": 20003 + }, + { + "epoch": 8.135014233428222, + "grad_norm": 5.28479122418879, + "learning_rate": 6.489246748179192e-06, + "loss": 0.0977, + "step": 20004 + }, + { + "epoch": 8.135420902806018, + "grad_norm": 1.370369696069121, + "learning_rate": 6.488298070245867e-06, + "loss": 0.0138, + "step": 20005 + }, + { + "epoch": 8.135827572183814, + "grad_norm": 0.19342832316458064, + "learning_rate": 6.487349428361928e-06, + "loss": 0.0026, + "step": 20006 + }, + { + "epoch": 8.13623424156161, + "grad_norm": 0.3960541965379298, + "learning_rate": 6.486400822537113e-06, + "loss": 0.0041, + "step": 20007 + }, + { + "epoch": 8.136640910939406, + "grad_norm": 0.22865792805805224, + "learning_rate": 6.485452252781165e-06, + "loss": 0.0031, + "step": 20008 + }, + { + "epoch": 8.137047580317201, + "grad_norm": 0.04889613656737725, + "learning_rate": 6.484503719103818e-06, + "loss": 0.0008, + "step": 20009 + }, + { + "epoch": 8.137454249694997, + "grad_norm": 0.16581609198900008, + "learning_rate": 6.483555221514808e-06, + "loss": 0.0013, + "step": 20010 + }, + { + "epoch": 8.137860919072795, + "grad_norm": 0.01181872990211702, + "learning_rate": 6.482606760023875e-06, + "loss": 0.0002, + "step": 20011 + }, + { + "epoch": 8.13826758845059, + "grad_norm": 8.291074550630444, + "learning_rate": 6.481658334640755e-06, + "loss": 0.1759, + "step": 20012 + }, + { + "epoch": 8.138674257828386, + "grad_norm": 0.4611471789783179, + "learning_rate": 6.480709945375181e-06, + "loss": 0.0072, + "step": 20013 + }, + { + "epoch": 8.139080927206182, + "grad_norm": 0.09907768689958497, + "learning_rate": 6.47976159223689e-06, + "loss": 0.0013, + "step": 20014 + }, + { + "epoch": 8.139487596583978, + "grad_norm": 0.14152169632291317, + "learning_rate": 6.4788132752356205e-06, + "loss": 0.0014, + "step": 20015 + }, + { + "epoch": 8.139894265961773, + "grad_norm": 0.11540254006719546, + "learning_rate": 6.477864994381103e-06, + "loss": 0.0013, + "step": 20016 + }, + { + "epoch": 8.14030093533957, + "grad_norm": 0.0029336465086736247, + "learning_rate": 6.476916749683073e-06, + "loss": 0.0, + "step": 20017 + }, + { + "epoch": 8.140707604717365, + "grad_norm": 0.816314743685451, + "learning_rate": 6.475968541151267e-06, + "loss": 0.0093, + "step": 20018 + }, + { + "epoch": 8.14111427409516, + "grad_norm": 0.2873141250668527, + "learning_rate": 6.475020368795419e-06, + "loss": 0.0036, + "step": 20019 + }, + { + "epoch": 8.141520943472957, + "grad_norm": 3.5739376575126607, + "learning_rate": 6.474072232625258e-06, + "loss": 0.0459, + "step": 20020 + }, + { + "epoch": 8.141927612850752, + "grad_norm": 0.022822323919523848, + "learning_rate": 6.473124132650524e-06, + "loss": 0.0003, + "step": 20021 + }, + { + "epoch": 8.142334282228548, + "grad_norm": 0.6936267869355928, + "learning_rate": 6.472176068880942e-06, + "loss": 0.0095, + "step": 20022 + }, + { + "epoch": 8.142740951606344, + "grad_norm": 0.21735043286888012, + "learning_rate": 6.4712280413262495e-06, + "loss": 0.003, + "step": 20023 + }, + { + "epoch": 8.14314762098414, + "grad_norm": 1.764926352634861, + "learning_rate": 6.470280049996178e-06, + "loss": 0.028, + "step": 20024 + }, + { + "epoch": 8.143554290361935, + "grad_norm": 1.0776069409707452, + "learning_rate": 6.469332094900458e-06, + "loss": 0.0076, + "step": 20025 + }, + { + "epoch": 8.143960959739731, + "grad_norm": 1.5506279643509298, + "learning_rate": 6.468384176048821e-06, + "loss": 0.0152, + "step": 20026 + }, + { + "epoch": 8.144367629117527, + "grad_norm": 0.1591992560470299, + 
"learning_rate": 6.467436293450996e-06, + "loss": 0.0028, + "step": 20027 + }, + { + "epoch": 8.144774298495323, + "grad_norm": 1.1988004756508448, + "learning_rate": 6.466488447116718e-06, + "loss": 0.0253, + "step": 20028 + }, + { + "epoch": 8.145180967873118, + "grad_norm": 9.738041174355038, + "learning_rate": 6.465540637055714e-06, + "loss": 0.072, + "step": 20029 + }, + { + "epoch": 8.145587637250914, + "grad_norm": 0.09714368891694747, + "learning_rate": 6.464592863277712e-06, + "loss": 0.0011, + "step": 20030 + }, + { + "epoch": 8.145994306628712, + "grad_norm": 0.057248198759008995, + "learning_rate": 6.463645125792446e-06, + "loss": 0.0009, + "step": 20031 + }, + { + "epoch": 8.146400976006507, + "grad_norm": 0.0007648499831591267, + "learning_rate": 6.462697424609642e-06, + "loss": 0.0, + "step": 20032 + }, + { + "epoch": 8.146807645384303, + "grad_norm": 2.867630693681062, + "learning_rate": 6.461749759739027e-06, + "loss": 0.0323, + "step": 20033 + }, + { + "epoch": 8.147214314762099, + "grad_norm": 7.743026824434513, + "learning_rate": 6.460802131190334e-06, + "loss": 0.2377, + "step": 20034 + }, + { + "epoch": 8.147620984139895, + "grad_norm": 1.7424547833046216, + "learning_rate": 6.459854538973289e-06, + "loss": 0.0217, + "step": 20035 + }, + { + "epoch": 8.14802765351769, + "grad_norm": 1.3259775064251225, + "learning_rate": 6.458906983097616e-06, + "loss": 0.0172, + "step": 20036 + }, + { + "epoch": 8.148434322895486, + "grad_norm": 0.030936622861469486, + "learning_rate": 6.457959463573047e-06, + "loss": 0.0005, + "step": 20037 + }, + { + "epoch": 8.148840992273282, + "grad_norm": 1.0822029779576334, + "learning_rate": 6.457011980409307e-06, + "loss": 0.0058, + "step": 20038 + }, + { + "epoch": 8.149247661651078, + "grad_norm": 0.0038007896228921087, + "learning_rate": 6.456064533616122e-06, + "loss": 0.0001, + "step": 20039 + }, + { + "epoch": 8.149654331028874, + "grad_norm": 1.1365056886940728, + "learning_rate": 6.4551171232032165e-06, + "loss": 0.0123, + "step": 20040 + }, + { + "epoch": 8.15006100040667, + "grad_norm": 0.0785279670938413, + "learning_rate": 6.45416974918032e-06, + "loss": 0.001, + "step": 20041 + }, + { + "epoch": 8.150467669784465, + "grad_norm": 0.028668188340053212, + "learning_rate": 6.453222411557156e-06, + "loss": 0.0003, + "step": 20042 + }, + { + "epoch": 8.150874339162261, + "grad_norm": 0.06123467582565898, + "learning_rate": 6.452275110343445e-06, + "loss": 0.0008, + "step": 20043 + }, + { + "epoch": 8.151281008540057, + "grad_norm": 0.14025277848096207, + "learning_rate": 6.45132784554892e-06, + "loss": 0.002, + "step": 20044 + }, + { + "epoch": 8.151687677917852, + "grad_norm": 0.03907472902290912, + "learning_rate": 6.450380617183298e-06, + "loss": 0.0004, + "step": 20045 + }, + { + "epoch": 8.152094347295648, + "grad_norm": 0.01105612500055309, + "learning_rate": 6.449433425256306e-06, + "loss": 0.0001, + "step": 20046 + }, + { + "epoch": 8.152501016673444, + "grad_norm": 0.12146219391401102, + "learning_rate": 6.448486269777667e-06, + "loss": 0.0013, + "step": 20047 + }, + { + "epoch": 8.15290768605124, + "grad_norm": 1.1137482781370485, + "learning_rate": 6.447539150757105e-06, + "loss": 0.0157, + "step": 20048 + }, + { + "epoch": 8.153314355429035, + "grad_norm": 0.1367413589929311, + "learning_rate": 6.446592068204341e-06, + "loss": 0.0018, + "step": 20049 + }, + { + "epoch": 8.153721024806831, + "grad_norm": 0.010723727642355647, + "learning_rate": 6.445645022129097e-06, + "loss": 0.0002, + "step": 20050 + }, + { + "epoch": 
8.154127694184627, + "grad_norm": 0.0005566397997348357, + "learning_rate": 6.444698012541098e-06, + "loss": 0.0, + "step": 20051 + }, + { + "epoch": 8.154534363562425, + "grad_norm": 0.1524222822596145, + "learning_rate": 6.443751039450062e-06, + "loss": 0.0011, + "step": 20052 + }, + { + "epoch": 8.15494103294022, + "grad_norm": 0.21666921659604305, + "learning_rate": 6.442804102865712e-06, + "loss": 0.0023, + "step": 20053 + }, + { + "epoch": 8.155347702318016, + "grad_norm": 0.005106405020895018, + "learning_rate": 6.4418572027977675e-06, + "loss": 0.0001, + "step": 20054 + }, + { + "epoch": 8.155754371695812, + "grad_norm": 0.2037455446883352, + "learning_rate": 6.440910339255948e-06, + "loss": 0.0023, + "step": 20055 + }, + { + "epoch": 8.156161041073608, + "grad_norm": 0.008606125997658486, + "learning_rate": 6.439963512249977e-06, + "loss": 0.0001, + "step": 20056 + }, + { + "epoch": 8.156567710451403, + "grad_norm": 0.6440160874431899, + "learning_rate": 6.4390167217895724e-06, + "loss": 0.0108, + "step": 20057 + }, + { + "epoch": 8.1569743798292, + "grad_norm": 2.405599620786308, + "learning_rate": 6.438069967884451e-06, + "loss": 0.024, + "step": 20058 + }, + { + "epoch": 8.157381049206995, + "grad_norm": 1.0404510105874931, + "learning_rate": 6.4371232505443356e-06, + "loss": 0.0081, + "step": 20059 + }, + { + "epoch": 8.15778771858479, + "grad_norm": 0.28664581477651, + "learning_rate": 6.4361765697789445e-06, + "loss": 0.0017, + "step": 20060 + }, + { + "epoch": 8.158194387962586, + "grad_norm": 0.013680425518419487, + "learning_rate": 6.435229925597991e-06, + "loss": 0.0001, + "step": 20061 + }, + { + "epoch": 8.158601057340382, + "grad_norm": 4.801914099275061, + "learning_rate": 6.4342833180112e-06, + "loss": 0.0368, + "step": 20062 + }, + { + "epoch": 8.159007726718178, + "grad_norm": 0.0063844905432328125, + "learning_rate": 6.4333367470282844e-06, + "loss": 0.0001, + "step": 20063 + }, + { + "epoch": 8.159414396095974, + "grad_norm": 0.03995572882579639, + "learning_rate": 6.4323902126589595e-06, + "loss": 0.0005, + "step": 20064 + }, + { + "epoch": 8.15982106547377, + "grad_norm": 0.24833939662276472, + "learning_rate": 6.431443714912948e-06, + "loss": 0.0017, + "step": 20065 + }, + { + "epoch": 8.160227734851565, + "grad_norm": 0.07382021030886654, + "learning_rate": 6.430497253799962e-06, + "loss": 0.0011, + "step": 20066 + }, + { + "epoch": 8.160634404229361, + "grad_norm": 0.2313955983802565, + "learning_rate": 6.429550829329718e-06, + "loss": 0.0033, + "step": 20067 + }, + { + "epoch": 8.161041073607157, + "grad_norm": 23.326343067176296, + "learning_rate": 6.42860444151193e-06, + "loss": 0.4357, + "step": 20068 + }, + { + "epoch": 8.161447742984953, + "grad_norm": 0.015104384507609613, + "learning_rate": 6.427658090356317e-06, + "loss": 0.0002, + "step": 20069 + }, + { + "epoch": 8.161854412362748, + "grad_norm": 0.08221362451210348, + "learning_rate": 6.426711775872592e-06, + "loss": 0.0007, + "step": 20070 + }, + { + "epoch": 8.162261081740544, + "grad_norm": 0.10681615245529531, + "learning_rate": 6.425765498070467e-06, + "loss": 0.0013, + "step": 20071 + }, + { + "epoch": 8.162667751118342, + "grad_norm": 0.7048868435989901, + "learning_rate": 6.424819256959659e-06, + "loss": 0.0054, + "step": 20072 + }, + { + "epoch": 8.163074420496137, + "grad_norm": 0.0017569896629481196, + "learning_rate": 6.423873052549881e-06, + "loss": 0.0, + "step": 20073 + }, + { + "epoch": 8.163481089873933, + "grad_norm": 0.01374446488849078, + "learning_rate": 
6.422926884850844e-06, + "loss": 0.0002, + "step": 20074 + }, + { + "epoch": 8.163887759251729, + "grad_norm": 2.0358544004668486, + "learning_rate": 6.421980753872266e-06, + "loss": 0.0236, + "step": 20075 + }, + { + "epoch": 8.164294428629525, + "grad_norm": 5.508146806938209, + "learning_rate": 6.421034659623855e-06, + "loss": 0.0834, + "step": 20076 + }, + { + "epoch": 8.16470109800732, + "grad_norm": 0.0031392344828313785, + "learning_rate": 6.4200886021153215e-06, + "loss": 0.0001, + "step": 20077 + }, + { + "epoch": 8.165107767385116, + "grad_norm": 3.5214715439445796, + "learning_rate": 6.419142581356384e-06, + "loss": 0.0434, + "step": 20078 + }, + { + "epoch": 8.165514436762912, + "grad_norm": 0.10056016395981335, + "learning_rate": 6.418196597356751e-06, + "loss": 0.0008, + "step": 20079 + }, + { + "epoch": 8.165921106140708, + "grad_norm": 0.19520374349085695, + "learning_rate": 6.41725065012613e-06, + "loss": 0.002, + "step": 20080 + }, + { + "epoch": 8.166327775518504, + "grad_norm": 0.8738191047430118, + "learning_rate": 6.416304739674235e-06, + "loss": 0.0106, + "step": 20081 + }, + { + "epoch": 8.1667344448963, + "grad_norm": 0.1779354079168008, + "learning_rate": 6.415358866010774e-06, + "loss": 0.0043, + "step": 20082 + }, + { + "epoch": 8.167141114274095, + "grad_norm": 8.687913041471585, + "learning_rate": 6.414413029145461e-06, + "loss": 0.2482, + "step": 20083 + }, + { + "epoch": 8.16754778365189, + "grad_norm": 0.009191682978877267, + "learning_rate": 6.413467229088001e-06, + "loss": 0.0001, + "step": 20084 + }, + { + "epoch": 8.167954453029687, + "grad_norm": 0.01511633826913863, + "learning_rate": 6.412521465848106e-06, + "loss": 0.0002, + "step": 20085 + }, + { + "epoch": 8.168361122407482, + "grad_norm": 4.783828738442129, + "learning_rate": 6.411575739435484e-06, + "loss": 0.0658, + "step": 20086 + }, + { + "epoch": 8.168767791785278, + "grad_norm": 0.006362484973068968, + "learning_rate": 6.41063004985984e-06, + "loss": 0.0001, + "step": 20087 + }, + { + "epoch": 8.169174461163074, + "grad_norm": 0.03443980672934789, + "learning_rate": 6.4096843971308885e-06, + "loss": 0.0006, + "step": 20088 + }, + { + "epoch": 8.16958113054087, + "grad_norm": 2.3835846568340195, + "learning_rate": 6.408738781258333e-06, + "loss": 0.0119, + "step": 20089 + }, + { + "epoch": 8.169987799918665, + "grad_norm": 0.04963505963579117, + "learning_rate": 6.407793202251879e-06, + "loss": 0.0004, + "step": 20090 + }, + { + "epoch": 8.170394469296461, + "grad_norm": 0.41205969322881003, + "learning_rate": 6.406847660121239e-06, + "loss": 0.0047, + "step": 20091 + }, + { + "epoch": 8.170801138674257, + "grad_norm": 0.027659998882815947, + "learning_rate": 6.405902154876116e-06, + "loss": 0.0004, + "step": 20092 + }, + { + "epoch": 8.171207808052054, + "grad_norm": 0.2005258836632291, + "learning_rate": 6.4049566865262144e-06, + "loss": 0.0028, + "step": 20093 + }, + { + "epoch": 8.17161447742985, + "grad_norm": 0.016502238379742837, + "learning_rate": 6.404011255081241e-06, + "loss": 0.0001, + "step": 20094 + }, + { + "epoch": 8.172021146807646, + "grad_norm": 9.429651789688469, + "learning_rate": 6.403065860550904e-06, + "loss": 0.5184, + "step": 20095 + }, + { + "epoch": 8.172427816185442, + "grad_norm": 0.5414887191537966, + "learning_rate": 6.402120502944906e-06, + "loss": 0.0069, + "step": 20096 + }, + { + "epoch": 8.172834485563238, + "grad_norm": 0.2318115513958118, + "learning_rate": 6.401175182272949e-06, + "loss": 0.0017, + "step": 20097 + }, + { + "epoch": 8.173241154941033, + 
"grad_norm": 10.213276882695208, + "learning_rate": 6.400229898544744e-06, + "loss": 0.2434, + "step": 20098 + }, + { + "epoch": 8.173647824318829, + "grad_norm": 0.39713976583937505, + "learning_rate": 6.39928465176999e-06, + "loss": 0.0053, + "step": 20099 + }, + { + "epoch": 8.174054493696625, + "grad_norm": 1.9672887946597246, + "learning_rate": 6.398339441958389e-06, + "loss": 0.0329, + "step": 20100 + }, + { + "epoch": 8.17446116307442, + "grad_norm": 0.16904811743986095, + "learning_rate": 6.397394269119648e-06, + "loss": 0.0015, + "step": 20101 + }, + { + "epoch": 8.174867832452216, + "grad_norm": 0.02419580488092595, + "learning_rate": 6.3964491332634684e-06, + "loss": 0.0003, + "step": 20102 + }, + { + "epoch": 8.175274501830012, + "grad_norm": 0.47396389357110963, + "learning_rate": 6.395504034399549e-06, + "loss": 0.0061, + "step": 20103 + }, + { + "epoch": 8.175681171207808, + "grad_norm": 1.198367147999807, + "learning_rate": 6.394558972537599e-06, + "loss": 0.0183, + "step": 20104 + }, + { + "epoch": 8.176087840585604, + "grad_norm": 0.2201982644524405, + "learning_rate": 6.393613947687314e-06, + "loss": 0.0026, + "step": 20105 + }, + { + "epoch": 8.1764945099634, + "grad_norm": 0.12755196481049147, + "learning_rate": 6.392668959858397e-06, + "loss": 0.0013, + "step": 20106 + }, + { + "epoch": 8.176901179341195, + "grad_norm": 0.42311710285249693, + "learning_rate": 6.391724009060547e-06, + "loss": 0.0037, + "step": 20107 + }, + { + "epoch": 8.177307848718991, + "grad_norm": 0.14520956992760767, + "learning_rate": 6.390779095303467e-06, + "loss": 0.0017, + "step": 20108 + }, + { + "epoch": 8.177714518096787, + "grad_norm": 0.186214472282179, + "learning_rate": 6.389834218596858e-06, + "loss": 0.0026, + "step": 20109 + }, + { + "epoch": 8.178121187474582, + "grad_norm": 1.5433754321633801, + "learning_rate": 6.388889378950414e-06, + "loss": 0.0127, + "step": 20110 + }, + { + "epoch": 8.178527856852378, + "grad_norm": 4.033256502944181, + "learning_rate": 6.387944576373842e-06, + "loss": 0.1568, + "step": 20111 + }, + { + "epoch": 8.178934526230174, + "grad_norm": 0.40000133870556687, + "learning_rate": 6.386999810876835e-06, + "loss": 0.0075, + "step": 20112 + }, + { + "epoch": 8.179341195607972, + "grad_norm": 1.8494385921923442, + "learning_rate": 6.386055082469091e-06, + "loss": 0.0225, + "step": 20113 + }, + { + "epoch": 8.179747864985767, + "grad_norm": 0.3281446552514249, + "learning_rate": 6.385110391160314e-06, + "loss": 0.0038, + "step": 20114 + }, + { + "epoch": 8.180154534363563, + "grad_norm": 0.00239953069986705, + "learning_rate": 6.384165736960197e-06, + "loss": 0.0, + "step": 20115 + }, + { + "epoch": 8.180561203741359, + "grad_norm": 0.8510032146531786, + "learning_rate": 6.38322111987844e-06, + "loss": 0.0099, + "step": 20116 + }, + { + "epoch": 8.180967873119155, + "grad_norm": 0.7824553471254623, + "learning_rate": 6.382276539924735e-06, + "loss": 0.0102, + "step": 20117 + }, + { + "epoch": 8.18137454249695, + "grad_norm": 0.05338209172772124, + "learning_rate": 6.381331997108785e-06, + "loss": 0.0006, + "step": 20118 + }, + { + "epoch": 8.181781211874746, + "grad_norm": 0.24148268422502808, + "learning_rate": 6.380387491440283e-06, + "loss": 0.0039, + "step": 20119 + }, + { + "epoch": 8.182187881252542, + "grad_norm": 0.17558413486089436, + "learning_rate": 6.379443022928923e-06, + "loss": 0.0016, + "step": 20120 + }, + { + "epoch": 8.182594550630338, + "grad_norm": 0.35592302171077955, + "learning_rate": 6.378498591584404e-06, + "loss": 0.0051, + "step": 
20121 + }, + { + "epoch": 8.183001220008133, + "grad_norm": 0.14829476546081388, + "learning_rate": 6.37755419741642e-06, + "loss": 0.0014, + "step": 20122 + }, + { + "epoch": 8.18340788938593, + "grad_norm": 0.13342435508869122, + "learning_rate": 6.3766098404346645e-06, + "loss": 0.0023, + "step": 20123 + }, + { + "epoch": 8.183814558763725, + "grad_norm": 0.006197508763918425, + "learning_rate": 6.375665520648832e-06, + "loss": 0.0001, + "step": 20124 + }, + { + "epoch": 8.18422122814152, + "grad_norm": 0.018731087420014687, + "learning_rate": 6.374721238068619e-06, + "loss": 0.0003, + "step": 20125 + }, + { + "epoch": 8.184627897519317, + "grad_norm": 0.182642701670228, + "learning_rate": 6.373776992703715e-06, + "loss": 0.0029, + "step": 20126 + }, + { + "epoch": 8.185034566897112, + "grad_norm": 0.036368081048281896, + "learning_rate": 6.372832784563818e-06, + "loss": 0.0006, + "step": 20127 + }, + { + "epoch": 8.185441236274908, + "grad_norm": 1.5100251207827617, + "learning_rate": 6.371888613658616e-06, + "loss": 0.0113, + "step": 20128 + }, + { + "epoch": 8.185847905652704, + "grad_norm": 0.01575741486061818, + "learning_rate": 6.370944479997805e-06, + "loss": 0.0002, + "step": 20129 + }, + { + "epoch": 8.1862545750305, + "grad_norm": 0.03458869188033704, + "learning_rate": 6.3700003835910725e-06, + "loss": 0.0005, + "step": 20130 + }, + { + "epoch": 8.186661244408295, + "grad_norm": 3.261944863768014, + "learning_rate": 6.369056324448115e-06, + "loss": 0.058, + "step": 20131 + }, + { + "epoch": 8.187067913786091, + "grad_norm": 0.012385599016991667, + "learning_rate": 6.368112302578622e-06, + "loss": 0.0002, + "step": 20132 + }, + { + "epoch": 8.187474583163887, + "grad_norm": 1.5520397105814048, + "learning_rate": 6.367168317992281e-06, + "loss": 0.0189, + "step": 20133 + }, + { + "epoch": 8.187881252541684, + "grad_norm": 0.09108343232650187, + "learning_rate": 6.366224370698789e-06, + "loss": 0.0012, + "step": 20134 + }, + { + "epoch": 8.18828792191948, + "grad_norm": 0.0017744405192787825, + "learning_rate": 6.365280460707832e-06, + "loss": 0.0, + "step": 20135 + }, + { + "epoch": 8.188694591297276, + "grad_norm": 1.244224854506436, + "learning_rate": 6.3643365880290985e-06, + "loss": 0.0123, + "step": 20136 + }, + { + "epoch": 8.189101260675072, + "grad_norm": 0.02674172599466464, + "learning_rate": 6.3633927526722815e-06, + "loss": 0.0003, + "step": 20137 + }, + { + "epoch": 8.189507930052867, + "grad_norm": 0.4022939167436692, + "learning_rate": 6.362448954647069e-06, + "loss": 0.007, + "step": 20138 + }, + { + "epoch": 8.189914599430663, + "grad_norm": 0.2864741769880324, + "learning_rate": 6.3615051939631464e-06, + "loss": 0.0026, + "step": 20139 + }, + { + "epoch": 8.190321268808459, + "grad_norm": 2.5380542054308104, + "learning_rate": 6.360561470630207e-06, + "loss": 0.0363, + "step": 20140 + }, + { + "epoch": 8.190727938186255, + "grad_norm": 5.285917713779162, + "learning_rate": 6.359617784657935e-06, + "loss": 0.1137, + "step": 20141 + }, + { + "epoch": 8.19113460756405, + "grad_norm": 0.34529384289112997, + "learning_rate": 6.358674136056018e-06, + "loss": 0.0035, + "step": 20142 + }, + { + "epoch": 8.191541276941846, + "grad_norm": 0.8023732431547114, + "learning_rate": 6.3577305248341414e-06, + "loss": 0.0085, + "step": 20143 + }, + { + "epoch": 8.191947946319642, + "grad_norm": 0.01219304854328423, + "learning_rate": 6.356786951001996e-06, + "loss": 0.0002, + "step": 20144 + }, + { + "epoch": 8.192354615697438, + "grad_norm": 0.6521197575313267, + 
"learning_rate": 6.3558434145692675e-06, + "loss": 0.0067, + "step": 20145 + }, + { + "epoch": 8.192761285075234, + "grad_norm": 0.49760145062885397, + "learning_rate": 6.354899915545638e-06, + "loss": 0.005, + "step": 20146 + }, + { + "epoch": 8.19316795445303, + "grad_norm": 0.124929255573781, + "learning_rate": 6.353956453940797e-06, + "loss": 0.0016, + "step": 20147 + }, + { + "epoch": 8.193574623830825, + "grad_norm": 0.0022166502068500155, + "learning_rate": 6.3530130297644275e-06, + "loss": 0.0, + "step": 20148 + }, + { + "epoch": 8.193981293208621, + "grad_norm": 0.03513761793697085, + "learning_rate": 6.352069643026214e-06, + "loss": 0.0003, + "step": 20149 + }, + { + "epoch": 8.194387962586417, + "grad_norm": 0.001721908730450122, + "learning_rate": 6.351126293735843e-06, + "loss": 0.0, + "step": 20150 + }, + { + "epoch": 8.194794631964212, + "grad_norm": 0.6275919147422689, + "learning_rate": 6.350182981902997e-06, + "loss": 0.0038, + "step": 20151 + }, + { + "epoch": 8.195201301342008, + "grad_norm": 0.003260576445389697, + "learning_rate": 6.349239707537358e-06, + "loss": 0.0, + "step": 20152 + }, + { + "epoch": 8.195607970719804, + "grad_norm": 3.605064888476844, + "learning_rate": 6.348296470648616e-06, + "loss": 0.0824, + "step": 20153 + }, + { + "epoch": 8.196014640097602, + "grad_norm": 15.962281508955593, + "learning_rate": 6.347353271246443e-06, + "loss": 0.4909, + "step": 20154 + }, + { + "epoch": 8.196421309475397, + "grad_norm": 3.1639996897679716, + "learning_rate": 6.346410109340529e-06, + "loss": 0.0233, + "step": 20155 + }, + { + "epoch": 8.196827978853193, + "grad_norm": 0.026233753739961507, + "learning_rate": 6.345466984940555e-06, + "loss": 0.0002, + "step": 20156 + }, + { + "epoch": 8.197234648230989, + "grad_norm": 0.004004422188748207, + "learning_rate": 6.344523898056201e-06, + "loss": 0.0, + "step": 20157 + }, + { + "epoch": 8.197641317608785, + "grad_norm": 5.4851077140855224, + "learning_rate": 6.343580848697146e-06, + "loss": 0.1003, + "step": 20158 + }, + { + "epoch": 8.19804798698658, + "grad_norm": 0.3226098359654958, + "learning_rate": 6.342637836873077e-06, + "loss": 0.0038, + "step": 20159 + }, + { + "epoch": 8.198454656364376, + "grad_norm": 1.062319517146081, + "learning_rate": 6.34169486259367e-06, + "loss": 0.0085, + "step": 20160 + }, + { + "epoch": 8.198861325742172, + "grad_norm": 5.752547851642466, + "learning_rate": 6.340751925868606e-06, + "loss": 0.1632, + "step": 20161 + }, + { + "epoch": 8.199267995119968, + "grad_norm": 0.03296514975397504, + "learning_rate": 6.339809026707567e-06, + "loss": 0.0004, + "step": 20162 + }, + { + "epoch": 8.199674664497763, + "grad_norm": 0.881185403878195, + "learning_rate": 6.33886616512023e-06, + "loss": 0.0113, + "step": 20163 + }, + { + "epoch": 8.20008133387556, + "grad_norm": 0.021299299642884264, + "learning_rate": 6.337923341116272e-06, + "loss": 0.0003, + "step": 20164 + }, + { + "epoch": 8.200488003253355, + "grad_norm": 4.232143305024223, + "learning_rate": 6.336980554705376e-06, + "loss": 0.0803, + "step": 20165 + }, + { + "epoch": 8.20089467263115, + "grad_norm": 0.014657424626506257, + "learning_rate": 6.336037805897219e-06, + "loss": 0.0002, + "step": 20166 + }, + { + "epoch": 8.201301342008946, + "grad_norm": 7.1624422903632, + "learning_rate": 6.335095094701475e-06, + "loss": 0.0944, + "step": 20167 + }, + { + "epoch": 8.201708011386742, + "grad_norm": 5.6687793697544775, + "learning_rate": 6.334152421127826e-06, + "loss": 0.1231, + "step": 20168 + }, + { + "epoch": 8.202114680764538, 
+ "grad_norm": 1.977740072671953, + "learning_rate": 6.33320978518595e-06, + "loss": 0.0238, + "step": 20169 + }, + { + "epoch": 8.202521350142334, + "grad_norm": 0.36287594239937326, + "learning_rate": 6.3322671868855176e-06, + "loss": 0.0037, + "step": 20170 + }, + { + "epoch": 8.20292801952013, + "grad_norm": 0.006409323148383345, + "learning_rate": 6.331324626236208e-06, + "loss": 0.0001, + "step": 20171 + }, + { + "epoch": 8.203334688897925, + "grad_norm": 3.7952634393378166, + "learning_rate": 6.330382103247701e-06, + "loss": 0.0574, + "step": 20172 + }, + { + "epoch": 8.203741358275721, + "grad_norm": 0.0017294808268137987, + "learning_rate": 6.329439617929667e-06, + "loss": 0.0, + "step": 20173 + }, + { + "epoch": 8.204148027653517, + "grad_norm": 0.23960611017164332, + "learning_rate": 6.328497170291781e-06, + "loss": 0.0016, + "step": 20174 + }, + { + "epoch": 8.204554697031314, + "grad_norm": 0.13857932087837926, + "learning_rate": 6.3275547603437205e-06, + "loss": 0.0014, + "step": 20175 + }, + { + "epoch": 8.20496136640911, + "grad_norm": 0.10084564803422595, + "learning_rate": 6.326612388095159e-06, + "loss": 0.0014, + "step": 20176 + }, + { + "epoch": 8.205368035786906, + "grad_norm": 0.5584394573305463, + "learning_rate": 6.325670053555768e-06, + "loss": 0.0123, + "step": 20177 + }, + { + "epoch": 8.205774705164702, + "grad_norm": 1.7491838771922639, + "learning_rate": 6.324727756735227e-06, + "loss": 0.023, + "step": 20178 + }, + { + "epoch": 8.206181374542497, + "grad_norm": 0.07015810793651223, + "learning_rate": 6.3237854976432035e-06, + "loss": 0.0009, + "step": 20179 + }, + { + "epoch": 8.206588043920293, + "grad_norm": 0.4328897621701657, + "learning_rate": 6.32284327628937e-06, + "loss": 0.0061, + "step": 20180 + }, + { + "epoch": 8.206994713298089, + "grad_norm": 0.02130531890250251, + "learning_rate": 6.3219010926834045e-06, + "loss": 0.0002, + "step": 20181 + }, + { + "epoch": 8.207401382675885, + "grad_norm": 0.01346909401411096, + "learning_rate": 6.320958946834974e-06, + "loss": 0.0002, + "step": 20182 + }, + { + "epoch": 8.20780805205368, + "grad_norm": 1.7739470437875875, + "learning_rate": 6.320016838753752e-06, + "loss": 0.018, + "step": 20183 + }, + { + "epoch": 8.208214721431476, + "grad_norm": 0.0019992639656947558, + "learning_rate": 6.319074768449407e-06, + "loss": 0.0, + "step": 20184 + }, + { + "epoch": 8.208621390809272, + "grad_norm": 3.2180098618985804, + "learning_rate": 6.318132735931615e-06, + "loss": 0.0356, + "step": 20185 + }, + { + "epoch": 8.209028060187068, + "grad_norm": 0.390871532696154, + "learning_rate": 6.317190741210042e-06, + "loss": 0.0046, + "step": 20186 + }, + { + "epoch": 8.209434729564864, + "grad_norm": 5.2087914946425, + "learning_rate": 6.316248784294359e-06, + "loss": 0.057, + "step": 20187 + }, + { + "epoch": 8.20984139894266, + "grad_norm": 1.0049280750972638, + "learning_rate": 6.3153068651942375e-06, + "loss": 0.0091, + "step": 20188 + }, + { + "epoch": 8.210248068320455, + "grad_norm": 0.7931631819858234, + "learning_rate": 6.314364983919346e-06, + "loss": 0.0077, + "step": 20189 + }, + { + "epoch": 8.21065473769825, + "grad_norm": 0.228639872725361, + "learning_rate": 6.313423140479349e-06, + "loss": 0.0019, + "step": 20190 + }, + { + "epoch": 8.211061407076047, + "grad_norm": 4.376231761317201, + "learning_rate": 6.312481334883923e-06, + "loss": 0.1217, + "step": 20191 + }, + { + "epoch": 8.211468076453842, + "grad_norm": 0.477091936188409, + "learning_rate": 6.311539567142731e-06, + "loss": 0.0052, + "step": 
20192 + }, + { + "epoch": 8.211874745831638, + "grad_norm": 0.011685946116852001, + "learning_rate": 6.31059783726544e-06, + "loss": 0.0002, + "step": 20193 + }, + { + "epoch": 8.212281415209434, + "grad_norm": 0.26290432582230167, + "learning_rate": 6.309656145261721e-06, + "loss": 0.0033, + "step": 20194 + }, + { + "epoch": 8.212688084587231, + "grad_norm": 0.04057081312039937, + "learning_rate": 6.308714491141239e-06, + "loss": 0.0004, + "step": 20195 + }, + { + "epoch": 8.213094753965027, + "grad_norm": 0.011657921067249676, + "learning_rate": 6.30777287491366e-06, + "loss": 0.0002, + "step": 20196 + }, + { + "epoch": 8.213501423342823, + "grad_norm": 0.003170368998848505, + "learning_rate": 6.30683129658865e-06, + "loss": 0.0, + "step": 20197 + }, + { + "epoch": 8.213908092720619, + "grad_norm": 6.959489123087342, + "learning_rate": 6.305889756175877e-06, + "loss": 0.1054, + "step": 20198 + }, + { + "epoch": 8.214314762098414, + "grad_norm": 2.2253803949780497, + "learning_rate": 6.304948253685004e-06, + "loss": 0.0162, + "step": 20199 + }, + { + "epoch": 8.21472143147621, + "grad_norm": 0.4340522500985172, + "learning_rate": 6.304006789125696e-06, + "loss": 0.0026, + "step": 20200 + }, + { + "epoch": 8.215128100854006, + "grad_norm": 0.033397040962013555, + "learning_rate": 6.30306536250762e-06, + "loss": 0.0004, + "step": 20201 + }, + { + "epoch": 8.215534770231802, + "grad_norm": 2.476118929517038, + "learning_rate": 6.302123973840439e-06, + "loss": 0.0283, + "step": 20202 + }, + { + "epoch": 8.215941439609598, + "grad_norm": 0.1591528003090887, + "learning_rate": 6.3011826231338135e-06, + "loss": 0.0023, + "step": 20203 + }, + { + "epoch": 8.216348108987393, + "grad_norm": 0.5124686670872155, + "learning_rate": 6.300241310397412e-06, + "loss": 0.0069, + "step": 20204 + }, + { + "epoch": 8.216754778365189, + "grad_norm": 4.085527422144974, + "learning_rate": 6.299300035640896e-06, + "loss": 0.0429, + "step": 20205 + }, + { + "epoch": 8.217161447742985, + "grad_norm": 0.05252652798763582, + "learning_rate": 6.298358798873927e-06, + "loss": 0.0006, + "step": 20206 + }, + { + "epoch": 8.21756811712078, + "grad_norm": 1.962841763232234, + "learning_rate": 6.297417600106168e-06, + "loss": 0.0285, + "step": 20207 + }, + { + "epoch": 8.217974786498576, + "grad_norm": 0.013745329354849138, + "learning_rate": 6.296476439347281e-06, + "loss": 0.0002, + "step": 20208 + }, + { + "epoch": 8.218381455876372, + "grad_norm": 0.012786834505138567, + "learning_rate": 6.295535316606929e-06, + "loss": 0.0001, + "step": 20209 + }, + { + "epoch": 8.218788125254168, + "grad_norm": 0.007913325239696357, + "learning_rate": 6.294594231894767e-06, + "loss": 0.0001, + "step": 20210 + }, + { + "epoch": 8.219194794631964, + "grad_norm": 0.03682613294852855, + "learning_rate": 6.293653185220463e-06, + "loss": 0.0004, + "step": 20211 + }, + { + "epoch": 8.21960146400976, + "grad_norm": 0.5170415393164688, + "learning_rate": 6.292712176593676e-06, + "loss": 0.0049, + "step": 20212 + }, + { + "epoch": 8.220008133387555, + "grad_norm": 0.02511169046721648, + "learning_rate": 6.2917712060240595e-06, + "loss": 0.0004, + "step": 20213 + }, + { + "epoch": 8.220414802765351, + "grad_norm": 0.012197811798994663, + "learning_rate": 6.290830273521283e-06, + "loss": 0.0002, + "step": 20214 + }, + { + "epoch": 8.220821472143147, + "grad_norm": 1.6318978981876915, + "learning_rate": 6.2898893790949975e-06, + "loss": 0.0266, + "step": 20215 + }, + { + "epoch": 8.221228141520944, + "grad_norm": 0.0965258771198858, + 
"learning_rate": 6.288948522754865e-06, + "loss": 0.0015, + "step": 20216 + }, + { + "epoch": 8.22163481089874, + "grad_norm": 2.0365648427652063, + "learning_rate": 6.288007704510545e-06, + "loss": 0.0332, + "step": 20217 + }, + { + "epoch": 8.222041480276536, + "grad_norm": 0.6791123026749253, + "learning_rate": 6.287066924371694e-06, + "loss": 0.0101, + "step": 20218 + }, + { + "epoch": 8.222448149654332, + "grad_norm": 4.966728883518674, + "learning_rate": 6.286126182347967e-06, + "loss": 0.0756, + "step": 20219 + }, + { + "epoch": 8.222854819032127, + "grad_norm": 0.054920738218325965, + "learning_rate": 6.285185478449027e-06, + "loss": 0.0007, + "step": 20220 + }, + { + "epoch": 8.223261488409923, + "grad_norm": 0.027897588120320976, + "learning_rate": 6.2842448126845265e-06, + "loss": 0.0003, + "step": 20221 + }, + { + "epoch": 8.223668157787719, + "grad_norm": 0.7479286856042416, + "learning_rate": 6.283304185064125e-06, + "loss": 0.0043, + "step": 20222 + }, + { + "epoch": 8.224074827165515, + "grad_norm": 2.4184131492216503, + "learning_rate": 6.282363595597473e-06, + "loss": 0.0246, + "step": 20223 + }, + { + "epoch": 8.22448149654331, + "grad_norm": 2.415274505277601, + "learning_rate": 6.281423044294233e-06, + "loss": 0.0325, + "step": 20224 + }, + { + "epoch": 8.224888165921106, + "grad_norm": 0.016200102163667657, + "learning_rate": 6.280482531164056e-06, + "loss": 0.0001, + "step": 20225 + }, + { + "epoch": 8.225294835298902, + "grad_norm": 0.10229863724225599, + "learning_rate": 6.2795420562165955e-06, + "loss": 0.0014, + "step": 20226 + }, + { + "epoch": 8.225701504676698, + "grad_norm": 9.362611337663107, + "learning_rate": 6.2786016194615105e-06, + "loss": 0.1759, + "step": 20227 + }, + { + "epoch": 8.226108174054493, + "grad_norm": 0.06670533673279802, + "learning_rate": 6.277661220908453e-06, + "loss": 0.0012, + "step": 20228 + }, + { + "epoch": 8.22651484343229, + "grad_norm": 0.23493744582453543, + "learning_rate": 6.2767208605670736e-06, + "loss": 0.0024, + "step": 20229 + }, + { + "epoch": 8.226921512810085, + "grad_norm": 0.07629282465946176, + "learning_rate": 6.275780538447031e-06, + "loss": 0.0008, + "step": 20230 + }, + { + "epoch": 8.22732818218788, + "grad_norm": 14.601020655533612, + "learning_rate": 6.274840254557976e-06, + "loss": 0.1419, + "step": 20231 + }, + { + "epoch": 8.227734851565677, + "grad_norm": 0.11514284817381648, + "learning_rate": 6.273900008909561e-06, + "loss": 0.001, + "step": 20232 + }, + { + "epoch": 8.228141520943472, + "grad_norm": 0.019089789419439115, + "learning_rate": 6.272959801511436e-06, + "loss": 0.0003, + "step": 20233 + }, + { + "epoch": 8.228548190321268, + "grad_norm": 0.8676745103988829, + "learning_rate": 6.272019632373256e-06, + "loss": 0.0069, + "step": 20234 + }, + { + "epoch": 8.228954859699064, + "grad_norm": 0.005564480600703253, + "learning_rate": 6.27107950150467e-06, + "loss": 0.0001, + "step": 20235 + }, + { + "epoch": 8.229361529076861, + "grad_norm": 0.3058701139803362, + "learning_rate": 6.270139408915328e-06, + "loss": 0.0023, + "step": 20236 + }, + { + "epoch": 8.229768198454657, + "grad_norm": 5.0695192564400475, + "learning_rate": 6.269199354614884e-06, + "loss": 0.0697, + "step": 20237 + }, + { + "epoch": 8.230174867832453, + "grad_norm": 2.002557413730902, + "learning_rate": 6.268259338612988e-06, + "loss": 0.0159, + "step": 20238 + }, + { + "epoch": 8.230581537210249, + "grad_norm": 0.6167979245429082, + "learning_rate": 6.267319360919284e-06, + "loss": 0.0078, + "step": 20239 + }, + { + "epoch": 
8.230988206588044, + "grad_norm": 0.017506312665250847, + "learning_rate": 6.266379421543427e-06, + "loss": 0.0002, + "step": 20240 + }, + { + "epoch": 8.23139487596584, + "grad_norm": 0.2570091097002572, + "learning_rate": 6.265439520495066e-06, + "loss": 0.0024, + "step": 20241 + }, + { + "epoch": 8.231801545343636, + "grad_norm": 0.016132807906374773, + "learning_rate": 6.264499657783845e-06, + "loss": 0.0002, + "step": 20242 + }, + { + "epoch": 8.232208214721432, + "grad_norm": 2.4707744052371408, + "learning_rate": 6.263559833419417e-06, + "loss": 0.0197, + "step": 20243 + }, + { + "epoch": 8.232614884099227, + "grad_norm": 0.5656720082767627, + "learning_rate": 6.262620047411427e-06, + "loss": 0.006, + "step": 20244 + }, + { + "epoch": 8.233021553477023, + "grad_norm": 0.005481693172146486, + "learning_rate": 6.2616802997695255e-06, + "loss": 0.0001, + "step": 20245 + }, + { + "epoch": 8.233428222854819, + "grad_norm": 0.032356821246968696, + "learning_rate": 6.260740590503354e-06, + "loss": 0.0003, + "step": 20246 + }, + { + "epoch": 8.233834892232615, + "grad_norm": 0.4156658138103733, + "learning_rate": 6.259800919622564e-06, + "loss": 0.0022, + "step": 20247 + }, + { + "epoch": 8.23424156161041, + "grad_norm": 0.2904208163545279, + "learning_rate": 6.258861287136799e-06, + "loss": 0.0025, + "step": 20248 + }, + { + "epoch": 8.234648230988206, + "grad_norm": 2.105982450231881, + "learning_rate": 6.2579216930557065e-06, + "loss": 0.0286, + "step": 20249 + }, + { + "epoch": 8.235054900366002, + "grad_norm": 0.3752755562705699, + "learning_rate": 6.2569821373889316e-06, + "loss": 0.0042, + "step": 20250 + }, + { + "epoch": 8.235461569743798, + "grad_norm": 0.12960325665544298, + "learning_rate": 6.256042620146119e-06, + "loss": 0.0014, + "step": 20251 + }, + { + "epoch": 8.235868239121594, + "grad_norm": 0.2201948493001289, + "learning_rate": 6.25510314133691e-06, + "loss": 0.0028, + "step": 20252 + }, + { + "epoch": 8.23627490849939, + "grad_norm": 0.06698980289190841, + "learning_rate": 6.254163700970959e-06, + "loss": 0.0006, + "step": 20253 + }, + { + "epoch": 8.236681577877185, + "grad_norm": 0.9944118266728238, + "learning_rate": 6.253224299057897e-06, + "loss": 0.0068, + "step": 20254 + }, + { + "epoch": 8.237088247254981, + "grad_norm": 2.2003876500013932, + "learning_rate": 6.2522849356073755e-06, + "loss": 0.046, + "step": 20255 + }, + { + "epoch": 8.237494916632777, + "grad_norm": 1.3022796957646208, + "learning_rate": 6.251345610629035e-06, + "loss": 0.0099, + "step": 20256 + }, + { + "epoch": 8.237901586010574, + "grad_norm": 0.003969015608167061, + "learning_rate": 6.250406324132517e-06, + "loss": 0.0001, + "step": 20257 + }, + { + "epoch": 8.23830825538837, + "grad_norm": 9.087238622974546, + "learning_rate": 6.249467076127467e-06, + "loss": 0.1637, + "step": 20258 + }, + { + "epoch": 8.238714924766166, + "grad_norm": 0.08138252066764316, + "learning_rate": 6.2485278666235234e-06, + "loss": 0.0007, + "step": 20259 + }, + { + "epoch": 8.239121594143962, + "grad_norm": 0.013131625650974787, + "learning_rate": 6.247588695630329e-06, + "loss": 0.0001, + "step": 20260 + }, + { + "epoch": 8.239528263521757, + "grad_norm": 0.1443904936820213, + "learning_rate": 6.246649563157528e-06, + "loss": 0.0009, + "step": 20261 + }, + { + "epoch": 8.239934932899553, + "grad_norm": 0.0946517294337507, + "learning_rate": 6.245710469214757e-06, + "loss": 0.0012, + "step": 20262 + }, + { + "epoch": 8.240341602277349, + "grad_norm": 0.20299193857840075, + "learning_rate": 
6.244771413811657e-06, + "loss": 0.0026, + "step": 20263 + }, + { + "epoch": 8.240748271655145, + "grad_norm": 1.504022833172715, + "learning_rate": 6.243832396957868e-06, + "loss": 0.0165, + "step": 20264 + }, + { + "epoch": 8.24115494103294, + "grad_norm": 1.3012394744164386, + "learning_rate": 6.24289341866303e-06, + "loss": 0.0163, + "step": 20265 + }, + { + "epoch": 8.241561610410736, + "grad_norm": 2.8536035114908413, + "learning_rate": 6.241954478936783e-06, + "loss": 0.0371, + "step": 20266 + }, + { + "epoch": 8.241968279788532, + "grad_norm": 0.08571913998632752, + "learning_rate": 6.241015577788762e-06, + "loss": 0.0013, + "step": 20267 + }, + { + "epoch": 8.242374949166328, + "grad_norm": 1.6581736078713023, + "learning_rate": 6.240076715228611e-06, + "loss": 0.0199, + "step": 20268 + }, + { + "epoch": 8.242781618544123, + "grad_norm": 6.418005735972211, + "learning_rate": 6.239137891265965e-06, + "loss": 0.0496, + "step": 20269 + }, + { + "epoch": 8.24318828792192, + "grad_norm": 0.0421885593581514, + "learning_rate": 6.238199105910458e-06, + "loss": 0.0002, + "step": 20270 + }, + { + "epoch": 8.243594957299715, + "grad_norm": 0.27241187777949977, + "learning_rate": 6.237260359171733e-06, + "loss": 0.0024, + "step": 20271 + }, + { + "epoch": 8.24400162667751, + "grad_norm": 0.0020026527516170925, + "learning_rate": 6.236321651059425e-06, + "loss": 0.0, + "step": 20272 + }, + { + "epoch": 8.244408296055306, + "grad_norm": 0.016923010881561094, + "learning_rate": 6.235382981583169e-06, + "loss": 0.0002, + "step": 20273 + }, + { + "epoch": 8.244814965433102, + "grad_norm": 0.11270462024427845, + "learning_rate": 6.234444350752599e-06, + "loss": 0.0011, + "step": 20274 + }, + { + "epoch": 8.245221634810898, + "grad_norm": 0.011571920502547442, + "learning_rate": 6.233505758577355e-06, + "loss": 0.0001, + "step": 20275 + }, + { + "epoch": 8.245628304188694, + "grad_norm": 4.880546249902786, + "learning_rate": 6.23256720506707e-06, + "loss": 0.0395, + "step": 20276 + }, + { + "epoch": 8.246034973566491, + "grad_norm": 0.15074562668634245, + "learning_rate": 6.2316286902313775e-06, + "loss": 0.0016, + "step": 20277 + }, + { + "epoch": 8.246441642944287, + "grad_norm": 1.3145068328466518, + "learning_rate": 6.230690214079913e-06, + "loss": 0.0133, + "step": 20278 + }, + { + "epoch": 8.246848312322083, + "grad_norm": 0.044853425122756474, + "learning_rate": 6.229751776622312e-06, + "loss": 0.0006, + "step": 20279 + }, + { + "epoch": 8.247254981699879, + "grad_norm": 0.14365422226343616, + "learning_rate": 6.2288133778682055e-06, + "loss": 0.0017, + "step": 20280 + }, + { + "epoch": 8.247661651077674, + "grad_norm": 0.015756020369917554, + "learning_rate": 6.227875017827228e-06, + "loss": 0.0002, + "step": 20281 + }, + { + "epoch": 8.24806832045547, + "grad_norm": 0.008909104157029845, + "learning_rate": 6.226936696509013e-06, + "loss": 0.0001, + "step": 20282 + }, + { + "epoch": 8.248474989833266, + "grad_norm": 0.1825772021615021, + "learning_rate": 6.225998413923189e-06, + "loss": 0.0016, + "step": 20283 + }, + { + "epoch": 8.248881659211062, + "grad_norm": 1.8461801903905335, + "learning_rate": 6.225060170079393e-06, + "loss": 0.0307, + "step": 20284 + }, + { + "epoch": 8.249288328588857, + "grad_norm": 0.012464614648838237, + "learning_rate": 6.224121964987255e-06, + "loss": 0.0002, + "step": 20285 + }, + { + "epoch": 8.249694997966653, + "grad_norm": 0.05787515701153518, + "learning_rate": 6.223183798656405e-06, + "loss": 0.0007, + "step": 20286 + }, + { + "epoch": 
8.250101667344449, + "grad_norm": 1.8907205643040617, + "learning_rate": 6.222245671096472e-06, + "loss": 0.0193, + "step": 20287 + }, + { + "epoch": 8.250508336722245, + "grad_norm": 1.0550151260896006, + "learning_rate": 6.221307582317089e-06, + "loss": 0.0102, + "step": 20288 + }, + { + "epoch": 8.25091500610004, + "grad_norm": 6.626785778200377, + "learning_rate": 6.220369532327888e-06, + "loss": 0.043, + "step": 20289 + }, + { + "epoch": 8.251321675477836, + "grad_norm": 0.32914564110721845, + "learning_rate": 6.219431521138493e-06, + "loss": 0.0037, + "step": 20290 + }, + { + "epoch": 8.251728344855632, + "grad_norm": 0.16403483791000564, + "learning_rate": 6.218493548758537e-06, + "loss": 0.0023, + "step": 20291 + }, + { + "epoch": 8.252135014233428, + "grad_norm": 7.093070371563491, + "learning_rate": 6.2175556151976475e-06, + "loss": 0.0599, + "step": 20292 + }, + { + "epoch": 8.252541683611224, + "grad_norm": 0.1847691073441393, + "learning_rate": 6.216617720465455e-06, + "loss": 0.0016, + "step": 20293 + }, + { + "epoch": 8.25294835298902, + "grad_norm": 2.1008313177904174, + "learning_rate": 6.215679864571584e-06, + "loss": 0.0299, + "step": 20294 + }, + { + "epoch": 8.253355022366815, + "grad_norm": 0.4501726980784715, + "learning_rate": 6.214742047525666e-06, + "loss": 0.0039, + "step": 20295 + }, + { + "epoch": 8.25376169174461, + "grad_norm": 0.41354512740550725, + "learning_rate": 6.213804269337322e-06, + "loss": 0.0029, + "step": 20296 + }, + { + "epoch": 8.254168361122407, + "grad_norm": 0.030702283137485048, + "learning_rate": 6.212866530016186e-06, + "loss": 0.0005, + "step": 20297 + }, + { + "epoch": 8.254575030500204, + "grad_norm": 5.699300200656195, + "learning_rate": 6.21192882957188e-06, + "loss": 0.0497, + "step": 20298 + }, + { + "epoch": 8.254981699878, + "grad_norm": 2.295023721098895, + "learning_rate": 6.210991168014033e-06, + "loss": 0.0253, + "step": 20299 + }, + { + "epoch": 8.255388369255796, + "grad_norm": 0.0038614189945657025, + "learning_rate": 6.210053545352266e-06, + "loss": 0.0, + "step": 20300 + }, + { + "epoch": 8.255795038633591, + "grad_norm": 5.273002026644352, + "learning_rate": 6.209115961596208e-06, + "loss": 0.1507, + "step": 20301 + }, + { + "epoch": 8.256201708011387, + "grad_norm": 0.012677099573744873, + "learning_rate": 6.208178416755484e-06, + "loss": 0.0002, + "step": 20302 + }, + { + "epoch": 8.256608377389183, + "grad_norm": 0.11838616813328377, + "learning_rate": 6.207240910839714e-06, + "loss": 0.001, + "step": 20303 + }, + { + "epoch": 8.257015046766979, + "grad_norm": 0.06846373363611756, + "learning_rate": 6.2063034438585256e-06, + "loss": 0.0005, + "step": 20304 + }, + { + "epoch": 8.257421716144774, + "grad_norm": 6.6393851519626175, + "learning_rate": 6.205366015821543e-06, + "loss": 0.0958, + "step": 20305 + }, + { + "epoch": 8.25782838552257, + "grad_norm": 3.2573863049152543, + "learning_rate": 6.2044286267383854e-06, + "loss": 0.0197, + "step": 20306 + }, + { + "epoch": 8.258235054900366, + "grad_norm": 0.011183930852662095, + "learning_rate": 6.203491276618681e-06, + "loss": 0.0002, + "step": 20307 + }, + { + "epoch": 8.258641724278162, + "grad_norm": 0.030587604252088636, + "learning_rate": 6.202553965472049e-06, + "loss": 0.0004, + "step": 20308 + }, + { + "epoch": 8.259048393655958, + "grad_norm": 1.8212989220723057, + "learning_rate": 6.20161669330811e-06, + "loss": 0.0132, + "step": 20309 + }, + { + "epoch": 8.259455063033753, + "grad_norm": 11.701887433912846, + "learning_rate": 6.200679460136489e-06, + 
"loss": 0.151, + "step": 20310 + }, + { + "epoch": 8.259861732411549, + "grad_norm": 0.3715394645162805, + "learning_rate": 6.199742265966806e-06, + "loss": 0.0028, + "step": 20311 + }, + { + "epoch": 8.260268401789345, + "grad_norm": 3.308739899718516, + "learning_rate": 6.198805110808681e-06, + "loss": 0.0579, + "step": 20312 + }, + { + "epoch": 8.26067507116714, + "grad_norm": 0.0004223626592297787, + "learning_rate": 6.197867994671734e-06, + "loss": 0.0, + "step": 20313 + }, + { + "epoch": 8.261081740544936, + "grad_norm": 0.006503711761303955, + "learning_rate": 6.196930917565584e-06, + "loss": 0.0001, + "step": 20314 + }, + { + "epoch": 8.261488409922732, + "grad_norm": 0.12728531880696548, + "learning_rate": 6.195993879499855e-06, + "loss": 0.0013, + "step": 20315 + }, + { + "epoch": 8.261895079300528, + "grad_norm": 1.1803174991710261, + "learning_rate": 6.19505688048416e-06, + "loss": 0.0077, + "step": 20316 + }, + { + "epoch": 8.262301748678324, + "grad_norm": 0.054708787953976976, + "learning_rate": 6.194119920528126e-06, + "loss": 0.0006, + "step": 20317 + }, + { + "epoch": 8.262708418056121, + "grad_norm": 12.092487784370245, + "learning_rate": 6.1931829996413624e-06, + "loss": 0.2316, + "step": 20318 + }, + { + "epoch": 8.263115087433917, + "grad_norm": 0.35381018007561416, + "learning_rate": 6.1922461178334915e-06, + "loss": 0.0035, + "step": 20319 + }, + { + "epoch": 8.263521756811713, + "grad_norm": 5.223157113816033, + "learning_rate": 6.191309275114132e-06, + "loss": 0.1511, + "step": 20320 + }, + { + "epoch": 8.263928426189509, + "grad_norm": 0.5552506298876743, + "learning_rate": 6.1903724714929e-06, + "loss": 0.0041, + "step": 20321 + }, + { + "epoch": 8.264335095567304, + "grad_norm": 1.270156626908071, + "learning_rate": 6.189435706979411e-06, + "loss": 0.0164, + "step": 20322 + }, + { + "epoch": 8.2647417649451, + "grad_norm": 0.8431630207973883, + "learning_rate": 6.188498981583283e-06, + "loss": 0.0067, + "step": 20323 + }, + { + "epoch": 8.265148434322896, + "grad_norm": 1.7768123500400663, + "learning_rate": 6.1875622953141315e-06, + "loss": 0.0138, + "step": 20324 + }, + { + "epoch": 8.265555103700692, + "grad_norm": 0.02245802755133895, + "learning_rate": 6.186625648181573e-06, + "loss": 0.0003, + "step": 20325 + }, + { + "epoch": 8.265961773078487, + "grad_norm": 3.2371794516733754, + "learning_rate": 6.185689040195217e-06, + "loss": 0.0269, + "step": 20326 + }, + { + "epoch": 8.266368442456283, + "grad_norm": 0.0014585645982161273, + "learning_rate": 6.1847524713646855e-06, + "loss": 0.0, + "step": 20327 + }, + { + "epoch": 8.266775111834079, + "grad_norm": 0.0006148617479515568, + "learning_rate": 6.18381594169959e-06, + "loss": 0.0, + "step": 20328 + }, + { + "epoch": 8.267181781211875, + "grad_norm": 0.002735867261006364, + "learning_rate": 6.182879451209544e-06, + "loss": 0.0, + "step": 20329 + }, + { + "epoch": 8.26758845058967, + "grad_norm": 8.880532143954609, + "learning_rate": 6.181942999904163e-06, + "loss": 0.1369, + "step": 20330 + }, + { + "epoch": 8.267995119967466, + "grad_norm": 0.5213412482900341, + "learning_rate": 6.181006587793057e-06, + "loss": 0.005, + "step": 20331 + }, + { + "epoch": 8.268401789345262, + "grad_norm": 0.11561127412932302, + "learning_rate": 6.180070214885839e-06, + "loss": 0.001, + "step": 20332 + }, + { + "epoch": 8.268808458723058, + "grad_norm": 0.23425033499193634, + "learning_rate": 6.179133881192125e-06, + "loss": 0.0017, + "step": 20333 + }, + { + "epoch": 8.269215128100853, + "grad_norm": 2.49087378219244, + 
"learning_rate": 6.178197586721526e-06, + "loss": 0.0302, + "step": 20334 + }, + { + "epoch": 8.26962179747865, + "grad_norm": 0.41210180462857365, + "learning_rate": 6.177261331483651e-06, + "loss": 0.0047, + "step": 20335 + }, + { + "epoch": 8.270028466856445, + "grad_norm": 0.17836423596008386, + "learning_rate": 6.1763251154881106e-06, + "loss": 0.0014, + "step": 20336 + }, + { + "epoch": 8.27043513623424, + "grad_norm": 1.560765503323248, + "learning_rate": 6.17538893874452e-06, + "loss": 0.0151, + "step": 20337 + }, + { + "epoch": 8.270841805612037, + "grad_norm": 0.011993323421663942, + "learning_rate": 6.174452801262486e-06, + "loss": 0.0001, + "step": 20338 + }, + { + "epoch": 8.271248474989834, + "grad_norm": 2.624609445571047, + "learning_rate": 6.173516703051618e-06, + "loss": 0.0268, + "step": 20339 + }, + { + "epoch": 8.27165514436763, + "grad_norm": 1.1515701032630816, + "learning_rate": 6.1725806441215286e-06, + "loss": 0.0156, + "step": 20340 + }, + { + "epoch": 8.272061813745426, + "grad_norm": 0.2655801901543577, + "learning_rate": 6.171644624481825e-06, + "loss": 0.0027, + "step": 20341 + }, + { + "epoch": 8.272468483123221, + "grad_norm": 0.08744989170976311, + "learning_rate": 6.170708644142114e-06, + "loss": 0.0005, + "step": 20342 + }, + { + "epoch": 8.272875152501017, + "grad_norm": 0.056737489798741596, + "learning_rate": 6.169772703112008e-06, + "loss": 0.0008, + "step": 20343 + }, + { + "epoch": 8.273281821878813, + "grad_norm": 0.04418046750327803, + "learning_rate": 6.1688368014011126e-06, + "loss": 0.0004, + "step": 20344 + }, + { + "epoch": 8.273688491256609, + "grad_norm": 0.2884749359090732, + "learning_rate": 6.1679009390190345e-06, + "loss": 0.0035, + "step": 20345 + }, + { + "epoch": 8.274095160634404, + "grad_norm": 0.00548700409854488, + "learning_rate": 6.166965115975383e-06, + "loss": 0.0001, + "step": 20346 + }, + { + "epoch": 8.2745018300122, + "grad_norm": 4.6056165424767, + "learning_rate": 6.166029332279765e-06, + "loss": 0.085, + "step": 20347 + }, + { + "epoch": 8.274908499389996, + "grad_norm": 7.122261592702959, + "learning_rate": 6.165093587941784e-06, + "loss": 0.1797, + "step": 20348 + }, + { + "epoch": 8.275315168767792, + "grad_norm": 0.5359238320853309, + "learning_rate": 6.164157882971047e-06, + "loss": 0.0025, + "step": 20349 + }, + { + "epoch": 8.275721838145587, + "grad_norm": 0.0026035321051262237, + "learning_rate": 6.16322221737716e-06, + "loss": 0.0, + "step": 20350 + }, + { + "epoch": 8.276128507523383, + "grad_norm": 1.656249413547828, + "learning_rate": 6.16228659116973e-06, + "loss": 0.0177, + "step": 20351 + }, + { + "epoch": 8.276535176901179, + "grad_norm": 0.0048755420079368535, + "learning_rate": 6.16135100435836e-06, + "loss": 0.0001, + "step": 20352 + }, + { + "epoch": 8.276941846278975, + "grad_norm": 0.0071649139768898045, + "learning_rate": 6.160415456952653e-06, + "loss": 0.0001, + "step": 20353 + }, + { + "epoch": 8.27734851565677, + "grad_norm": 0.18591931767869554, + "learning_rate": 6.1594799489622105e-06, + "loss": 0.0025, + "step": 20354 + }, + { + "epoch": 8.277755185034566, + "grad_norm": 1.597426503408262, + "learning_rate": 6.158544480396642e-06, + "loss": 0.0155, + "step": 20355 + }, + { + "epoch": 8.278161854412362, + "grad_norm": 7.8506397885362915, + "learning_rate": 6.157609051265547e-06, + "loss": 0.138, + "step": 20356 + }, + { + "epoch": 8.278568523790158, + "grad_norm": 3.9394979391636555, + "learning_rate": 6.1566736615785285e-06, + "loss": 0.1086, + "step": 20357 + }, + { + "epoch": 
8.278975193167954, + "grad_norm": 0.1489611374658343, + "learning_rate": 6.15573831134519e-06, + "loss": 0.0016, + "step": 20358 + }, + { + "epoch": 8.279381862545751, + "grad_norm": 0.37558757761678613, + "learning_rate": 6.1548030005751335e-06, + "loss": 0.004, + "step": 20359 + }, + { + "epoch": 8.279788531923547, + "grad_norm": 0.2757402129174098, + "learning_rate": 6.153867729277956e-06, + "loss": 0.0025, + "step": 20360 + }, + { + "epoch": 8.280195201301343, + "grad_norm": 0.08194604657645646, + "learning_rate": 6.152932497463264e-06, + "loss": 0.0008, + "step": 20361 + }, + { + "epoch": 8.280601870679138, + "grad_norm": 0.06654490016300786, + "learning_rate": 6.151997305140658e-06, + "loss": 0.0009, + "step": 20362 + }, + { + "epoch": 8.281008540056934, + "grad_norm": 2.154187688088068, + "learning_rate": 6.151062152319733e-06, + "loss": 0.0269, + "step": 20363 + }, + { + "epoch": 8.28141520943473, + "grad_norm": 0.12920734689529023, + "learning_rate": 6.150127039010094e-06, + "loss": 0.001, + "step": 20364 + }, + { + "epoch": 8.281821878812526, + "grad_norm": 0.37603291172097897, + "learning_rate": 6.14919196522134e-06, + "loss": 0.0039, + "step": 20365 + }, + { + "epoch": 8.282228548190322, + "grad_norm": 0.7139692268041704, + "learning_rate": 6.148256930963066e-06, + "loss": 0.0093, + "step": 20366 + }, + { + "epoch": 8.282635217568117, + "grad_norm": 0.07334457220031494, + "learning_rate": 6.147321936244873e-06, + "loss": 0.0006, + "step": 20367 + }, + { + "epoch": 8.283041886945913, + "grad_norm": 0.023409472938575298, + "learning_rate": 6.146386981076361e-06, + "loss": 0.0002, + "step": 20368 + }, + { + "epoch": 8.283448556323709, + "grad_norm": 0.022373381111500317, + "learning_rate": 6.145452065467126e-06, + "loss": 0.0002, + "step": 20369 + }, + { + "epoch": 8.283855225701505, + "grad_norm": 0.0020403738167404507, + "learning_rate": 6.144517189426764e-06, + "loss": 0.0, + "step": 20370 + }, + { + "epoch": 8.2842618950793, + "grad_norm": 13.7108046620338, + "learning_rate": 6.143582352964874e-06, + "loss": 0.2525, + "step": 20371 + }, + { + "epoch": 8.284668564457096, + "grad_norm": 1.1174596055580874, + "learning_rate": 6.142647556091052e-06, + "loss": 0.0103, + "step": 20372 + }, + { + "epoch": 8.285075233834892, + "grad_norm": 0.1223320476771252, + "learning_rate": 6.141712798814894e-06, + "loss": 0.0009, + "step": 20373 + }, + { + "epoch": 8.285481903212688, + "grad_norm": 0.0005087855906761547, + "learning_rate": 6.140778081145997e-06, + "loss": 0.0, + "step": 20374 + }, + { + "epoch": 8.285888572590483, + "grad_norm": 0.07601627091932003, + "learning_rate": 6.139843403093957e-06, + "loss": 0.0004, + "step": 20375 + }, + { + "epoch": 8.28629524196828, + "grad_norm": 6.329505458423186, + "learning_rate": 6.1389087646683656e-06, + "loss": 0.0677, + "step": 20376 + }, + { + "epoch": 8.286701911346075, + "grad_norm": 0.005963131188700606, + "learning_rate": 6.137974165878816e-06, + "loss": 0.0001, + "step": 20377 + }, + { + "epoch": 8.28710858072387, + "grad_norm": 4.777221083138408, + "learning_rate": 6.137039606734909e-06, + "loss": 0.0535, + "step": 20378 + }, + { + "epoch": 8.287515250101666, + "grad_norm": 0.14154071014149036, + "learning_rate": 6.136105087246233e-06, + "loss": 0.0016, + "step": 20379 + }, + { + "epoch": 8.287921919479464, + "grad_norm": 0.029244339161294527, + "learning_rate": 6.135170607422383e-06, + "loss": 0.0005, + "step": 20380 + }, + { + "epoch": 8.28832858885726, + "grad_norm": 1.2695645897183037, + "learning_rate": 6.134236167272952e-06, + 
"loss": 0.0165, + "step": 20381 + }, + { + "epoch": 8.288735258235056, + "grad_norm": 0.0034699248365046968, + "learning_rate": 6.133301766807533e-06, + "loss": 0.0001, + "step": 20382 + }, + { + "epoch": 8.289141927612851, + "grad_norm": 0.14669374023943946, + "learning_rate": 6.1323674060357165e-06, + "loss": 0.0011, + "step": 20383 + }, + { + "epoch": 8.289548596990647, + "grad_norm": 0.27660577334574393, + "learning_rate": 6.131433084967095e-06, + "loss": 0.0035, + "step": 20384 + }, + { + "epoch": 8.289955266368443, + "grad_norm": 1.082738264017689, + "learning_rate": 6.130498803611262e-06, + "loss": 0.0121, + "step": 20385 + }, + { + "epoch": 8.290361935746239, + "grad_norm": 0.39981454003511696, + "learning_rate": 6.129564561977803e-06, + "loss": 0.0038, + "step": 20386 + }, + { + "epoch": 8.290768605124034, + "grad_norm": 0.17628181335188284, + "learning_rate": 6.128630360076314e-06, + "loss": 0.0017, + "step": 20387 + }, + { + "epoch": 8.29117527450183, + "grad_norm": 0.023886069303133004, + "learning_rate": 6.127696197916383e-06, + "loss": 0.0003, + "step": 20388 + }, + { + "epoch": 8.291581943879626, + "grad_norm": 0.18222721673406822, + "learning_rate": 6.1267620755076e-06, + "loss": 0.0007, + "step": 20389 + }, + { + "epoch": 8.291988613257422, + "grad_norm": 0.04221913287566987, + "learning_rate": 6.12582799285955e-06, + "loss": 0.0006, + "step": 20390 + }, + { + "epoch": 8.292395282635217, + "grad_norm": 0.42623996368358874, + "learning_rate": 6.124893949981829e-06, + "loss": 0.0076, + "step": 20391 + }, + { + "epoch": 8.292801952013013, + "grad_norm": 2.5371962978231, + "learning_rate": 6.1239599468840215e-06, + "loss": 0.0289, + "step": 20392 + }, + { + "epoch": 8.293208621390809, + "grad_norm": 6.143370331954379, + "learning_rate": 6.1230259835757135e-06, + "loss": 0.1024, + "step": 20393 + }, + { + "epoch": 8.293615290768605, + "grad_norm": 0.07038239478958914, + "learning_rate": 6.122092060066497e-06, + "loss": 0.0008, + "step": 20394 + }, + { + "epoch": 8.2940219601464, + "grad_norm": 0.002839336881579517, + "learning_rate": 6.121158176365958e-06, + "loss": 0.0, + "step": 20395 + }, + { + "epoch": 8.294428629524196, + "grad_norm": 0.0013944786426904182, + "learning_rate": 6.1202243324836796e-06, + "loss": 0.0, + "step": 20396 + }, + { + "epoch": 8.294835298901992, + "grad_norm": 0.13211389396462345, + "learning_rate": 6.119290528429254e-06, + "loss": 0.0016, + "step": 20397 + }, + { + "epoch": 8.295241968279788, + "grad_norm": 3.909762922925012, + "learning_rate": 6.118356764212263e-06, + "loss": 0.0384, + "step": 20398 + }, + { + "epoch": 8.295648637657584, + "grad_norm": 0.9302605940420045, + "learning_rate": 6.117423039842292e-06, + "loss": 0.0091, + "step": 20399 + }, + { + "epoch": 8.296055307035381, + "grad_norm": 5.7934057559779735, + "learning_rate": 6.116489355328929e-06, + "loss": 0.1862, + "step": 20400 + }, + { + "epoch": 8.296461976413177, + "grad_norm": 0.4638813785841716, + "learning_rate": 6.115555710681757e-06, + "loss": 0.0038, + "step": 20401 + }, + { + "epoch": 8.296868645790973, + "grad_norm": 0.31530095692691784, + "learning_rate": 6.114622105910363e-06, + "loss": 0.0082, + "step": 20402 + }, + { + "epoch": 8.297275315168768, + "grad_norm": 0.005426830348936671, + "learning_rate": 6.113688541024325e-06, + "loss": 0.0001, + "step": 20403 + }, + { + "epoch": 8.297681984546564, + "grad_norm": 0.4415749288494634, + "learning_rate": 6.112755016033231e-06, + "loss": 0.0037, + "step": 20404 + }, + { + "epoch": 8.29808865392436, + "grad_norm": 
0.0031058836035971972, + "learning_rate": 6.111821530946666e-06, + "loss": 0.0, + "step": 20405 + }, + { + "epoch": 8.298495323302156, + "grad_norm": 0.4211787109147177, + "learning_rate": 6.110888085774205e-06, + "loss": 0.0044, + "step": 20406 + }, + { + "epoch": 8.298901992679951, + "grad_norm": 0.11426658781663573, + "learning_rate": 6.109954680525438e-06, + "loss": 0.0012, + "step": 20407 + }, + { + "epoch": 8.299308662057747, + "grad_norm": 0.3008253489550834, + "learning_rate": 6.109021315209945e-06, + "loss": 0.0026, + "step": 20408 + }, + { + "epoch": 8.299715331435543, + "grad_norm": 0.058100289336492916, + "learning_rate": 6.1080879898373035e-06, + "loss": 0.0006, + "step": 20409 + }, + { + "epoch": 8.300122000813339, + "grad_norm": 0.2828836397427423, + "learning_rate": 6.1071547044171e-06, + "loss": 0.0027, + "step": 20410 + }, + { + "epoch": 8.300528670191134, + "grad_norm": 0.17267312323338072, + "learning_rate": 6.106221458958913e-06, + "loss": 0.0028, + "step": 20411 + }, + { + "epoch": 8.30093533956893, + "grad_norm": 0.16137607717781277, + "learning_rate": 6.10528825347232e-06, + "loss": 0.0016, + "step": 20412 + }, + { + "epoch": 8.301342008946726, + "grad_norm": 0.0012224034775282255, + "learning_rate": 6.104355087966906e-06, + "loss": 0.0, + "step": 20413 + }, + { + "epoch": 8.301748678324522, + "grad_norm": 0.054404930581391474, + "learning_rate": 6.103421962452246e-06, + "loss": 0.0006, + "step": 20414 + }, + { + "epoch": 8.302155347702318, + "grad_norm": 2.3980944851059047, + "learning_rate": 6.1024888769379224e-06, + "loss": 0.0225, + "step": 20415 + }, + { + "epoch": 8.302562017080113, + "grad_norm": 0.031349898825225465, + "learning_rate": 6.101555831433509e-06, + "loss": 0.0005, + "step": 20416 + }, + { + "epoch": 8.302968686457909, + "grad_norm": 0.6653545975764443, + "learning_rate": 6.100622825948589e-06, + "loss": 0.0071, + "step": 20417 + }, + { + "epoch": 8.303375355835705, + "grad_norm": 1.3728423867171364, + "learning_rate": 6.099689860492739e-06, + "loss": 0.0265, + "step": 20418 + }, + { + "epoch": 8.3037820252135, + "grad_norm": 18.31027351358447, + "learning_rate": 6.098756935075533e-06, + "loss": 0.2225, + "step": 20419 + }, + { + "epoch": 8.304188694591296, + "grad_norm": 0.011220861191902408, + "learning_rate": 6.097824049706552e-06, + "loss": 0.0001, + "step": 20420 + }, + { + "epoch": 8.304595363969094, + "grad_norm": 12.864011241635113, + "learning_rate": 6.096891204395372e-06, + "loss": 0.2746, + "step": 20421 + }, + { + "epoch": 8.30500203334689, + "grad_norm": 3.811000404468277, + "learning_rate": 6.095958399151565e-06, + "loss": 0.1213, + "step": 20422 + }, + { + "epoch": 8.305408702724685, + "grad_norm": 0.0007343134919273569, + "learning_rate": 6.0950256339847124e-06, + "loss": 0.0, + "step": 20423 + }, + { + "epoch": 8.305815372102481, + "grad_norm": 0.030026832736661974, + "learning_rate": 6.094092908904388e-06, + "loss": 0.0004, + "step": 20424 + }, + { + "epoch": 8.306222041480277, + "grad_norm": 0.1406503095339514, + "learning_rate": 6.093160223920163e-06, + "loss": 0.0014, + "step": 20425 + }, + { + "epoch": 8.306628710858073, + "grad_norm": 10.206069950657744, + "learning_rate": 6.0922275790416165e-06, + "loss": 0.1936, + "step": 20426 + }, + { + "epoch": 8.307035380235869, + "grad_norm": 1.3118464535431145, + "learning_rate": 6.091294974278321e-06, + "loss": 0.0153, + "step": 20427 + }, + { + "epoch": 8.307442049613664, + "grad_norm": 0.24696841034065717, + "learning_rate": 6.090362409639849e-06, + "loss": 0.0036, + "step": 20428 
+ }, + { + "epoch": 8.30784871899146, + "grad_norm": 0.003780126751164685, + "learning_rate": 6.089429885135774e-06, + "loss": 0.0, + "step": 20429 + }, + { + "epoch": 8.308255388369256, + "grad_norm": 0.07508385470465269, + "learning_rate": 6.0884974007756705e-06, + "loss": 0.0008, + "step": 20430 + }, + { + "epoch": 8.308662057747052, + "grad_norm": 0.8573323145135009, + "learning_rate": 6.08756495656911e-06, + "loss": 0.0084, + "step": 20431 + }, + { + "epoch": 8.309068727124847, + "grad_norm": 6.226101549226341, + "learning_rate": 6.086632552525661e-06, + "loss": 0.1082, + "step": 20432 + }, + { + "epoch": 8.309475396502643, + "grad_norm": 0.08711620881828608, + "learning_rate": 6.085700188654902e-06, + "loss": 0.0009, + "step": 20433 + }, + { + "epoch": 8.309882065880439, + "grad_norm": 0.005631199132512048, + "learning_rate": 6.084767864966399e-06, + "loss": 0.0001, + "step": 20434 + }, + { + "epoch": 8.310288735258235, + "grad_norm": 0.9437376034402853, + "learning_rate": 6.083835581469723e-06, + "loss": 0.0088, + "step": 20435 + }, + { + "epoch": 8.31069540463603, + "grad_norm": 0.12904672433239492, + "learning_rate": 6.082903338174449e-06, + "loss": 0.0013, + "step": 20436 + }, + { + "epoch": 8.311102074013826, + "grad_norm": 0.037595960339866126, + "learning_rate": 6.081971135090143e-06, + "loss": 0.0003, + "step": 20437 + }, + { + "epoch": 8.311508743391622, + "grad_norm": 0.4768091971782503, + "learning_rate": 6.0810389722263715e-06, + "loss": 0.0042, + "step": 20438 + }, + { + "epoch": 8.311915412769418, + "grad_norm": 1.1481323190475852, + "learning_rate": 6.0801068495927115e-06, + "loss": 0.013, + "step": 20439 + }, + { + "epoch": 8.312322082147213, + "grad_norm": 1.0568314142523407, + "learning_rate": 6.079174767198728e-06, + "loss": 0.0079, + "step": 20440 + }, + { + "epoch": 8.312728751525011, + "grad_norm": 0.003128450904726333, + "learning_rate": 6.078242725053987e-06, + "loss": 0.0, + "step": 20441 + }, + { + "epoch": 8.313135420902807, + "grad_norm": 0.32352738216638194, + "learning_rate": 6.0773107231680575e-06, + "loss": 0.0023, + "step": 20442 + }, + { + "epoch": 8.313542090280603, + "grad_norm": 1.3772837953048422, + "learning_rate": 6.076378761550507e-06, + "loss": 0.015, + "step": 20443 + }, + { + "epoch": 8.313948759658398, + "grad_norm": 7.580180708553004, + "learning_rate": 6.075446840210907e-06, + "loss": 0.0806, + "step": 20444 + }, + { + "epoch": 8.314355429036194, + "grad_norm": 2.24834840948578, + "learning_rate": 6.074514959158816e-06, + "loss": 0.0307, + "step": 20445 + }, + { + "epoch": 8.31476209841399, + "grad_norm": 0.09112651208408153, + "learning_rate": 6.073583118403807e-06, + "loss": 0.0012, + "step": 20446 + }, + { + "epoch": 8.315168767791786, + "grad_norm": 0.18131693038796204, + "learning_rate": 6.0726513179554445e-06, + "loss": 0.0005, + "step": 20447 + }, + { + "epoch": 8.315575437169581, + "grad_norm": 0.5058721903253306, + "learning_rate": 6.07171955782329e-06, + "loss": 0.0056, + "step": 20448 + }, + { + "epoch": 8.315982106547377, + "grad_norm": 0.533133349158657, + "learning_rate": 6.070787838016912e-06, + "loss": 0.0063, + "step": 20449 + }, + { + "epoch": 8.316388775925173, + "grad_norm": 0.09312778081068285, + "learning_rate": 6.069856158545877e-06, + "loss": 0.0014, + "step": 20450 + }, + { + "epoch": 8.316795445302969, + "grad_norm": 0.9420973403601404, + "learning_rate": 6.068924519419745e-06, + "loss": 0.0092, + "step": 20451 + }, + { + "epoch": 8.317202114680764, + "grad_norm": 0.2012602216206371, + "learning_rate": 
6.067992920648083e-06, + "loss": 0.0025, + "step": 20452 + }, + { + "epoch": 8.31760878405856, + "grad_norm": 0.6890072072408913, + "learning_rate": 6.06706136224045e-06, + "loss": 0.0073, + "step": 20453 + }, + { + "epoch": 8.318015453436356, + "grad_norm": 0.3817529235965628, + "learning_rate": 6.066129844206412e-06, + "loss": 0.0048, + "step": 20454 + }, + { + "epoch": 8.318422122814152, + "grad_norm": 0.03699215809498515, + "learning_rate": 6.065198366555532e-06, + "loss": 0.0004, + "step": 20455 + }, + { + "epoch": 8.318828792191947, + "grad_norm": 0.058363706827589, + "learning_rate": 6.06426692929737e-06, + "loss": 0.0008, + "step": 20456 + }, + { + "epoch": 8.319235461569743, + "grad_norm": 0.6116138626845491, + "learning_rate": 6.063335532441488e-06, + "loss": 0.0059, + "step": 20457 + }, + { + "epoch": 8.319642130947539, + "grad_norm": 3.273397963246408, + "learning_rate": 6.062404175997449e-06, + "loss": 0.0294, + "step": 20458 + }, + { + "epoch": 8.320048800325335, + "grad_norm": 3.5048291297111245, + "learning_rate": 6.061472859974813e-06, + "loss": 0.0536, + "step": 20459 + }, + { + "epoch": 8.32045546970313, + "grad_norm": 0.013710385421008364, + "learning_rate": 6.060541584383139e-06, + "loss": 0.0001, + "step": 20460 + }, + { + "epoch": 8.320862139080926, + "grad_norm": 5.124603712874563, + "learning_rate": 6.05961034923199e-06, + "loss": 0.0882, + "step": 20461 + }, + { + "epoch": 8.321268808458724, + "grad_norm": 8.221133161731457, + "learning_rate": 6.058679154530924e-06, + "loss": 0.1373, + "step": 20462 + }, + { + "epoch": 8.32167547783652, + "grad_norm": 11.776434449224343, + "learning_rate": 6.057748000289498e-06, + "loss": 0.171, + "step": 20463 + }, + { + "epoch": 8.322082147214315, + "grad_norm": 0.8302180536220295, + "learning_rate": 6.056816886517275e-06, + "loss": 0.0088, + "step": 20464 + }, + { + "epoch": 8.322488816592111, + "grad_norm": 0.015500958582045086, + "learning_rate": 6.0558858132238115e-06, + "loss": 0.0002, + "step": 20465 + }, + { + "epoch": 8.322895485969907, + "grad_norm": 1.201688477350228, + "learning_rate": 6.0549547804186625e-06, + "loss": 0.0113, + "step": 20466 + }, + { + "epoch": 8.323302155347703, + "grad_norm": 0.06378366168558008, + "learning_rate": 6.05402378811139e-06, + "loss": 0.0006, + "step": 20467 + }, + { + "epoch": 8.323708824725498, + "grad_norm": 0.35235919379742064, + "learning_rate": 6.05309283631155e-06, + "loss": 0.0037, + "step": 20468 + }, + { + "epoch": 8.324115494103294, + "grad_norm": 0.048559479604476166, + "learning_rate": 6.052161925028699e-06, + "loss": 0.0007, + "step": 20469 + }, + { + "epoch": 8.32452216348109, + "grad_norm": 0.06558835583995284, + "learning_rate": 6.05123105427239e-06, + "loss": 0.0008, + "step": 20470 + }, + { + "epoch": 8.324928832858886, + "grad_norm": 0.1232783275407154, + "learning_rate": 6.0503002240521835e-06, + "loss": 0.0011, + "step": 20471 + }, + { + "epoch": 8.325335502236682, + "grad_norm": 0.006682051523174724, + "learning_rate": 6.0493694343776335e-06, + "loss": 0.0001, + "step": 20472 + }, + { + "epoch": 8.325742171614477, + "grad_norm": 0.10791223596176508, + "learning_rate": 6.048438685258291e-06, + "loss": 0.001, + "step": 20473 + }, + { + "epoch": 8.326148840992273, + "grad_norm": 7.594351666166405, + "learning_rate": 6.047507976703718e-06, + "loss": 0.1287, + "step": 20474 + }, + { + "epoch": 8.326555510370069, + "grad_norm": 0.6045284101505961, + "learning_rate": 6.0465773087234645e-06, + "loss": 0.0048, + "step": 20475 + }, + { + "epoch": 8.326962179747865, + 
"grad_norm": 0.20327016710665052, + "learning_rate": 6.045646681327082e-06, + "loss": 0.0019, + "step": 20476 + }, + { + "epoch": 8.32736884912566, + "grad_norm": 0.014224310295377624, + "learning_rate": 6.0447160945241305e-06, + "loss": 0.0002, + "step": 20477 + }, + { + "epoch": 8.327775518503456, + "grad_norm": 0.06155114047753597, + "learning_rate": 6.043785548324157e-06, + "loss": 0.0007, + "step": 20478 + }, + { + "epoch": 8.328182187881252, + "grad_norm": 0.29570377897374267, + "learning_rate": 6.042855042736712e-06, + "loss": 0.0031, + "step": 20479 + }, + { + "epoch": 8.328588857259048, + "grad_norm": 1.9585620892011446, + "learning_rate": 6.0419245777713576e-06, + "loss": 0.0333, + "step": 20480 + }, + { + "epoch": 8.328995526636843, + "grad_norm": 0.484414644839673, + "learning_rate": 6.040994153437638e-06, + "loss": 0.0053, + "step": 20481 + }, + { + "epoch": 8.329402196014641, + "grad_norm": 0.4381295848234779, + "learning_rate": 6.040063769745106e-06, + "loss": 0.0049, + "step": 20482 + }, + { + "epoch": 8.329808865392437, + "grad_norm": 0.10720680249681545, + "learning_rate": 6.0391334267033095e-06, + "loss": 0.0012, + "step": 20483 + }, + { + "epoch": 8.330215534770232, + "grad_norm": 0.21563400834864244, + "learning_rate": 6.038203124321804e-06, + "loss": 0.0027, + "step": 20484 + }, + { + "epoch": 8.330622204148028, + "grad_norm": 0.9251454529362909, + "learning_rate": 6.037272862610137e-06, + "loss": 0.0077, + "step": 20485 + }, + { + "epoch": 8.331028873525824, + "grad_norm": 0.002022560015034954, + "learning_rate": 6.036342641577858e-06, + "loss": 0.0, + "step": 20486 + }, + { + "epoch": 8.33143554290362, + "grad_norm": 0.044787795599401964, + "learning_rate": 6.035412461234518e-06, + "loss": 0.0005, + "step": 20487 + }, + { + "epoch": 8.331842212281416, + "grad_norm": 0.05927136144785496, + "learning_rate": 6.0344823215896655e-06, + "loss": 0.0007, + "step": 20488 + }, + { + "epoch": 8.332248881659211, + "grad_norm": 0.19196536961329044, + "learning_rate": 6.033552222652843e-06, + "loss": 0.0017, + "step": 20489 + }, + { + "epoch": 8.332655551037007, + "grad_norm": 0.023152376457150757, + "learning_rate": 6.032622164433608e-06, + "loss": 0.0002, + "step": 20490 + }, + { + "epoch": 8.333062220414803, + "grad_norm": 7.588132395629964, + "learning_rate": 6.031692146941501e-06, + "loss": 0.1504, + "step": 20491 + }, + { + "epoch": 8.333468889792599, + "grad_norm": 9.335314394495402, + "learning_rate": 6.030762170186073e-06, + "loss": 0.2153, + "step": 20492 + }, + { + "epoch": 8.333875559170394, + "grad_norm": 0.10985754702590654, + "learning_rate": 6.0298322341768665e-06, + "loss": 0.0011, + "step": 20493 + }, + { + "epoch": 8.33428222854819, + "grad_norm": 0.002095082820549089, + "learning_rate": 6.0289023389234326e-06, + "loss": 0.0, + "step": 20494 + }, + { + "epoch": 8.334688897925986, + "grad_norm": 0.06665366256745375, + "learning_rate": 6.027972484435313e-06, + "loss": 0.0009, + "step": 20495 + }, + { + "epoch": 8.335095567303782, + "grad_norm": 0.11256557603863693, + "learning_rate": 6.027042670722054e-06, + "loss": 0.0017, + "step": 20496 + }, + { + "epoch": 8.335502236681577, + "grad_norm": 0.0038138785411485345, + "learning_rate": 6.026112897793205e-06, + "loss": 0.0001, + "step": 20497 + }, + { + "epoch": 8.335908906059373, + "grad_norm": 0.7306151351770579, + "learning_rate": 6.025183165658306e-06, + "loss": 0.0075, + "step": 20498 + }, + { + "epoch": 8.336315575437169, + "grad_norm": 10.839771406258784, + "learning_rate": 6.0242534743269e-06, + "loss": 
0.2312, + "step": 20499 + }, + { + "epoch": 8.336722244814965, + "grad_norm": 0.025043554580131012, + "learning_rate": 6.023323823808535e-06, + "loss": 0.0003, + "step": 20500 + }, + { + "epoch": 8.33712891419276, + "grad_norm": 0.5153638391443472, + "learning_rate": 6.0223942141127525e-06, + "loss": 0.0058, + "step": 20501 + }, + { + "epoch": 8.337535583570556, + "grad_norm": 0.3890888151271877, + "learning_rate": 6.021464645249093e-06, + "loss": 0.0041, + "step": 20502 + }, + { + "epoch": 8.337942252948354, + "grad_norm": 0.12349012111991431, + "learning_rate": 6.020535117227104e-06, + "loss": 0.0014, + "step": 20503 + }, + { + "epoch": 8.33834892232615, + "grad_norm": 1.172538897942569, + "learning_rate": 6.0196056300563245e-06, + "loss": 0.0106, + "step": 20504 + }, + { + "epoch": 8.338755591703945, + "grad_norm": 0.24385920038984468, + "learning_rate": 6.018676183746297e-06, + "loss": 0.0038, + "step": 20505 + }, + { + "epoch": 8.339162261081741, + "grad_norm": 0.07267430604255547, + "learning_rate": 6.017746778306559e-06, + "loss": 0.0006, + "step": 20506 + }, + { + "epoch": 8.339568930459537, + "grad_norm": 2.793564009089141, + "learning_rate": 6.016817413746657e-06, + "loss": 0.0558, + "step": 20507 + }, + { + "epoch": 8.339975599837333, + "grad_norm": 0.06933986636316782, + "learning_rate": 6.01588809007613e-06, + "loss": 0.0005, + "step": 20508 + }, + { + "epoch": 8.340382269215128, + "grad_norm": 2.904032240446944, + "learning_rate": 6.014958807304514e-06, + "loss": 0.037, + "step": 20509 + }, + { + "epoch": 8.340788938592924, + "grad_norm": 0.031983107168222315, + "learning_rate": 6.014029565441352e-06, + "loss": 0.0004, + "step": 20510 + }, + { + "epoch": 8.34119560797072, + "grad_norm": 0.13534812151849962, + "learning_rate": 6.013100364496185e-06, + "loss": 0.0006, + "step": 20511 + }, + { + "epoch": 8.341602277348516, + "grad_norm": 0.12848402530123554, + "learning_rate": 6.012171204478547e-06, + "loss": 0.0016, + "step": 20512 + }, + { + "epoch": 8.342008946726311, + "grad_norm": 5.795070240894096, + "learning_rate": 6.01124208539798e-06, + "loss": 0.1453, + "step": 20513 + }, + { + "epoch": 8.342415616104107, + "grad_norm": 0.13787161005775575, + "learning_rate": 6.010313007264021e-06, + "loss": 0.0017, + "step": 20514 + }, + { + "epoch": 8.342822285481903, + "grad_norm": 3.0047035189365, + "learning_rate": 6.009383970086205e-06, + "loss": 0.0213, + "step": 20515 + }, + { + "epoch": 8.343228954859699, + "grad_norm": 0.2866214144674875, + "learning_rate": 6.008454973874071e-06, + "loss": 0.0028, + "step": 20516 + }, + { + "epoch": 8.343635624237494, + "grad_norm": 0.06252642565373641, + "learning_rate": 6.0075260186371575e-06, + "loss": 0.0006, + "step": 20517 + }, + { + "epoch": 8.34404229361529, + "grad_norm": 0.2979876257548065, + "learning_rate": 6.006597104384999e-06, + "loss": 0.0019, + "step": 20518 + }, + { + "epoch": 8.344448962993086, + "grad_norm": 0.13923820407298432, + "learning_rate": 6.005668231127128e-06, + "loss": 0.0019, + "step": 20519 + }, + { + "epoch": 8.344855632370882, + "grad_norm": 0.16857428899331425, + "learning_rate": 6.004739398873086e-06, + "loss": 0.0024, + "step": 20520 + }, + { + "epoch": 8.345262301748678, + "grad_norm": 3.062128923399726, + "learning_rate": 6.003810607632404e-06, + "loss": 0.0657, + "step": 20521 + }, + { + "epoch": 8.345668971126473, + "grad_norm": 1.1667737923052781, + "learning_rate": 6.002881857414615e-06, + "loss": 0.012, + "step": 20522 + }, + { + "epoch": 8.34607564050427, + "grad_norm": 0.0520731868869856, + 
"learning_rate": 6.001953148229259e-06, + "loss": 0.0004, + "step": 20523 + }, + { + "epoch": 8.346482309882067, + "grad_norm": 0.6733496433047039, + "learning_rate": 6.001024480085865e-06, + "loss": 0.0064, + "step": 20524 + }, + { + "epoch": 8.346888979259862, + "grad_norm": 0.030998247775303443, + "learning_rate": 6.000095852993965e-06, + "loss": 0.0003, + "step": 20525 + }, + { + "epoch": 8.347295648637658, + "grad_norm": 0.28719742125487624, + "learning_rate": 5.999167266963097e-06, + "loss": 0.0024, + "step": 20526 + }, + { + "epoch": 8.347702318015454, + "grad_norm": 0.043254838776902775, + "learning_rate": 5.99823872200279e-06, + "loss": 0.0005, + "step": 20527 + }, + { + "epoch": 8.34810898739325, + "grad_norm": 0.4910361945398605, + "learning_rate": 5.9973102181225764e-06, + "loss": 0.005, + "step": 20528 + }, + { + "epoch": 8.348515656771045, + "grad_norm": 0.20939050183910277, + "learning_rate": 5.996381755331988e-06, + "loss": 0.0013, + "step": 20529 + }, + { + "epoch": 8.348922326148841, + "grad_norm": 0.43868418976169377, + "learning_rate": 5.995453333640555e-06, + "loss": 0.0028, + "step": 20530 + }, + { + "epoch": 8.349328995526637, + "grad_norm": 0.477174639210557, + "learning_rate": 5.994524953057811e-06, + "loss": 0.0086, + "step": 20531 + }, + { + "epoch": 8.349735664904433, + "grad_norm": 15.698470958508139, + "learning_rate": 5.993596613593282e-06, + "loss": 0.1779, + "step": 20532 + }, + { + "epoch": 8.350142334282229, + "grad_norm": 0.3132455261442909, + "learning_rate": 5.992668315256502e-06, + "loss": 0.0025, + "step": 20533 + }, + { + "epoch": 8.350549003660024, + "grad_norm": 0.047607735422816266, + "learning_rate": 5.991740058056998e-06, + "loss": 0.0005, + "step": 20534 + }, + { + "epoch": 8.35095567303782, + "grad_norm": 0.4865686006881723, + "learning_rate": 5.990811842004297e-06, + "loss": 0.0047, + "step": 20535 + }, + { + "epoch": 8.351362342415616, + "grad_norm": 0.029308121363213525, + "learning_rate": 5.989883667107933e-06, + "loss": 0.0004, + "step": 20536 + }, + { + "epoch": 8.351769011793412, + "grad_norm": 0.3487429348077639, + "learning_rate": 5.988955533377432e-06, + "loss": 0.0046, + "step": 20537 + }, + { + "epoch": 8.352175681171207, + "grad_norm": 2.7556263575393194, + "learning_rate": 5.988027440822318e-06, + "loss": 0.0223, + "step": 20538 + }, + { + "epoch": 8.352582350549003, + "grad_norm": 8.672017798558274, + "learning_rate": 5.987099389452124e-06, + "loss": 0.1678, + "step": 20539 + }, + { + "epoch": 8.352989019926799, + "grad_norm": 0.29645331953128357, + "learning_rate": 5.986171379276375e-06, + "loss": 0.003, + "step": 20540 + }, + { + "epoch": 8.353395689304595, + "grad_norm": 0.45236561691521926, + "learning_rate": 5.985243410304594e-06, + "loss": 0.0037, + "step": 20541 + }, + { + "epoch": 8.35380235868239, + "grad_norm": 0.0018770058122138885, + "learning_rate": 5.984315482546312e-06, + "loss": 0.0, + "step": 20542 + }, + { + "epoch": 8.354209028060186, + "grad_norm": 0.022001628306035075, + "learning_rate": 5.983387596011054e-06, + "loss": 0.0003, + "step": 20543 + }, + { + "epoch": 8.354615697437984, + "grad_norm": 0.007968817013197436, + "learning_rate": 5.982459750708343e-06, + "loss": 0.0001, + "step": 20544 + }, + { + "epoch": 8.35502236681578, + "grad_norm": 0.04930416021672524, + "learning_rate": 5.981531946647703e-06, + "loss": 0.0004, + "step": 20545 + }, + { + "epoch": 8.355429036193575, + "grad_norm": 2.971613563255719, + "learning_rate": 5.9806041838386604e-06, + "loss": 0.1516, + "step": 20546 + }, + { + "epoch": 
8.355835705571371, + "grad_norm": 3.368398581624693, + "learning_rate": 5.97967646229074e-06, + "loss": 0.0334, + "step": 20547 + }, + { + "epoch": 8.356242374949167, + "grad_norm": 2.7799640543005477, + "learning_rate": 5.978748782013463e-06, + "loss": 0.0253, + "step": 20548 + }, + { + "epoch": 8.356649044326963, + "grad_norm": 1.1794916546936738, + "learning_rate": 5.9778211430163536e-06, + "loss": 0.0154, + "step": 20549 + }, + { + "epoch": 8.357055713704758, + "grad_norm": 0.004549321080283686, + "learning_rate": 5.976893545308936e-06, + "loss": 0.0, + "step": 20550 + }, + { + "epoch": 8.357462383082554, + "grad_norm": 0.07500167460515898, + "learning_rate": 5.97596598890073e-06, + "loss": 0.0007, + "step": 20551 + }, + { + "epoch": 8.35786905246035, + "grad_norm": 0.09282047300676795, + "learning_rate": 5.9750384738012615e-06, + "loss": 0.0013, + "step": 20552 + }, + { + "epoch": 8.358275721838146, + "grad_norm": 0.0490498746796717, + "learning_rate": 5.974111000020044e-06, + "loss": 0.0004, + "step": 20553 + }, + { + "epoch": 8.358682391215941, + "grad_norm": 5.86664659617013, + "learning_rate": 5.973183567566605e-06, + "loss": 0.1596, + "step": 20554 + }, + { + "epoch": 8.359089060593737, + "grad_norm": 0.0014364051286059692, + "learning_rate": 5.972256176450464e-06, + "loss": 0.0, + "step": 20555 + }, + { + "epoch": 8.359495729971533, + "grad_norm": 0.21610904295122293, + "learning_rate": 5.971328826681139e-06, + "loss": 0.0024, + "step": 20556 + }, + { + "epoch": 8.359902399349329, + "grad_norm": 0.015113497358223161, + "learning_rate": 5.970401518268152e-06, + "loss": 0.0001, + "step": 20557 + }, + { + "epoch": 8.360309068727124, + "grad_norm": 0.5762752594900079, + "learning_rate": 5.9694742512210215e-06, + "loss": 0.005, + "step": 20558 + }, + { + "epoch": 8.36071573810492, + "grad_norm": 0.019514431399920235, + "learning_rate": 5.968547025549267e-06, + "loss": 0.0002, + "step": 20559 + }, + { + "epoch": 8.361122407482716, + "grad_norm": 0.07173162052979228, + "learning_rate": 5.967619841262405e-06, + "loss": 0.0006, + "step": 20560 + }, + { + "epoch": 8.361529076860512, + "grad_norm": 1.514703120142873, + "learning_rate": 5.966692698369955e-06, + "loss": 0.0085, + "step": 20561 + }, + { + "epoch": 8.361935746238307, + "grad_norm": 1.7438110311667239, + "learning_rate": 5.965765596881434e-06, + "loss": 0.0221, + "step": 20562 + }, + { + "epoch": 8.362342415616103, + "grad_norm": 4.202128411288971, + "learning_rate": 5.9648385368063585e-06, + "loss": 0.0773, + "step": 20563 + }, + { + "epoch": 8.3627490849939, + "grad_norm": 2.7022951664945274, + "learning_rate": 5.963911518154249e-06, + "loss": 0.0351, + "step": 20564 + }, + { + "epoch": 8.363155754371697, + "grad_norm": 0.650422144700544, + "learning_rate": 5.96298454093462e-06, + "loss": 0.0078, + "step": 20565 + }, + { + "epoch": 8.363562423749492, + "grad_norm": 0.696314156564624, + "learning_rate": 5.962057605156983e-06, + "loss": 0.0058, + "step": 20566 + }, + { + "epoch": 8.363969093127288, + "grad_norm": 0.022637362147035385, + "learning_rate": 5.961130710830858e-06, + "loss": 0.0004, + "step": 20567 + }, + { + "epoch": 8.364375762505084, + "grad_norm": 0.009989657154451529, + "learning_rate": 5.960203857965761e-06, + "loss": 0.0001, + "step": 20568 + }, + { + "epoch": 8.36478243188288, + "grad_norm": 0.013336024517908855, + "learning_rate": 5.9592770465712e-06, + "loss": 0.0002, + "step": 20569 + }, + { + "epoch": 8.365189101260675, + "grad_norm": 1.3344128295546793, + "learning_rate": 5.958350276656697e-06, + "loss": 
0.0127, + "step": 20570 + }, + { + "epoch": 8.365595770638471, + "grad_norm": 0.026392322686486752, + "learning_rate": 5.957423548231764e-06, + "loss": 0.0002, + "step": 20571 + }, + { + "epoch": 8.366002440016267, + "grad_norm": 0.07700491112018233, + "learning_rate": 5.956496861305912e-06, + "loss": 0.0007, + "step": 20572 + }, + { + "epoch": 8.366409109394063, + "grad_norm": 0.22130869105996617, + "learning_rate": 5.9555702158886505e-06, + "loss": 0.0021, + "step": 20573 + }, + { + "epoch": 8.366815778771858, + "grad_norm": 0.005380312541187, + "learning_rate": 5.954643611989501e-06, + "loss": 0.0001, + "step": 20574 + }, + { + "epoch": 8.367222448149654, + "grad_norm": 0.22133185798112393, + "learning_rate": 5.9537170496179704e-06, + "loss": 0.0016, + "step": 20575 + }, + { + "epoch": 8.36762911752745, + "grad_norm": 0.558635444087768, + "learning_rate": 5.952790528783568e-06, + "loss": 0.0061, + "step": 20576 + }, + { + "epoch": 8.368035786905246, + "grad_norm": 0.42010075145548487, + "learning_rate": 5.95186404949581e-06, + "loss": 0.0042, + "step": 20577 + }, + { + "epoch": 8.368442456283042, + "grad_norm": 0.007621421413254894, + "learning_rate": 5.950937611764205e-06, + "loss": 0.0001, + "step": 20578 + }, + { + "epoch": 8.368849125660837, + "grad_norm": 1.1119963512776705, + "learning_rate": 5.950011215598261e-06, + "loss": 0.008, + "step": 20579 + }, + { + "epoch": 8.369255795038633, + "grad_norm": 3.111146780010584, + "learning_rate": 5.949084861007492e-06, + "loss": 0.0575, + "step": 20580 + }, + { + "epoch": 8.369662464416429, + "grad_norm": 6.320511389751284, + "learning_rate": 5.948158548001407e-06, + "loss": 0.0814, + "step": 20581 + }, + { + "epoch": 8.370069133794225, + "grad_norm": 0.02589635155165859, + "learning_rate": 5.947232276589512e-06, + "loss": 0.0002, + "step": 20582 + }, + { + "epoch": 8.37047580317202, + "grad_norm": 0.055404936983869627, + "learning_rate": 5.946306046781317e-06, + "loss": 0.0008, + "step": 20583 + }, + { + "epoch": 8.370882472549816, + "grad_norm": 2.7551561027847375, + "learning_rate": 5.945379858586332e-06, + "loss": 0.0451, + "step": 20584 + }, + { + "epoch": 8.371289141927614, + "grad_norm": 3.2100618151894915, + "learning_rate": 5.9444537120140635e-06, + "loss": 0.0493, + "step": 20585 + }, + { + "epoch": 8.37169581130541, + "grad_norm": 7.236659756679144, + "learning_rate": 5.943527607074017e-06, + "loss": 0.1268, + "step": 20586 + }, + { + "epoch": 8.372102480683205, + "grad_norm": 0.0044853183029768436, + "learning_rate": 5.9426015437757025e-06, + "loss": 0.0, + "step": 20587 + }, + { + "epoch": 8.372509150061001, + "grad_norm": 0.048186600340661544, + "learning_rate": 5.941675522128625e-06, + "loss": 0.0005, + "step": 20588 + }, + { + "epoch": 8.372915819438797, + "grad_norm": 0.007187064426114322, + "learning_rate": 5.94074954214229e-06, + "loss": 0.0001, + "step": 20589 + }, + { + "epoch": 8.373322488816592, + "grad_norm": 0.6065155036091594, + "learning_rate": 5.939823603826205e-06, + "loss": 0.0054, + "step": 20590 + }, + { + "epoch": 8.373729158194388, + "grad_norm": 0.011580214907273283, + "learning_rate": 5.9388977071898746e-06, + "loss": 0.0002, + "step": 20591 + }, + { + "epoch": 8.374135827572184, + "grad_norm": 0.0013129142891266073, + "learning_rate": 5.937971852242801e-06, + "loss": 0.0, + "step": 20592 + }, + { + "epoch": 8.37454249694998, + "grad_norm": 5.195105733646938, + "learning_rate": 5.937046038994492e-06, + "loss": 0.1168, + "step": 20593 + }, + { + "epoch": 8.374949166327776, + "grad_norm": 
0.053390471760316076, + "learning_rate": 5.936120267454451e-06, + "loss": 0.0006, + "step": 20594 + }, + { + "epoch": 8.375355835705571, + "grad_norm": 0.4421701792289192, + "learning_rate": 5.935194537632181e-06, + "loss": 0.0047, + "step": 20595 + }, + { + "epoch": 8.375762505083367, + "grad_norm": 0.9658404898476444, + "learning_rate": 5.934268849537182e-06, + "loss": 0.0112, + "step": 20596 + }, + { + "epoch": 8.376169174461163, + "grad_norm": 0.024658039584559362, + "learning_rate": 5.933343203178962e-06, + "loss": 0.0003, + "step": 20597 + }, + { + "epoch": 8.376575843838959, + "grad_norm": 0.05043170720330769, + "learning_rate": 5.93241759856702e-06, + "loss": 0.0003, + "step": 20598 + }, + { + "epoch": 8.376982513216754, + "grad_norm": 3.0382242888581827, + "learning_rate": 5.931492035710857e-06, + "loss": 0.0296, + "step": 20599 + }, + { + "epoch": 8.37738918259455, + "grad_norm": 0.1624650368114089, + "learning_rate": 5.930566514619977e-06, + "loss": 0.0015, + "step": 20600 + }, + { + "epoch": 8.377795851972346, + "grad_norm": 0.13047733886253363, + "learning_rate": 5.929641035303882e-06, + "loss": 0.0026, + "step": 20601 + }, + { + "epoch": 8.378202521350142, + "grad_norm": 0.04861229663648222, + "learning_rate": 5.928715597772068e-06, + "loss": 0.0004, + "step": 20602 + }, + { + "epoch": 8.378609190727937, + "grad_norm": 1.062957032186365, + "learning_rate": 5.927790202034038e-06, + "loss": 0.008, + "step": 20603 + }, + { + "epoch": 8.379015860105733, + "grad_norm": 0.15868082853171028, + "learning_rate": 5.926864848099292e-06, + "loss": 0.0015, + "step": 20604 + }, + { + "epoch": 8.37942252948353, + "grad_norm": 0.001245115958840264, + "learning_rate": 5.9259395359773255e-06, + "loss": 0.0, + "step": 20605 + }, + { + "epoch": 8.379829198861326, + "grad_norm": 25.136158213853086, + "learning_rate": 5.925014265677643e-06, + "loss": 0.2455, + "step": 20606 + }, + { + "epoch": 8.380235868239122, + "grad_norm": 0.29563716060903655, + "learning_rate": 5.9240890372097394e-06, + "loss": 0.0022, + "step": 20607 + }, + { + "epoch": 8.380642537616918, + "grad_norm": 0.9519844001598787, + "learning_rate": 5.923163850583114e-06, + "loss": 0.0101, + "step": 20608 + }, + { + "epoch": 8.381049206994714, + "grad_norm": 0.03331931038642054, + "learning_rate": 5.9222387058072604e-06, + "loss": 0.0005, + "step": 20609 + }, + { + "epoch": 8.38145587637251, + "grad_norm": 1.6758716668343325, + "learning_rate": 5.9213136028916804e-06, + "loss": 0.0173, + "step": 20610 + }, + { + "epoch": 8.381862545750305, + "grad_norm": 0.017072639785437854, + "learning_rate": 5.920388541845871e-06, + "loss": 0.0001, + "step": 20611 + }, + { + "epoch": 8.382269215128101, + "grad_norm": 0.601649273435692, + "learning_rate": 5.919463522679323e-06, + "loss": 0.0053, + "step": 20612 + }, + { + "epoch": 8.382675884505897, + "grad_norm": 13.04009479072848, + "learning_rate": 5.918538545401537e-06, + "loss": 0.4566, + "step": 20613 + }, + { + "epoch": 8.383082553883693, + "grad_norm": 0.018120768201845997, + "learning_rate": 5.917613610022008e-06, + "loss": 0.0001, + "step": 20614 + }, + { + "epoch": 8.383489223261488, + "grad_norm": 1.0836986673476143, + "learning_rate": 5.91668871655023e-06, + "loss": 0.0128, + "step": 20615 + }, + { + "epoch": 8.383895892639284, + "grad_norm": 0.22915348297102472, + "learning_rate": 5.915763864995696e-06, + "loss": 0.0032, + "step": 20616 + }, + { + "epoch": 8.38430256201708, + "grad_norm": 0.014047541654481225, + "learning_rate": 5.914839055367902e-06, + "loss": 0.0002, + "step": 
20617 + }, + { + "epoch": 8.384709231394876, + "grad_norm": 0.21341334351775595, + "learning_rate": 5.9139142876763415e-06, + "loss": 0.0026, + "step": 20618 + }, + { + "epoch": 8.385115900772671, + "grad_norm": 0.20360894286648948, + "learning_rate": 5.912989561930507e-06, + "loss": 0.0016, + "step": 20619 + }, + { + "epoch": 8.385522570150467, + "grad_norm": 7.31326123230184, + "learning_rate": 5.912064878139893e-06, + "loss": 0.0915, + "step": 20620 + }, + { + "epoch": 8.385929239528263, + "grad_norm": 0.06628806301265097, + "learning_rate": 5.911140236313989e-06, + "loss": 0.0009, + "step": 20621 + }, + { + "epoch": 8.386335908906059, + "grad_norm": 0.004398826343134666, + "learning_rate": 5.910215636462286e-06, + "loss": 0.0001, + "step": 20622 + }, + { + "epoch": 8.386742578283854, + "grad_norm": 0.1782413836068129, + "learning_rate": 5.909291078594281e-06, + "loss": 0.0035, + "step": 20623 + }, + { + "epoch": 8.38714924766165, + "grad_norm": 1.7240404488558445, + "learning_rate": 5.908366562719463e-06, + "loss": 0.0158, + "step": 20624 + }, + { + "epoch": 8.387555917039446, + "grad_norm": 0.009556737033468314, + "learning_rate": 5.907442088847318e-06, + "loss": 0.0, + "step": 20625 + }, + { + "epoch": 8.387962586417244, + "grad_norm": 0.06527981577365893, + "learning_rate": 5.906517656987343e-06, + "loss": 0.0008, + "step": 20626 + }, + { + "epoch": 8.38836925579504, + "grad_norm": 1.1271182952377687, + "learning_rate": 5.905593267149024e-06, + "loss": 0.0123, + "step": 20627 + }, + { + "epoch": 8.388775925172835, + "grad_norm": 0.0010634218604060484, + "learning_rate": 5.904668919341847e-06, + "loss": 0.0, + "step": 20628 + }, + { + "epoch": 8.38918259455063, + "grad_norm": 0.2904578389996834, + "learning_rate": 5.903744613575309e-06, + "loss": 0.0033, + "step": 20629 + }, + { + "epoch": 8.389589263928427, + "grad_norm": 0.0118117588686034, + "learning_rate": 5.902820349858892e-06, + "loss": 0.0001, + "step": 20630 + }, + { + "epoch": 8.389995933306222, + "grad_norm": 2.807977277241362, + "learning_rate": 5.901896128202085e-06, + "loss": 0.0452, + "step": 20631 + }, + { + "epoch": 8.390402602684018, + "grad_norm": 0.03351114351349487, + "learning_rate": 5.90097194861438e-06, + "loss": 0.0003, + "step": 20632 + }, + { + "epoch": 8.390809272061814, + "grad_norm": 0.8951151568026942, + "learning_rate": 5.900047811105259e-06, + "loss": 0.0064, + "step": 20633 + }, + { + "epoch": 8.39121594143961, + "grad_norm": 0.08477490643976267, + "learning_rate": 5.899123715684212e-06, + "loss": 0.001, + "step": 20634 + }, + { + "epoch": 8.391622610817405, + "grad_norm": 4.243639711469398, + "learning_rate": 5.898199662360722e-06, + "loss": 0.0279, + "step": 20635 + }, + { + "epoch": 8.392029280195201, + "grad_norm": 0.058699045578947756, + "learning_rate": 5.897275651144279e-06, + "loss": 0.0009, + "step": 20636 + }, + { + "epoch": 8.392435949572997, + "grad_norm": 0.20437061552010158, + "learning_rate": 5.896351682044366e-06, + "loss": 0.0022, + "step": 20637 + }, + { + "epoch": 8.392842618950793, + "grad_norm": 0.005723436428543587, + "learning_rate": 5.895427755070465e-06, + "loss": 0.0001, + "step": 20638 + }, + { + "epoch": 8.393249288328589, + "grad_norm": 2.405130615013193, + "learning_rate": 5.894503870232067e-06, + "loss": 0.0234, + "step": 20639 + }, + { + "epoch": 8.393655957706384, + "grad_norm": 0.0485453579469768, + "learning_rate": 5.893580027538653e-06, + "loss": 0.0004, + "step": 20640 + }, + { + "epoch": 8.39406262708418, + "grad_norm": 1.2134650401316363, + "learning_rate": 
5.8926562269997044e-06, + "loss": 0.011, + "step": 20641 + }, + { + "epoch": 8.394469296461976, + "grad_norm": 1.854051187216036, + "learning_rate": 5.891732468624709e-06, + "loss": 0.0195, + "step": 20642 + }, + { + "epoch": 8.394875965839772, + "grad_norm": 0.16050026901352676, + "learning_rate": 5.890808752423148e-06, + "loss": 0.0023, + "step": 20643 + }, + { + "epoch": 8.395282635217567, + "grad_norm": 0.907559941563137, + "learning_rate": 5.889885078404501e-06, + "loss": 0.0111, + "step": 20644 + }, + { + "epoch": 8.395689304595363, + "grad_norm": 0.012902388793919101, + "learning_rate": 5.888961446578254e-06, + "loss": 0.0002, + "step": 20645 + }, + { + "epoch": 8.39609597397316, + "grad_norm": 0.32609781678877275, + "learning_rate": 5.888037856953887e-06, + "loss": 0.0023, + "step": 20646 + }, + { + "epoch": 8.396502643350956, + "grad_norm": 11.673082374774337, + "learning_rate": 5.887114309540881e-06, + "loss": 0.2495, + "step": 20647 + }, + { + "epoch": 8.396909312728752, + "grad_norm": 0.007113102490042586, + "learning_rate": 5.886190804348715e-06, + "loss": 0.0001, + "step": 20648 + }, + { + "epoch": 8.397315982106548, + "grad_norm": 10.2716281802463, + "learning_rate": 5.885267341386872e-06, + "loss": 0.0792, + "step": 20649 + }, + { + "epoch": 8.397722651484344, + "grad_norm": 0.21495263001616086, + "learning_rate": 5.88434392066483e-06, + "loss": 0.0026, + "step": 20650 + }, + { + "epoch": 8.39812932086214, + "grad_norm": 0.004106915521753263, + "learning_rate": 5.883420542192073e-06, + "loss": 0.0001, + "step": 20651 + }, + { + "epoch": 8.398535990239935, + "grad_norm": 0.11490275221747048, + "learning_rate": 5.882497205978073e-06, + "loss": 0.0015, + "step": 20652 + }, + { + "epoch": 8.398942659617731, + "grad_norm": 0.13028476153232832, + "learning_rate": 5.881573912032309e-06, + "loss": 0.0017, + "step": 20653 + }, + { + "epoch": 8.399349328995527, + "grad_norm": 0.5980541274056833, + "learning_rate": 5.8806506603642644e-06, + "loss": 0.0065, + "step": 20654 + }, + { + "epoch": 8.399755998373323, + "grad_norm": 4.134493181298521, + "learning_rate": 5.879727450983412e-06, + "loss": 0.0477, + "step": 20655 + }, + { + "epoch": 8.400162667751118, + "grad_norm": 0.04148251222554834, + "learning_rate": 5.878804283899231e-06, + "loss": 0.0004, + "step": 20656 + }, + { + "epoch": 8.400569337128914, + "grad_norm": 5.726995293056298, + "learning_rate": 5.8778811591212e-06, + "loss": 0.047, + "step": 20657 + }, + { + "epoch": 8.40097600650671, + "grad_norm": 0.047923391396847294, + "learning_rate": 5.876958076658794e-06, + "loss": 0.0005, + "step": 20658 + }, + { + "epoch": 8.401382675884506, + "grad_norm": 2.1526794782743, + "learning_rate": 5.876035036521486e-06, + "loss": 0.0268, + "step": 20659 + }, + { + "epoch": 8.401789345262301, + "grad_norm": 0.6454326988831152, + "learning_rate": 5.8751120387187554e-06, + "loss": 0.0056, + "step": 20660 + }, + { + "epoch": 8.402196014640097, + "grad_norm": 0.43727685204462563, + "learning_rate": 5.8741890832600754e-06, + "loss": 0.0041, + "step": 20661 + }, + { + "epoch": 8.402602684017893, + "grad_norm": 4.748134076554621, + "learning_rate": 5.873266170154921e-06, + "loss": 0.0706, + "step": 20662 + }, + { + "epoch": 8.403009353395689, + "grad_norm": 3.942292525887839, + "learning_rate": 5.872343299412765e-06, + "loss": 0.0322, + "step": 20663 + }, + { + "epoch": 8.403416022773484, + "grad_norm": 0.0002708486125343145, + "learning_rate": 5.871420471043085e-06, + "loss": 0.0, + "step": 20664 + }, + { + "epoch": 8.40382269215128, + 
"grad_norm": 0.09296520317306024, + "learning_rate": 5.870497685055349e-06, + "loss": 0.0017, + "step": 20665 + }, + { + "epoch": 8.404229361529076, + "grad_norm": 1.3135974520351494, + "learning_rate": 5.869574941459033e-06, + "loss": 0.0073, + "step": 20666 + }, + { + "epoch": 8.404636030906874, + "grad_norm": 0.4849651791488953, + "learning_rate": 5.86865224026361e-06, + "loss": 0.0085, + "step": 20667 + }, + { + "epoch": 8.40504270028467, + "grad_norm": 0.23666020170943305, + "learning_rate": 5.867729581478553e-06, + "loss": 0.0016, + "step": 20668 + }, + { + "epoch": 8.405449369662465, + "grad_norm": 5.5901354027882455, + "learning_rate": 5.866806965113327e-06, + "loss": 0.0635, + "step": 20669 + }, + { + "epoch": 8.40585603904026, + "grad_norm": 0.006973496754549349, + "learning_rate": 5.865884391177412e-06, + "loss": 0.0001, + "step": 20670 + }, + { + "epoch": 8.406262708418057, + "grad_norm": 0.004440677094583152, + "learning_rate": 5.864961859680273e-06, + "loss": 0.0001, + "step": 20671 + }, + { + "epoch": 8.406669377795852, + "grad_norm": 0.0031737793138630053, + "learning_rate": 5.8640393706313805e-06, + "loss": 0.0, + "step": 20672 + }, + { + "epoch": 8.407076047173648, + "grad_norm": 1.8643514202280371, + "learning_rate": 5.863116924040207e-06, + "loss": 0.0159, + "step": 20673 + }, + { + "epoch": 8.407482716551444, + "grad_norm": 1.529683673322224, + "learning_rate": 5.862194519916219e-06, + "loss": 0.0163, + "step": 20674 + }, + { + "epoch": 8.40788938592924, + "grad_norm": 0.002175014551620027, + "learning_rate": 5.861272158268888e-06, + "loss": 0.0, + "step": 20675 + }, + { + "epoch": 8.408296055307035, + "grad_norm": 0.03560571553475933, + "learning_rate": 5.8603498391076796e-06, + "loss": 0.0003, + "step": 20676 + }, + { + "epoch": 8.408702724684831, + "grad_norm": 4.105780208240152, + "learning_rate": 5.859427562442064e-06, + "loss": 0.0922, + "step": 20677 + }, + { + "epoch": 8.409109394062627, + "grad_norm": 0.023512715282377052, + "learning_rate": 5.8585053282815094e-06, + "loss": 0.0004, + "step": 20678 + }, + { + "epoch": 8.409516063440423, + "grad_norm": 0.006687680559246166, + "learning_rate": 5.857583136635481e-06, + "loss": 0.0001, + "step": 20679 + }, + { + "epoch": 8.409922732818218, + "grad_norm": 0.03030404939197431, + "learning_rate": 5.856660987513447e-06, + "loss": 0.0004, + "step": 20680 + }, + { + "epoch": 8.410329402196014, + "grad_norm": 0.056422945042711774, + "learning_rate": 5.855738880924874e-06, + "loss": 0.0005, + "step": 20681 + }, + { + "epoch": 8.41073607157381, + "grad_norm": 0.025285683022055246, + "learning_rate": 5.8548168168792255e-06, + "loss": 0.0003, + "step": 20682 + }, + { + "epoch": 8.411142740951606, + "grad_norm": 0.41960666977427263, + "learning_rate": 5.853894795385971e-06, + "loss": 0.0044, + "step": 20683 + }, + { + "epoch": 8.411549410329402, + "grad_norm": 0.005894849242879533, + "learning_rate": 5.852972816454571e-06, + "loss": 0.0001, + "step": 20684 + }, + { + "epoch": 8.411956079707197, + "grad_norm": 0.2078121871667611, + "learning_rate": 5.852050880094491e-06, + "loss": 0.0028, + "step": 20685 + }, + { + "epoch": 8.412362749084993, + "grad_norm": 0.10893925139346923, + "learning_rate": 5.851128986315198e-06, + "loss": 0.001, + "step": 20686 + }, + { + "epoch": 8.41276941846279, + "grad_norm": 4.496854881734692, + "learning_rate": 5.850207135126156e-06, + "loss": 0.0831, + "step": 20687 + }, + { + "epoch": 8.413176087840586, + "grad_norm": 0.003203335546327521, + "learning_rate": 5.8492853265368245e-06, + "loss": 
0.0, + "step": 20688 + }, + { + "epoch": 8.413582757218382, + "grad_norm": 0.464449287262141, + "learning_rate": 5.848363560556667e-06, + "loss": 0.0056, + "step": 20689 + }, + { + "epoch": 8.413989426596178, + "grad_norm": 0.06875402372056949, + "learning_rate": 5.847441837195149e-06, + "loss": 0.0004, + "step": 20690 + }, + { + "epoch": 8.414396095973974, + "grad_norm": 5.746711123463195, + "learning_rate": 5.84652015646173e-06, + "loss": 0.1531, + "step": 20691 + }, + { + "epoch": 8.41480276535177, + "grad_norm": 0.25757144545055466, + "learning_rate": 5.84559851836587e-06, + "loss": 0.0032, + "step": 20692 + }, + { + "epoch": 8.415209434729565, + "grad_norm": 0.13212388736594474, + "learning_rate": 5.844676922917034e-06, + "loss": 0.001, + "step": 20693 + }, + { + "epoch": 8.415616104107361, + "grad_norm": 0.000549543829415065, + "learning_rate": 5.843755370124682e-06, + "loss": 0.0, + "step": 20694 + }, + { + "epoch": 8.416022773485157, + "grad_norm": 0.42433827302057725, + "learning_rate": 5.842833859998269e-06, + "loss": 0.0052, + "step": 20695 + }, + { + "epoch": 8.416429442862952, + "grad_norm": 2.4449058028265918, + "learning_rate": 5.841912392547262e-06, + "loss": 0.0475, + "step": 20696 + }, + { + "epoch": 8.416836112240748, + "grad_norm": 0.07693607400764771, + "learning_rate": 5.840990967781117e-06, + "loss": 0.0011, + "step": 20697 + }, + { + "epoch": 8.417242781618544, + "grad_norm": 0.47319494314482724, + "learning_rate": 5.8400695857092905e-06, + "loss": 0.0036, + "step": 20698 + }, + { + "epoch": 8.41764945099634, + "grad_norm": 3.4721537184580424, + "learning_rate": 5.8391482463412455e-06, + "loss": 0.05, + "step": 20699 + }, + { + "epoch": 8.418056120374136, + "grad_norm": 0.08630151463752214, + "learning_rate": 5.838226949686435e-06, + "loss": 0.0007, + "step": 20700 + }, + { + "epoch": 8.418462789751931, + "grad_norm": 0.09411198024015938, + "learning_rate": 5.837305695754323e-06, + "loss": 0.0012, + "step": 20701 + }, + { + "epoch": 8.418869459129727, + "grad_norm": 1.6127144094667936, + "learning_rate": 5.836384484554362e-06, + "loss": 0.0215, + "step": 20702 + }, + { + "epoch": 8.419276128507523, + "grad_norm": 0.006586859606234077, + "learning_rate": 5.835463316096011e-06, + "loss": 0.0001, + "step": 20703 + }, + { + "epoch": 8.419682797885319, + "grad_norm": 0.8426069614464672, + "learning_rate": 5.8345421903887235e-06, + "loss": 0.0085, + "step": 20704 + }, + { + "epoch": 8.420089467263114, + "grad_norm": 4.6481923233015845, + "learning_rate": 5.833621107441957e-06, + "loss": 0.1166, + "step": 20705 + }, + { + "epoch": 8.42049613664091, + "grad_norm": 0.07565278234072267, + "learning_rate": 5.832700067265167e-06, + "loss": 0.0005, + "step": 20706 + }, + { + "epoch": 8.420902806018706, + "grad_norm": 0.6547185988031958, + "learning_rate": 5.831779069867805e-06, + "loss": 0.0075, + "step": 20707 + }, + { + "epoch": 8.421309475396503, + "grad_norm": 0.010065316762491477, + "learning_rate": 5.830858115259331e-06, + "loss": 0.0001, + "step": 20708 + }, + { + "epoch": 8.4217161447743, + "grad_norm": 0.0002685150571641442, + "learning_rate": 5.829937203449199e-06, + "loss": 0.0, + "step": 20709 + }, + { + "epoch": 8.422122814152095, + "grad_norm": 0.050927545029000126, + "learning_rate": 5.829016334446858e-06, + "loss": 0.0006, + "step": 20710 + }, + { + "epoch": 8.42252948352989, + "grad_norm": 0.015062905857031277, + "learning_rate": 5.828095508261764e-06, + "loss": 0.0002, + "step": 20711 + }, + { + "epoch": 8.422936152907686, + "grad_norm": 0.7574912696088671, + 
"learning_rate": 5.827174724903371e-06, + "loss": 0.0035, + "step": 20712 + }, + { + "epoch": 8.423342822285482, + "grad_norm": 0.03530704000751877, + "learning_rate": 5.826253984381124e-06, + "loss": 0.0004, + "step": 20713 + }, + { + "epoch": 8.423749491663278, + "grad_norm": 0.3370815670875443, + "learning_rate": 5.825333286704485e-06, + "loss": 0.0034, + "step": 20714 + }, + { + "epoch": 8.424156161041074, + "grad_norm": 0.24684057454909392, + "learning_rate": 5.824412631882901e-06, + "loss": 0.0026, + "step": 20715 + }, + { + "epoch": 8.42456283041887, + "grad_norm": 0.037401326953025435, + "learning_rate": 5.8234920199258224e-06, + "loss": 0.0004, + "step": 20716 + }, + { + "epoch": 8.424969499796665, + "grad_norm": 0.4548892618084865, + "learning_rate": 5.8225714508427e-06, + "loss": 0.0078, + "step": 20717 + }, + { + "epoch": 8.425376169174461, + "grad_norm": 0.04092993730034164, + "learning_rate": 5.821650924642985e-06, + "loss": 0.0003, + "step": 20718 + }, + { + "epoch": 8.425782838552257, + "grad_norm": 1.1508996837813659, + "learning_rate": 5.820730441336125e-06, + "loss": 0.0064, + "step": 20719 + }, + { + "epoch": 8.426189507930053, + "grad_norm": 0.0340443342243865, + "learning_rate": 5.819810000931567e-06, + "loss": 0.0004, + "step": 20720 + }, + { + "epoch": 8.426596177307848, + "grad_norm": 2.4065246160202363, + "learning_rate": 5.818889603438769e-06, + "loss": 0.0267, + "step": 20721 + }, + { + "epoch": 8.427002846685644, + "grad_norm": 0.005795515781254526, + "learning_rate": 5.817969248867171e-06, + "loss": 0.0001, + "step": 20722 + }, + { + "epoch": 8.42740951606344, + "grad_norm": 0.01815391492394389, + "learning_rate": 5.817048937226224e-06, + "loss": 0.0002, + "step": 20723 + }, + { + "epoch": 8.427816185441236, + "grad_norm": 1.3137280294359024, + "learning_rate": 5.816128668525376e-06, + "loss": 0.015, + "step": 20724 + }, + { + "epoch": 8.428222854819031, + "grad_norm": 0.03448969311651341, + "learning_rate": 5.815208442774072e-06, + "loss": 0.0002, + "step": 20725 + }, + { + "epoch": 8.428629524196827, + "grad_norm": 0.9364983207976265, + "learning_rate": 5.8142882599817555e-06, + "loss": 0.009, + "step": 20726 + }, + { + "epoch": 8.429036193574623, + "grad_norm": 0.002074637437323846, + "learning_rate": 5.813368120157881e-06, + "loss": 0.0, + "step": 20727 + }, + { + "epoch": 8.42944286295242, + "grad_norm": 0.027176958068284744, + "learning_rate": 5.812448023311889e-06, + "loss": 0.0003, + "step": 20728 + }, + { + "epoch": 8.429849532330216, + "grad_norm": 0.014514065618202177, + "learning_rate": 5.811527969453226e-06, + "loss": 0.0003, + "step": 20729 + }, + { + "epoch": 8.430256201708012, + "grad_norm": 0.052108096544156296, + "learning_rate": 5.810607958591337e-06, + "loss": 0.0006, + "step": 20730 + }, + { + "epoch": 8.430662871085808, + "grad_norm": 0.009396801685871826, + "learning_rate": 5.809687990735665e-06, + "loss": 0.0001, + "step": 20731 + }, + { + "epoch": 8.431069540463604, + "grad_norm": 0.1201995710666547, + "learning_rate": 5.808768065895656e-06, + "loss": 0.0011, + "step": 20732 + }, + { + "epoch": 8.4314762098414, + "grad_norm": 0.7949083522459507, + "learning_rate": 5.807848184080747e-06, + "loss": 0.0061, + "step": 20733 + }, + { + "epoch": 8.431882879219195, + "grad_norm": 0.01135665900469276, + "learning_rate": 5.80692834530039e-06, + "loss": 0.0001, + "step": 20734 + }, + { + "epoch": 8.43228954859699, + "grad_norm": 0.8881516977783219, + "learning_rate": 5.8060085495640235e-06, + "loss": 0.0116, + "step": 20735 + }, + { + "epoch": 
8.432696217974787, + "grad_norm": 0.009594145596382712, + "learning_rate": 5.805088796881092e-06, + "loss": 0.0001, + "step": 20736 + }, + { + "epoch": 8.433102887352582, + "grad_norm": 2.1272902987724542, + "learning_rate": 5.804169087261033e-06, + "loss": 0.012, + "step": 20737 + }, + { + "epoch": 8.433509556730378, + "grad_norm": 0.07238201995576594, + "learning_rate": 5.803249420713293e-06, + "loss": 0.0006, + "step": 20738 + }, + { + "epoch": 8.433916226108174, + "grad_norm": 3.3123187433258128, + "learning_rate": 5.802329797247304e-06, + "loss": 0.0825, + "step": 20739 + }, + { + "epoch": 8.43432289548597, + "grad_norm": 0.060927016236986944, + "learning_rate": 5.801410216872516e-06, + "loss": 0.001, + "step": 20740 + }, + { + "epoch": 8.434729564863765, + "grad_norm": 0.03318486167765936, + "learning_rate": 5.800490679598365e-06, + "loss": 0.0005, + "step": 20741 + }, + { + "epoch": 8.435136234241561, + "grad_norm": 0.05122845650717972, + "learning_rate": 5.799571185434292e-06, + "loss": 0.0004, + "step": 20742 + }, + { + "epoch": 8.435542903619357, + "grad_norm": 8.428946031238251, + "learning_rate": 5.798651734389734e-06, + "loss": 0.2202, + "step": 20743 + }, + { + "epoch": 8.435949572997153, + "grad_norm": 0.012488480174826153, + "learning_rate": 5.79773232647413e-06, + "loss": 0.0002, + "step": 20744 + }, + { + "epoch": 8.436356242374949, + "grad_norm": 3.5969119993851786, + "learning_rate": 5.79681296169692e-06, + "loss": 0.0336, + "step": 20745 + }, + { + "epoch": 8.436762911752744, + "grad_norm": 0.002855443733648994, + "learning_rate": 5.795893640067536e-06, + "loss": 0.0, + "step": 20746 + }, + { + "epoch": 8.43716958113054, + "grad_norm": 0.08706703836673231, + "learning_rate": 5.794974361595423e-06, + "loss": 0.0009, + "step": 20747 + }, + { + "epoch": 8.437576250508336, + "grad_norm": 4.820785857136663, + "learning_rate": 5.794055126290015e-06, + "loss": 0.0388, + "step": 20748 + }, + { + "epoch": 8.437982919886133, + "grad_norm": 0.12394586249580812, + "learning_rate": 5.793135934160747e-06, + "loss": 0.002, + "step": 20749 + }, + { + "epoch": 8.43838958926393, + "grad_norm": 2.6937446822342537, + "learning_rate": 5.792216785217058e-06, + "loss": 0.0407, + "step": 20750 + }, + { + "epoch": 8.438796258641725, + "grad_norm": 0.02461587634361315, + "learning_rate": 5.791297679468379e-06, + "loss": 0.0002, + "step": 20751 + }, + { + "epoch": 8.43920292801952, + "grad_norm": 3.424952655290108, + "learning_rate": 5.7903786169241485e-06, + "loss": 0.0352, + "step": 20752 + }, + { + "epoch": 8.439609597397316, + "grad_norm": 2.4666830468926184, + "learning_rate": 5.789459597593801e-06, + "loss": 0.0333, + "step": 20753 + }, + { + "epoch": 8.440016266775112, + "grad_norm": 0.0623384034200843, + "learning_rate": 5.788540621486769e-06, + "loss": 0.0007, + "step": 20754 + }, + { + "epoch": 8.440422936152908, + "grad_norm": 0.45017689928219945, + "learning_rate": 5.7876216886124855e-06, + "loss": 0.0054, + "step": 20755 + }, + { + "epoch": 8.440829605530704, + "grad_norm": 2.8814617171011583, + "learning_rate": 5.786702798980388e-06, + "loss": 0.0554, + "step": 20756 + }, + { + "epoch": 8.4412362749085, + "grad_norm": 0.014886743672755724, + "learning_rate": 5.785783952599902e-06, + "loss": 0.0002, + "step": 20757 + }, + { + "epoch": 8.441642944286295, + "grad_norm": 0.23931896528456623, + "learning_rate": 5.784865149480468e-06, + "loss": 0.0023, + "step": 20758 + }, + { + "epoch": 8.442049613664091, + "grad_norm": 0.16828670949207078, + "learning_rate": 5.783946389631516e-06, + 
"loss": 0.002, + "step": 20759 + }, + { + "epoch": 8.442456283041887, + "grad_norm": 1.7920586446345703, + "learning_rate": 5.783027673062476e-06, + "loss": 0.0127, + "step": 20760 + }, + { + "epoch": 8.442862952419683, + "grad_norm": 0.004061937879018989, + "learning_rate": 5.782108999782778e-06, + "loss": 0.0, + "step": 20761 + }, + { + "epoch": 8.443269621797478, + "grad_norm": 0.16511106872544268, + "learning_rate": 5.781190369801855e-06, + "loss": 0.001, + "step": 20762 + }, + { + "epoch": 8.443676291175274, + "grad_norm": 1.9342545084164626, + "learning_rate": 5.7802717831291354e-06, + "loss": 0.0195, + "step": 20763 + }, + { + "epoch": 8.44408296055307, + "grad_norm": 1.3845897772323212, + "learning_rate": 5.779353239774046e-06, + "loss": 0.017, + "step": 20764 + }, + { + "epoch": 8.444489629930866, + "grad_norm": 0.003808262304160257, + "learning_rate": 5.778434739746024e-06, + "loss": 0.0, + "step": 20765 + }, + { + "epoch": 8.444896299308661, + "grad_norm": 0.9902347949951643, + "learning_rate": 5.777516283054494e-06, + "loss": 0.0122, + "step": 20766 + }, + { + "epoch": 8.445302968686457, + "grad_norm": 3.343003080265983, + "learning_rate": 5.776597869708884e-06, + "loss": 0.0937, + "step": 20767 + }, + { + "epoch": 8.445709638064255, + "grad_norm": 0.0011727506765015282, + "learning_rate": 5.7756794997186235e-06, + "loss": 0.0, + "step": 20768 + }, + { + "epoch": 8.44611630744205, + "grad_norm": 0.6892625272399895, + "learning_rate": 5.774761173093139e-06, + "loss": 0.0124, + "step": 20769 + }, + { + "epoch": 8.446522976819846, + "grad_norm": 0.011178007882846813, + "learning_rate": 5.773842889841852e-06, + "loss": 0.0002, + "step": 20770 + }, + { + "epoch": 8.446929646197642, + "grad_norm": 0.017902386519040046, + "learning_rate": 5.772924649974201e-06, + "loss": 0.0002, + "step": 20771 + }, + { + "epoch": 8.447336315575438, + "grad_norm": 0.055966002810685624, + "learning_rate": 5.772006453499603e-06, + "loss": 0.0007, + "step": 20772 + }, + { + "epoch": 8.447742984953234, + "grad_norm": 4.522122450249419, + "learning_rate": 5.77108830042749e-06, + "loss": 0.0624, + "step": 20773 + }, + { + "epoch": 8.44814965433103, + "grad_norm": 0.059469094642646304, + "learning_rate": 5.770170190767281e-06, + "loss": 0.0005, + "step": 20774 + }, + { + "epoch": 8.448556323708825, + "grad_norm": 0.35763970314281174, + "learning_rate": 5.769252124528406e-06, + "loss": 0.0026, + "step": 20775 + }, + { + "epoch": 8.44896299308662, + "grad_norm": 1.6992706890019629, + "learning_rate": 5.768334101720286e-06, + "loss": 0.0185, + "step": 20776 + }, + { + "epoch": 8.449369662464417, + "grad_norm": 11.89730636532591, + "learning_rate": 5.767416122352343e-06, + "loss": 0.2095, + "step": 20777 + }, + { + "epoch": 8.449776331842212, + "grad_norm": 0.0028721491548295785, + "learning_rate": 5.766498186434007e-06, + "loss": 0.0, + "step": 20778 + }, + { + "epoch": 8.450183001220008, + "grad_norm": 0.008388804219331964, + "learning_rate": 5.765580293974698e-06, + "loss": 0.0001, + "step": 20779 + }, + { + "epoch": 8.450589670597804, + "grad_norm": 4.380124474772405, + "learning_rate": 5.764662444983837e-06, + "loss": 0.0351, + "step": 20780 + }, + { + "epoch": 8.4509963399756, + "grad_norm": 0.0405359554789888, + "learning_rate": 5.76374463947085e-06, + "loss": 0.0005, + "step": 20781 + }, + { + "epoch": 8.451403009353395, + "grad_norm": 0.0713993685659457, + "learning_rate": 5.762826877445156e-06, + "loss": 0.0006, + "step": 20782 + }, + { + "epoch": 8.451809678731191, + "grad_norm": 0.11322058461285049, 
+ "learning_rate": 5.761909158916176e-06, + "loss": 0.0009, + "step": 20783 + }, + { + "epoch": 8.452216348108987, + "grad_norm": 0.005837262797631465, + "learning_rate": 5.760991483893328e-06, + "loss": 0.0001, + "step": 20784 + }, + { + "epoch": 8.452623017486783, + "grad_norm": 11.35950950401253, + "learning_rate": 5.760073852386039e-06, + "loss": 0.3046, + "step": 20785 + }, + { + "epoch": 8.453029686864578, + "grad_norm": 2.1753063498512515, + "learning_rate": 5.759156264403726e-06, + "loss": 0.0202, + "step": 20786 + }, + { + "epoch": 8.453436356242374, + "grad_norm": 0.0001395351161853153, + "learning_rate": 5.758238719955809e-06, + "loss": 0.0, + "step": 20787 + }, + { + "epoch": 8.45384302562017, + "grad_norm": 2.8031491305010077, + "learning_rate": 5.7573212190517045e-06, + "loss": 0.0258, + "step": 20788 + }, + { + "epoch": 8.454249694997966, + "grad_norm": 2.7630870205305507, + "learning_rate": 5.756403761700835e-06, + "loss": 0.0135, + "step": 20789 + }, + { + "epoch": 8.454656364375763, + "grad_norm": 0.005569530640452908, + "learning_rate": 5.755486347912611e-06, + "loss": 0.0001, + "step": 20790 + }, + { + "epoch": 8.455063033753559, + "grad_norm": 1.1742664573380468, + "learning_rate": 5.754568977696462e-06, + "loss": 0.0167, + "step": 20791 + }, + { + "epoch": 8.455469703131355, + "grad_norm": 0.008663264290109227, + "learning_rate": 5.7536516510617975e-06, + "loss": 0.0001, + "step": 20792 + }, + { + "epoch": 8.45587637250915, + "grad_norm": 0.026295639446707424, + "learning_rate": 5.752734368018036e-06, + "loss": 0.0001, + "step": 20793 + }, + { + "epoch": 8.456283041886946, + "grad_norm": 6.141915665581475, + "learning_rate": 5.751817128574595e-06, + "loss": 0.0226, + "step": 20794 + }, + { + "epoch": 8.456689711264742, + "grad_norm": 0.07293737678147802, + "learning_rate": 5.7508999327408875e-06, + "loss": 0.0008, + "step": 20795 + }, + { + "epoch": 8.457096380642538, + "grad_norm": 3.4898497903804455, + "learning_rate": 5.749982780526333e-06, + "loss": 0.0345, + "step": 20796 + }, + { + "epoch": 8.457503050020334, + "grad_norm": 0.013059923374503062, + "learning_rate": 5.749065671940338e-06, + "loss": 0.0002, + "step": 20797 + }, + { + "epoch": 8.45790971939813, + "grad_norm": 1.4811252997718647, + "learning_rate": 5.748148606992328e-06, + "loss": 0.0119, + "step": 20798 + }, + { + "epoch": 8.458316388775925, + "grad_norm": 0.0018009017722231447, + "learning_rate": 5.747231585691713e-06, + "loss": 0.0, + "step": 20799 + }, + { + "epoch": 8.458723058153721, + "grad_norm": 0.7892863182952506, + "learning_rate": 5.746314608047906e-06, + "loss": 0.0077, + "step": 20800 + }, + { + "epoch": 8.459129727531517, + "grad_norm": 0.18261005870411, + "learning_rate": 5.7453976740703214e-06, + "loss": 0.0021, + "step": 20801 + }, + { + "epoch": 8.459536396909312, + "grad_norm": 0.0035733146293870735, + "learning_rate": 5.744480783768369e-06, + "loss": 0.0, + "step": 20802 + }, + { + "epoch": 8.459943066287108, + "grad_norm": 7.783544016496071, + "learning_rate": 5.74356393715146e-06, + "loss": 0.1223, + "step": 20803 + }, + { + "epoch": 8.460349735664904, + "grad_norm": 0.08969051294065082, + "learning_rate": 5.742647134229014e-06, + "loss": 0.0014, + "step": 20804 + }, + { + "epoch": 8.4607564050427, + "grad_norm": 0.015119388176053444, + "learning_rate": 5.741730375010437e-06, + "loss": 0.0001, + "step": 20805 + }, + { + "epoch": 8.461163074420496, + "grad_norm": 0.5575500203981918, + "learning_rate": 5.74081365950514e-06, + "loss": 0.0058, + "step": 20806 + }, + { + "epoch": 
8.461569743798291, + "grad_norm": 0.031142690030191933, + "learning_rate": 5.739896987722535e-06, + "loss": 0.0004, + "step": 20807 + }, + { + "epoch": 8.461976413176087, + "grad_norm": 0.15281362298213924, + "learning_rate": 5.738980359672033e-06, + "loss": 0.0012, + "step": 20808 + }, + { + "epoch": 8.462383082553885, + "grad_norm": 0.028377346123968614, + "learning_rate": 5.738063775363041e-06, + "loss": 0.0002, + "step": 20809 + }, + { + "epoch": 8.46278975193168, + "grad_norm": 0.002009731106155973, + "learning_rate": 5.737147234804965e-06, + "loss": 0.0, + "step": 20810 + }, + { + "epoch": 8.463196421309476, + "grad_norm": 0.06027912598254396, + "learning_rate": 5.736230738007222e-06, + "loss": 0.0005, + "step": 20811 + }, + { + "epoch": 8.463603090687272, + "grad_norm": 0.04559922696949986, + "learning_rate": 5.735314284979216e-06, + "loss": 0.0003, + "step": 20812 + }, + { + "epoch": 8.464009760065068, + "grad_norm": 1.176129150347073, + "learning_rate": 5.734397875730357e-06, + "loss": 0.0105, + "step": 20813 + }, + { + "epoch": 8.464416429442863, + "grad_norm": 0.5201810590681324, + "learning_rate": 5.7334815102700505e-06, + "loss": 0.0051, + "step": 20814 + }, + { + "epoch": 8.46482309882066, + "grad_norm": 1.6940320397208808, + "learning_rate": 5.7325651886077016e-06, + "loss": 0.0203, + "step": 20815 + }, + { + "epoch": 8.465229768198455, + "grad_norm": 0.4989874195978331, + "learning_rate": 5.731648910752716e-06, + "loss": 0.0026, + "step": 20816 + }, + { + "epoch": 8.46563643757625, + "grad_norm": 0.351871147763111, + "learning_rate": 5.7307326767145074e-06, + "loss": 0.0018, + "step": 20817 + }, + { + "epoch": 8.466043106954046, + "grad_norm": 0.04786287033510944, + "learning_rate": 5.729816486502475e-06, + "loss": 0.0003, + "step": 20818 + }, + { + "epoch": 8.466449776331842, + "grad_norm": 5.747530066251101, + "learning_rate": 5.728900340126027e-06, + "loss": 0.0883, + "step": 20819 + }, + { + "epoch": 8.466856445709638, + "grad_norm": 0.07988360417582605, + "learning_rate": 5.727984237594566e-06, + "loss": 0.0009, + "step": 20820 + }, + { + "epoch": 8.467263115087434, + "grad_norm": 2.6197696962590276, + "learning_rate": 5.727068178917496e-06, + "loss": 0.0292, + "step": 20821 + }, + { + "epoch": 8.46766978446523, + "grad_norm": 0.13551903598822046, + "learning_rate": 5.726152164104223e-06, + "loss": 0.0015, + "step": 20822 + }, + { + "epoch": 8.468076453843025, + "grad_norm": 0.015269286437588472, + "learning_rate": 5.725236193164143e-06, + "loss": 0.0002, + "step": 20823 + }, + { + "epoch": 8.468483123220821, + "grad_norm": 1.2690974733324707, + "learning_rate": 5.724320266106671e-06, + "loss": 0.0195, + "step": 20824 + }, + { + "epoch": 8.468889792598617, + "grad_norm": 4.440886392213738, + "learning_rate": 5.723404382941202e-06, + "loss": 0.0355, + "step": 20825 + }, + { + "epoch": 8.469296461976413, + "grad_norm": 0.07423801557101369, + "learning_rate": 5.7224885436771405e-06, + "loss": 0.0009, + "step": 20826 + }, + { + "epoch": 8.469703131354208, + "grad_norm": 0.17066325107614322, + "learning_rate": 5.721572748323887e-06, + "loss": 0.0014, + "step": 20827 + }, + { + "epoch": 8.470109800732004, + "grad_norm": 2.483487690511664, + "learning_rate": 5.720656996890842e-06, + "loss": 0.0254, + "step": 20828 + }, + { + "epoch": 8.4705164701098, + "grad_norm": 0.022158421899302137, + "learning_rate": 5.719741289387403e-06, + "loss": 0.0003, + "step": 20829 + }, + { + "epoch": 8.470923139487596, + "grad_norm": 0.11981759610999584, + "learning_rate": 5.718825625822977e-06, 
+ "loss": 0.0016, + "step": 20830 + }, + { + "epoch": 8.471329808865393, + "grad_norm": 0.0031317966210586135, + "learning_rate": 5.7179100062069615e-06, + "loss": 0.0, + "step": 20831 + }, + { + "epoch": 8.471736478243189, + "grad_norm": 0.3317827087993663, + "learning_rate": 5.7169944305487545e-06, + "loss": 0.0037, + "step": 20832 + }, + { + "epoch": 8.472143147620985, + "grad_norm": 8.229351860489569, + "learning_rate": 5.716078898857755e-06, + "loss": 0.0716, + "step": 20833 + }, + { + "epoch": 8.47254981699878, + "grad_norm": 0.0032969495619552458, + "learning_rate": 5.7151634111433615e-06, + "loss": 0.0, + "step": 20834 + }, + { + "epoch": 8.472956486376576, + "grad_norm": 1.1319153667990758, + "learning_rate": 5.714247967414973e-06, + "loss": 0.0124, + "step": 20835 + }, + { + "epoch": 8.473363155754372, + "grad_norm": 0.022816320768592364, + "learning_rate": 5.713332567681982e-06, + "loss": 0.0002, + "step": 20836 + }, + { + "epoch": 8.473769825132168, + "grad_norm": 0.3608620084356382, + "learning_rate": 5.712417211953793e-06, + "loss": 0.0034, + "step": 20837 + }, + { + "epoch": 8.474176494509964, + "grad_norm": 0.21592448937760636, + "learning_rate": 5.7115019002398e-06, + "loss": 0.0023, + "step": 20838 + }, + { + "epoch": 8.47458316388776, + "grad_norm": 0.22473294567878274, + "learning_rate": 5.710586632549397e-06, + "loss": 0.0014, + "step": 20839 + }, + { + "epoch": 8.474989833265555, + "grad_norm": 0.3187130696865446, + "learning_rate": 5.709671408891983e-06, + "loss": 0.0029, + "step": 20840 + }, + { + "epoch": 8.47539650264335, + "grad_norm": 1.134454764844831, + "learning_rate": 5.708756229276949e-06, + "loss": 0.011, + "step": 20841 + }, + { + "epoch": 8.475803172021147, + "grad_norm": 0.10906444642220153, + "learning_rate": 5.707841093713689e-06, + "loss": 0.0014, + "step": 20842 + }, + { + "epoch": 8.476209841398942, + "grad_norm": 2.0128241775353364, + "learning_rate": 5.706926002211605e-06, + "loss": 0.0154, + "step": 20843 + }, + { + "epoch": 8.476616510776738, + "grad_norm": 9.695875789315505, + "learning_rate": 5.706010954780086e-06, + "loss": 0.0312, + "step": 20844 + }, + { + "epoch": 8.477023180154534, + "grad_norm": 1.559351364008095, + "learning_rate": 5.705095951428525e-06, + "loss": 0.014, + "step": 20845 + }, + { + "epoch": 8.47742984953233, + "grad_norm": 0.011096111266652005, + "learning_rate": 5.7041809921663175e-06, + "loss": 0.0001, + "step": 20846 + }, + { + "epoch": 8.477836518910125, + "grad_norm": 0.03407656703925492, + "learning_rate": 5.703266077002852e-06, + "loss": 0.0004, + "step": 20847 + }, + { + "epoch": 8.478243188287921, + "grad_norm": 0.8558688152215543, + "learning_rate": 5.702351205947523e-06, + "loss": 0.0052, + "step": 20848 + }, + { + "epoch": 8.478649857665717, + "grad_norm": 0.0042365031597993985, + "learning_rate": 5.701436379009719e-06, + "loss": 0.0, + "step": 20849 + }, + { + "epoch": 8.479056527043515, + "grad_norm": 0.12477807067321889, + "learning_rate": 5.700521596198837e-06, + "loss": 0.0011, + "step": 20850 + }, + { + "epoch": 8.47946319642131, + "grad_norm": 0.37382284302577384, + "learning_rate": 5.699606857524269e-06, + "loss": 0.0044, + "step": 20851 + }, + { + "epoch": 8.479869865799106, + "grad_norm": 0.009523728260509111, + "learning_rate": 5.698692162995397e-06, + "loss": 0.0, + "step": 20852 + }, + { + "epoch": 8.480276535176902, + "grad_norm": 0.01792406870521248, + "learning_rate": 5.697777512621615e-06, + "loss": 0.0002, + "step": 20853 + }, + { + "epoch": 8.480683204554698, + "grad_norm": 
4.997943075845578, + "learning_rate": 5.696862906412308e-06, + "loss": 0.0546, + "step": 20854 + }, + { + "epoch": 8.481089873932493, + "grad_norm": 0.1529638917758838, + "learning_rate": 5.695948344376873e-06, + "loss": 0.0015, + "step": 20855 + }, + { + "epoch": 8.48149654331029, + "grad_norm": 0.023698524754293147, + "learning_rate": 5.695033826524694e-06, + "loss": 0.0002, + "step": 20856 + }, + { + "epoch": 8.481903212688085, + "grad_norm": 10.610144406090086, + "learning_rate": 5.69411935286516e-06, + "loss": 0.1909, + "step": 20857 + }, + { + "epoch": 8.48230988206588, + "grad_norm": 9.063884940871775, + "learning_rate": 5.693204923407657e-06, + "loss": 0.1775, + "step": 20858 + }, + { + "epoch": 8.482716551443676, + "grad_norm": 2.90522754713087, + "learning_rate": 5.692290538161573e-06, + "loss": 0.024, + "step": 20859 + }, + { + "epoch": 8.483123220821472, + "grad_norm": 0.6319937486804345, + "learning_rate": 5.691376197136291e-06, + "loss": 0.0037, + "step": 20860 + }, + { + "epoch": 8.483529890199268, + "grad_norm": 0.005561107113746274, + "learning_rate": 5.690461900341206e-06, + "loss": 0.0001, + "step": 20861 + }, + { + "epoch": 8.483936559577064, + "grad_norm": 1.100556627689189, + "learning_rate": 5.6895476477856985e-06, + "loss": 0.0119, + "step": 20862 + }, + { + "epoch": 8.48434322895486, + "grad_norm": 6.576855528779641, + "learning_rate": 5.688633439479152e-06, + "loss": 0.0859, + "step": 20863 + }, + { + "epoch": 8.484749898332655, + "grad_norm": 0.03756625647585392, + "learning_rate": 5.687719275430954e-06, + "loss": 0.0005, + "step": 20864 + }, + { + "epoch": 8.485156567710451, + "grad_norm": 0.07161344994668331, + "learning_rate": 5.686805155650488e-06, + "loss": 0.0008, + "step": 20865 + }, + { + "epoch": 8.485563237088247, + "grad_norm": 0.001409629030042737, + "learning_rate": 5.685891080147138e-06, + "loss": 0.0, + "step": 20866 + }, + { + "epoch": 8.485969906466043, + "grad_norm": 0.05436930639811656, + "learning_rate": 5.684977048930282e-06, + "loss": 0.0007, + "step": 20867 + }, + { + "epoch": 8.486376575843838, + "grad_norm": 0.015916065595159755, + "learning_rate": 5.684063062009315e-06, + "loss": 0.0002, + "step": 20868 + }, + { + "epoch": 8.486783245221634, + "grad_norm": 0.0008158554694477863, + "learning_rate": 5.683149119393612e-06, + "loss": 0.0, + "step": 20869 + }, + { + "epoch": 8.48718991459943, + "grad_norm": 0.00884478661552684, + "learning_rate": 5.682235221092554e-06, + "loss": 0.0001, + "step": 20870 + }, + { + "epoch": 8.487596583977226, + "grad_norm": 0.2532456192615512, + "learning_rate": 5.681321367115526e-06, + "loss": 0.0031, + "step": 20871 + }, + { + "epoch": 8.488003253355023, + "grad_norm": 9.948844599344156, + "learning_rate": 5.6804075574719075e-06, + "loss": 0.1981, + "step": 20872 + }, + { + "epoch": 8.488409922732819, + "grad_norm": 0.02991196901714365, + "learning_rate": 5.679493792171076e-06, + "loss": 0.0005, + "step": 20873 + }, + { + "epoch": 8.488816592110615, + "grad_norm": 0.017669377403716144, + "learning_rate": 5.6785800712224185e-06, + "loss": 0.0001, + "step": 20874 + }, + { + "epoch": 8.48922326148841, + "grad_norm": 9.586578541239351, + "learning_rate": 5.677666394635311e-06, + "loss": 0.1875, + "step": 20875 + }, + { + "epoch": 8.489629930866206, + "grad_norm": 0.02641114664798883, + "learning_rate": 5.676752762419134e-06, + "loss": 0.0003, + "step": 20876 + }, + { + "epoch": 8.490036600244002, + "grad_norm": 0.34187825817839734, + "learning_rate": 5.675839174583266e-06, + "loss": 0.003, + "step": 20877 + }, + 
{ + "epoch": 8.490443269621798, + "grad_norm": 0.005479610152432773, + "learning_rate": 5.674925631137085e-06, + "loss": 0.0001, + "step": 20878 + }, + { + "epoch": 8.490849938999594, + "grad_norm": 0.01098247048458971, + "learning_rate": 5.674012132089969e-06, + "loss": 0.0002, + "step": 20879 + }, + { + "epoch": 8.49125660837739, + "grad_norm": 0.0031065387492724052, + "learning_rate": 5.673098677451292e-06, + "loss": 0.0, + "step": 20880 + }, + { + "epoch": 8.491663277755185, + "grad_norm": 0.009718582556940131, + "learning_rate": 5.6721852672304375e-06, + "loss": 0.0001, + "step": 20881 + }, + { + "epoch": 8.49206994713298, + "grad_norm": 0.1902090424483435, + "learning_rate": 5.671271901436781e-06, + "loss": 0.0023, + "step": 20882 + }, + { + "epoch": 8.492476616510777, + "grad_norm": 0.037159049890847756, + "learning_rate": 5.670358580079696e-06, + "loss": 0.0003, + "step": 20883 + }, + { + "epoch": 8.492883285888572, + "grad_norm": 0.03335851768282632, + "learning_rate": 5.6694453031685596e-06, + "loss": 0.0005, + "step": 20884 + }, + { + "epoch": 8.493289955266368, + "grad_norm": 0.013680688224409536, + "learning_rate": 5.6685320707127466e-06, + "loss": 0.0001, + "step": 20885 + }, + { + "epoch": 8.493696624644164, + "grad_norm": 1.9783849428255063, + "learning_rate": 5.667618882721628e-06, + "loss": 0.0282, + "step": 20886 + }, + { + "epoch": 8.49410329402196, + "grad_norm": 0.7434641780200351, + "learning_rate": 5.666705739204586e-06, + "loss": 0.0029, + "step": 20887 + }, + { + "epoch": 8.494509963399755, + "grad_norm": 0.005441548897018483, + "learning_rate": 5.665792640170991e-06, + "loss": 0.0001, + "step": 20888 + }, + { + "epoch": 8.494916632777551, + "grad_norm": 4.723889676914372, + "learning_rate": 5.664879585630215e-06, + "loss": 0.0113, + "step": 20889 + }, + { + "epoch": 8.495323302155347, + "grad_norm": 0.9768621442757958, + "learning_rate": 5.663966575591633e-06, + "loss": 0.0089, + "step": 20890 + }, + { + "epoch": 8.495729971533144, + "grad_norm": 1.4234371826788375, + "learning_rate": 5.663053610064615e-06, + "loss": 0.0059, + "step": 20891 + }, + { + "epoch": 8.49613664091094, + "grad_norm": 0.00026454106563836847, + "learning_rate": 5.662140689058535e-06, + "loss": 0.0, + "step": 20892 + }, + { + "epoch": 8.496543310288736, + "grad_norm": 1.5679958655867536, + "learning_rate": 5.66122781258276e-06, + "loss": 0.02, + "step": 20893 + }, + { + "epoch": 8.496949979666532, + "grad_norm": 2.1619595544229613, + "learning_rate": 5.660314980646669e-06, + "loss": 0.0115, + "step": 20894 + }, + { + "epoch": 8.497356649044328, + "grad_norm": 0.08233159575238158, + "learning_rate": 5.6594021932596285e-06, + "loss": 0.0009, + "step": 20895 + }, + { + "epoch": 8.497763318422123, + "grad_norm": 2.2032584406130127, + "learning_rate": 5.658489450431011e-06, + "loss": 0.0146, + "step": 20896 + }, + { + "epoch": 8.498169987799919, + "grad_norm": 0.032266472025930755, + "learning_rate": 5.657576752170182e-06, + "loss": 0.0004, + "step": 20897 + }, + { + "epoch": 8.498576657177715, + "grad_norm": 0.004788721209817755, + "learning_rate": 5.656664098486515e-06, + "loss": 0.0, + "step": 20898 + }, + { + "epoch": 8.49898332655551, + "grad_norm": 1.59360307891796, + "learning_rate": 5.655751489389376e-06, + "loss": 0.0179, + "step": 20899 + }, + { + "epoch": 8.499389995933306, + "grad_norm": 0.059050395346464835, + "learning_rate": 5.654838924888131e-06, + "loss": 0.0008, + "step": 20900 + }, + { + "epoch": 8.499796665311102, + "grad_norm": 5.250731651379526, + "learning_rate": 
5.653926404992154e-06, + "loss": 0.1321, + "step": 20901 + }, + { + "epoch": 8.500203334688898, + "grad_norm": 0.10442640487916224, + "learning_rate": 5.653013929710811e-06, + "loss": 0.0011, + "step": 20902 + }, + { + "epoch": 8.500610004066694, + "grad_norm": 2.240352711882032, + "learning_rate": 5.652101499053467e-06, + "loss": 0.017, + "step": 20903 + }, + { + "epoch": 8.50101667344449, + "grad_norm": 0.8247003630548663, + "learning_rate": 5.6511891130294894e-06, + "loss": 0.0042, + "step": 20904 + }, + { + "epoch": 8.501423342822285, + "grad_norm": 0.004528773095119737, + "learning_rate": 5.650276771648246e-06, + "loss": 0.0001, + "step": 20905 + }, + { + "epoch": 8.501830012200081, + "grad_norm": 0.3150188376502476, + "learning_rate": 5.6493644749190944e-06, + "loss": 0.0043, + "step": 20906 + }, + { + "epoch": 8.502236681577877, + "grad_norm": 1.8861477421693773, + "learning_rate": 5.64845222285141e-06, + "loss": 0.0219, + "step": 20907 + }, + { + "epoch": 8.502643350955672, + "grad_norm": 0.06076650924636122, + "learning_rate": 5.647540015454556e-06, + "loss": 0.0006, + "step": 20908 + }, + { + "epoch": 8.503050020333468, + "grad_norm": 0.1469346290250968, + "learning_rate": 5.646627852737893e-06, + "loss": 0.0012, + "step": 20909 + }, + { + "epoch": 8.503456689711264, + "grad_norm": 0.06642078160978496, + "learning_rate": 5.645715734710786e-06, + "loss": 0.0005, + "step": 20910 + }, + { + "epoch": 8.50386335908906, + "grad_norm": 0.38948406539248903, + "learning_rate": 5.644803661382598e-06, + "loss": 0.0039, + "step": 20911 + }, + { + "epoch": 8.504270028466856, + "grad_norm": 0.01864341973432317, + "learning_rate": 5.643891632762693e-06, + "loss": 0.0002, + "step": 20912 + }, + { + "epoch": 8.504676697844653, + "grad_norm": 0.021874170769171277, + "learning_rate": 5.6429796488604295e-06, + "loss": 0.0002, + "step": 20913 + }, + { + "epoch": 8.505083367222449, + "grad_norm": 0.0012555678687801277, + "learning_rate": 5.642067709685175e-06, + "loss": 0.0, + "step": 20914 + }, + { + "epoch": 8.505490036600245, + "grad_norm": 0.0168614541740693, + "learning_rate": 5.64115581524629e-06, + "loss": 0.0001, + "step": 20915 + }, + { + "epoch": 8.50589670597804, + "grad_norm": 0.6472542905393329, + "learning_rate": 5.640243965553134e-06, + "loss": 0.0061, + "step": 20916 + }, + { + "epoch": 8.506303375355836, + "grad_norm": 0.40285350745021775, + "learning_rate": 5.639332160615067e-06, + "loss": 0.004, + "step": 20917 + }, + { + "epoch": 8.506710044733632, + "grad_norm": 0.21510683942449316, + "learning_rate": 5.638420400441451e-06, + "loss": 0.0029, + "step": 20918 + }, + { + "epoch": 8.507116714111428, + "grad_norm": 0.011065835179806317, + "learning_rate": 5.637508685041642e-06, + "loss": 0.0002, + "step": 20919 + }, + { + "epoch": 8.507523383489223, + "grad_norm": 10.529436759040765, + "learning_rate": 5.636597014425003e-06, + "loss": 0.0566, + "step": 20920 + }, + { + "epoch": 8.50793005286702, + "grad_norm": 0.014992775418964268, + "learning_rate": 5.635685388600894e-06, + "loss": 0.0001, + "step": 20921 + }, + { + "epoch": 8.508336722244815, + "grad_norm": 0.07496711940664218, + "learning_rate": 5.63477380757867e-06, + "loss": 0.0007, + "step": 20922 + }, + { + "epoch": 8.50874339162261, + "grad_norm": 0.09054864054680684, + "learning_rate": 5.633862271367689e-06, + "loss": 0.0008, + "step": 20923 + }, + { + "epoch": 8.509150061000406, + "grad_norm": 0.053761616730482444, + "learning_rate": 5.632950779977311e-06, + "loss": 0.0006, + "step": 20924 + }, + { + "epoch": 
8.509556730378202, + "grad_norm": 0.015490260553984373, + "learning_rate": 5.63203933341689e-06, + "loss": 0.0001, + "step": 20925 + }, + { + "epoch": 8.509963399755998, + "grad_norm": 0.11223554730428006, + "learning_rate": 5.631127931695782e-06, + "loss": 0.0013, + "step": 20926 + }, + { + "epoch": 8.510370069133794, + "grad_norm": 0.061825201427984124, + "learning_rate": 5.630216574823346e-06, + "loss": 0.0005, + "step": 20927 + }, + { + "epoch": 8.51077673851159, + "grad_norm": 8.801357550954686, + "learning_rate": 5.629305262808936e-06, + "loss": 0.1765, + "step": 20928 + }, + { + "epoch": 8.511183407889385, + "grad_norm": 0.02180419654679778, + "learning_rate": 5.6283939956619085e-06, + "loss": 0.0001, + "step": 20929 + }, + { + "epoch": 8.511590077267181, + "grad_norm": 0.08844624493454414, + "learning_rate": 5.627482773391616e-06, + "loss": 0.001, + "step": 20930 + }, + { + "epoch": 8.511996746644977, + "grad_norm": 0.0036739862599295763, + "learning_rate": 5.626571596007413e-06, + "loss": 0.0, + "step": 20931 + }, + { + "epoch": 8.512403416022774, + "grad_norm": 0.0006146882600517627, + "learning_rate": 5.625660463518651e-06, + "loss": 0.0, + "step": 20932 + }, + { + "epoch": 8.51281008540057, + "grad_norm": 0.026118928157020518, + "learning_rate": 5.62474937593469e-06, + "loss": 0.0003, + "step": 20933 + }, + { + "epoch": 8.513216754778366, + "grad_norm": 0.011791778529367813, + "learning_rate": 5.623838333264877e-06, + "loss": 0.0001, + "step": 20934 + }, + { + "epoch": 8.513623424156162, + "grad_norm": 0.019683074335695397, + "learning_rate": 5.622927335518567e-06, + "loss": 0.0002, + "step": 20935 + }, + { + "epoch": 8.514030093533957, + "grad_norm": 0.5729290417025624, + "learning_rate": 5.622016382705111e-06, + "loss": 0.0067, + "step": 20936 + }, + { + "epoch": 8.514436762911753, + "grad_norm": 0.11243929306743257, + "learning_rate": 5.62110547483386e-06, + "loss": 0.0012, + "step": 20937 + }, + { + "epoch": 8.514843432289549, + "grad_norm": 0.21420949558204283, + "learning_rate": 5.6201946119141646e-06, + "loss": 0.0025, + "step": 20938 + }, + { + "epoch": 8.515250101667345, + "grad_norm": 1.704273900359819, + "learning_rate": 5.619283793955372e-06, + "loss": 0.0165, + "step": 20939 + }, + { + "epoch": 8.51565677104514, + "grad_norm": 1.0406153164728198, + "learning_rate": 5.618373020966841e-06, + "loss": 0.0065, + "step": 20940 + }, + { + "epoch": 8.516063440422936, + "grad_norm": 0.0028867931385953604, + "learning_rate": 5.6174622929579155e-06, + "loss": 0.0, + "step": 20941 + }, + { + "epoch": 8.516470109800732, + "grad_norm": 6.420657475812777, + "learning_rate": 5.616551609937946e-06, + "loss": 0.2152, + "step": 20942 + }, + { + "epoch": 8.516876779178528, + "grad_norm": 0.055107297589281325, + "learning_rate": 5.615640971916279e-06, + "loss": 0.0006, + "step": 20943 + }, + { + "epoch": 8.517283448556324, + "grad_norm": 0.04029613473517507, + "learning_rate": 5.614730378902266e-06, + "loss": 0.0002, + "step": 20944 + }, + { + "epoch": 8.51769011793412, + "grad_norm": 0.3637084763689031, + "learning_rate": 5.6138198309052475e-06, + "loss": 0.004, + "step": 20945 + }, + { + "epoch": 8.518096787311915, + "grad_norm": 0.8376142841143893, + "learning_rate": 5.6129093279345794e-06, + "loss": 0.0057, + "step": 20946 + }, + { + "epoch": 8.51850345668971, + "grad_norm": 8.024570239337972, + "learning_rate": 5.6119988699996054e-06, + "loss": 0.0529, + "step": 20947 + }, + { + "epoch": 8.518910126067507, + "grad_norm": 0.04097355974183583, + "learning_rate": 
5.611088457109672e-06, + "loss": 0.0003, + "step": 20948 + }, + { + "epoch": 8.519316795445302, + "grad_norm": 0.025948486404243108, + "learning_rate": 5.610178089274123e-06, + "loss": 0.0002, + "step": 20949 + }, + { + "epoch": 8.519723464823098, + "grad_norm": 0.11186463512508375, + "learning_rate": 5.609267766502307e-06, + "loss": 0.0017, + "step": 20950 + }, + { + "epoch": 8.520130134200894, + "grad_norm": 0.05825554105330423, + "learning_rate": 5.608357488803566e-06, + "loss": 0.0005, + "step": 20951 + }, + { + "epoch": 8.52053680357869, + "grad_norm": 0.011344503059150189, + "learning_rate": 5.607447256187246e-06, + "loss": 0.0001, + "step": 20952 + }, + { + "epoch": 8.520943472956485, + "grad_norm": 3.052056124335902, + "learning_rate": 5.606537068662692e-06, + "loss": 0.0137, + "step": 20953 + }, + { + "epoch": 8.521350142334283, + "grad_norm": 6.015519749352329, + "learning_rate": 5.605626926239245e-06, + "loss": 0.1266, + "step": 20954 + }, + { + "epoch": 8.521756811712079, + "grad_norm": 0.12771221791057086, + "learning_rate": 5.604716828926249e-06, + "loss": 0.0016, + "step": 20955 + }, + { + "epoch": 8.522163481089875, + "grad_norm": 0.006278687296327227, + "learning_rate": 5.603806776733047e-06, + "loss": 0.0001, + "step": 20956 + }, + { + "epoch": 8.52257015046767, + "grad_norm": 0.017717422205729905, + "learning_rate": 5.602896769668977e-06, + "loss": 0.0003, + "step": 20957 + }, + { + "epoch": 8.522976819845466, + "grad_norm": 0.014295021672808626, + "learning_rate": 5.601986807743388e-06, + "loss": 0.0001, + "step": 20958 + }, + { + "epoch": 8.523383489223262, + "grad_norm": 4.53683545406084, + "learning_rate": 5.601076890965619e-06, + "loss": 0.0473, + "step": 20959 + }, + { + "epoch": 8.523790158601058, + "grad_norm": 0.004050380028618661, + "learning_rate": 5.6001670193450085e-06, + "loss": 0.0001, + "step": 20960 + }, + { + "epoch": 8.524196827978853, + "grad_norm": 0.000939032234143273, + "learning_rate": 5.599257192890899e-06, + "loss": 0.0, + "step": 20961 + }, + { + "epoch": 8.52460349735665, + "grad_norm": 0.3401161643121529, + "learning_rate": 5.598347411612629e-06, + "loss": 0.0036, + "step": 20962 + }, + { + "epoch": 8.525010166734445, + "grad_norm": 0.4095399803241775, + "learning_rate": 5.597437675519535e-06, + "loss": 0.0034, + "step": 20963 + }, + { + "epoch": 8.52541683611224, + "grad_norm": 0.0028859477868478905, + "learning_rate": 5.596527984620961e-06, + "loss": 0.0, + "step": 20964 + }, + { + "epoch": 8.525823505490036, + "grad_norm": 0.003773020891443374, + "learning_rate": 5.5956183389262454e-06, + "loss": 0.0, + "step": 20965 + }, + { + "epoch": 8.526230174867832, + "grad_norm": 0.0011105726411143192, + "learning_rate": 5.594708738444723e-06, + "loss": 0.0, + "step": 20966 + }, + { + "epoch": 8.526636844245628, + "grad_norm": 4.575487447814331, + "learning_rate": 5.593799183185733e-06, + "loss": 0.1123, + "step": 20967 + }, + { + "epoch": 8.527043513623424, + "grad_norm": 7.607684894319336, + "learning_rate": 5.592889673158612e-06, + "loss": 0.1125, + "step": 20968 + }, + { + "epoch": 8.52745018300122, + "grad_norm": 12.21103329416531, + "learning_rate": 5.5919802083726955e-06, + "loss": 0.3373, + "step": 20969 + }, + { + "epoch": 8.527856852379015, + "grad_norm": 0.03638845747787702, + "learning_rate": 5.591070788837318e-06, + "loss": 0.0002, + "step": 20970 + }, + { + "epoch": 8.528263521756811, + "grad_norm": 0.06172126082302961, + "learning_rate": 5.590161414561822e-06, + "loss": 0.0009, + "step": 20971 + }, + { + "epoch": 8.528670191134607, + 
"grad_norm": 0.018143864933069537, + "learning_rate": 5.589252085555537e-06, + "loss": 0.0002, + "step": 20972 + }, + { + "epoch": 8.529076860512404, + "grad_norm": 0.4792652676015167, + "learning_rate": 5.588342801827802e-06, + "loss": 0.0082, + "step": 20973 + }, + { + "epoch": 8.5294835298902, + "grad_norm": 0.34261754028532776, + "learning_rate": 5.587433563387945e-06, + "loss": 0.0028, + "step": 20974 + }, + { + "epoch": 8.529890199267996, + "grad_norm": 3.985330086659319, + "learning_rate": 5.586524370245306e-06, + "loss": 0.017, + "step": 20975 + }, + { + "epoch": 8.530296868645792, + "grad_norm": 0.00905461633679145, + "learning_rate": 5.585615222409211e-06, + "loss": 0.0001, + "step": 20976 + }, + { + "epoch": 8.530703538023587, + "grad_norm": 0.004467118300913028, + "learning_rate": 5.584706119889001e-06, + "loss": 0.0001, + "step": 20977 + }, + { + "epoch": 8.531110207401383, + "grad_norm": 5.917051858895426, + "learning_rate": 5.583797062694004e-06, + "loss": 0.1492, + "step": 20978 + }, + { + "epoch": 8.531516876779179, + "grad_norm": 0.02678452355336427, + "learning_rate": 5.582888050833553e-06, + "loss": 0.0004, + "step": 20979 + }, + { + "epoch": 8.531923546156975, + "grad_norm": 16.963611757436013, + "learning_rate": 5.581979084316981e-06, + "loss": 0.6414, + "step": 20980 + }, + { + "epoch": 8.53233021553477, + "grad_norm": 0.794882159191407, + "learning_rate": 5.5810701631536155e-06, + "loss": 0.0055, + "step": 20981 + }, + { + "epoch": 8.532736884912566, + "grad_norm": 0.006354845131015999, + "learning_rate": 5.58016128735279e-06, + "loss": 0.0001, + "step": 20982 + }, + { + "epoch": 8.533143554290362, + "grad_norm": 0.45100669875859395, + "learning_rate": 5.579252456923828e-06, + "loss": 0.0038, + "step": 20983 + }, + { + "epoch": 8.533550223668158, + "grad_norm": 0.0010325019251457915, + "learning_rate": 5.578343671876069e-06, + "loss": 0.0, + "step": 20984 + }, + { + "epoch": 8.533956893045954, + "grad_norm": 0.006511690292297577, + "learning_rate": 5.577434932218838e-06, + "loss": 0.0001, + "step": 20985 + }, + { + "epoch": 8.53436356242375, + "grad_norm": 0.35525637768073937, + "learning_rate": 5.576526237961464e-06, + "loss": 0.0039, + "step": 20986 + }, + { + "epoch": 8.534770231801545, + "grad_norm": 0.005051295715965351, + "learning_rate": 5.5756175891132735e-06, + "loss": 0.0001, + "step": 20987 + }, + { + "epoch": 8.53517690117934, + "grad_norm": 0.03323618338554523, + "learning_rate": 5.574708985683595e-06, + "loss": 0.0003, + "step": 20988 + }, + { + "epoch": 8.535583570557137, + "grad_norm": 1.3240732424084223, + "learning_rate": 5.573800427681751e-06, + "loss": 0.0126, + "step": 20989 + }, + { + "epoch": 8.535990239934932, + "grad_norm": 0.05000916757723758, + "learning_rate": 5.5728919151170795e-06, + "loss": 0.0005, + "step": 20990 + }, + { + "epoch": 8.536396909312728, + "grad_norm": 0.11127907998966248, + "learning_rate": 5.571983447998899e-06, + "loss": 0.0009, + "step": 20991 + }, + { + "epoch": 8.536803578690524, + "grad_norm": 1.6350082416251057, + "learning_rate": 5.571075026336538e-06, + "loss": 0.0112, + "step": 20992 + }, + { + "epoch": 8.53721024806832, + "grad_norm": 0.008913136016689435, + "learning_rate": 5.5701666501393195e-06, + "loss": 0.0001, + "step": 20993 + }, + { + "epoch": 8.537616917446115, + "grad_norm": 0.025454700225308317, + "learning_rate": 5.56925831941657e-06, + "loss": 0.0003, + "step": 20994 + }, + { + "epoch": 8.538023586823913, + "grad_norm": 0.12002919542730374, + "learning_rate": 5.568350034177616e-06, + "loss": 
0.0016, + "step": 20995 + }, + { + "epoch": 8.538430256201709, + "grad_norm": 0.2184728293954479, + "learning_rate": 5.567441794431773e-06, + "loss": 0.0024, + "step": 20996 + }, + { + "epoch": 8.538836925579504, + "grad_norm": 0.939463831469779, + "learning_rate": 5.566533600188375e-06, + "loss": 0.0063, + "step": 20997 + }, + { + "epoch": 8.5392435949573, + "grad_norm": 0.05765711366445635, + "learning_rate": 5.565625451456742e-06, + "loss": 0.0006, + "step": 20998 + }, + { + "epoch": 8.539650264335096, + "grad_norm": 0.1991769231194303, + "learning_rate": 5.564717348246196e-06, + "loss": 0.0013, + "step": 20999 + }, + { + "epoch": 8.540056933712892, + "grad_norm": 0.20529618001683064, + "learning_rate": 5.563809290566058e-06, + "loss": 0.0011, + "step": 21000 + }, + { + "epoch": 8.540463603090688, + "grad_norm": 9.295784673145947, + "learning_rate": 5.56290127842565e-06, + "loss": 0.204, + "step": 21001 + }, + { + "epoch": 8.540870272468483, + "grad_norm": 6.236331176876537, + "learning_rate": 5.561993311834293e-06, + "loss": 0.075, + "step": 21002 + }, + { + "epoch": 8.541276941846279, + "grad_norm": 0.011100222173844666, + "learning_rate": 5.561085390801304e-06, + "loss": 0.0001, + "step": 21003 + }, + { + "epoch": 8.541683611224075, + "grad_norm": 0.018351218576588186, + "learning_rate": 5.5601775153360125e-06, + "loss": 0.0001, + "step": 21004 + }, + { + "epoch": 8.54209028060187, + "grad_norm": 0.0004392653150752107, + "learning_rate": 5.5592696854477335e-06, + "loss": 0.0, + "step": 21005 + }, + { + "epoch": 8.542496949979666, + "grad_norm": 2.167531246854625, + "learning_rate": 5.558361901145786e-06, + "loss": 0.0218, + "step": 21006 + }, + { + "epoch": 8.542903619357462, + "grad_norm": 0.008458582502367846, + "learning_rate": 5.557454162439488e-06, + "loss": 0.0001, + "step": 21007 + }, + { + "epoch": 8.543310288735258, + "grad_norm": 5.672289977314636, + "learning_rate": 5.55654646933816e-06, + "loss": 0.1682, + "step": 21008 + }, + { + "epoch": 8.543716958113054, + "grad_norm": 2.3871925769152957, + "learning_rate": 5.555638821851115e-06, + "loss": 0.0623, + "step": 21009 + }, + { + "epoch": 8.54412362749085, + "grad_norm": 0.12409280725252948, + "learning_rate": 5.554731219987678e-06, + "loss": 0.0012, + "step": 21010 + }, + { + "epoch": 8.544530296868645, + "grad_norm": 0.04557823015187463, + "learning_rate": 5.553823663757162e-06, + "loss": 0.0004, + "step": 21011 + }, + { + "epoch": 8.544936966246441, + "grad_norm": 2.8950972108734634, + "learning_rate": 5.552916153168883e-06, + "loss": 0.0322, + "step": 21012 + }, + { + "epoch": 8.545343635624237, + "grad_norm": 9.54019530722191, + "learning_rate": 5.5520086882321596e-06, + "loss": 0.0852, + "step": 21013 + }, + { + "epoch": 8.545750305002034, + "grad_norm": 0.015557605191555638, + "learning_rate": 5.5511012689563045e-06, + "loss": 0.0002, + "step": 21014 + }, + { + "epoch": 8.54615697437983, + "grad_norm": 0.46811871593730886, + "learning_rate": 5.5501938953506354e-06, + "loss": 0.0053, + "step": 21015 + }, + { + "epoch": 8.546563643757626, + "grad_norm": 0.01386472139157828, + "learning_rate": 5.54928656742446e-06, + "loss": 0.0001, + "step": 21016 + }, + { + "epoch": 8.546970313135422, + "grad_norm": 0.05088983694459909, + "learning_rate": 5.548379285187102e-06, + "loss": 0.0005, + "step": 21017 + }, + { + "epoch": 8.547376982513217, + "grad_norm": 3.6461639120455493, + "learning_rate": 5.547472048647872e-06, + "loss": 0.0359, + "step": 21018 + }, + { + "epoch": 8.547783651891013, + "grad_norm": 0.5562600686208771, + 
"learning_rate": 5.546564857816081e-06, + "loss": 0.0052, + "step": 21019 + }, + { + "epoch": 8.548190321268809, + "grad_norm": 0.014951622538323033, + "learning_rate": 5.545657712701044e-06, + "loss": 0.0001, + "step": 21020 + }, + { + "epoch": 8.548596990646605, + "grad_norm": 0.026528101863846455, + "learning_rate": 5.544750613312073e-06, + "loss": 0.0003, + "step": 21021 + }, + { + "epoch": 8.5490036600244, + "grad_norm": 0.015262174606062528, + "learning_rate": 5.543843559658475e-06, + "loss": 0.0002, + "step": 21022 + }, + { + "epoch": 8.549410329402196, + "grad_norm": 0.16334998603803308, + "learning_rate": 5.542936551749568e-06, + "loss": 0.0019, + "step": 21023 + }, + { + "epoch": 8.549816998779992, + "grad_norm": 18.51015754808864, + "learning_rate": 5.542029589594662e-06, + "loss": 0.3196, + "step": 21024 + }, + { + "epoch": 8.550223668157788, + "grad_norm": 0.001903261925200116, + "learning_rate": 5.541122673203066e-06, + "loss": 0.0, + "step": 21025 + }, + { + "epoch": 8.550630337535583, + "grad_norm": 1.611679386230513, + "learning_rate": 5.540215802584089e-06, + "loss": 0.0215, + "step": 21026 + }, + { + "epoch": 8.55103700691338, + "grad_norm": 0.21477966291689435, + "learning_rate": 5.539308977747042e-06, + "loss": 0.0029, + "step": 21027 + }, + { + "epoch": 8.551443676291175, + "grad_norm": 1.402262364469185, + "learning_rate": 5.538402198701232e-06, + "loss": 0.0112, + "step": 21028 + }, + { + "epoch": 8.55185034566897, + "grad_norm": 1.2585843675173017e-05, + "learning_rate": 5.537495465455966e-06, + "loss": 0.0, + "step": 21029 + }, + { + "epoch": 8.552257015046766, + "grad_norm": 0.0050514576710732435, + "learning_rate": 5.536588778020559e-06, + "loss": 0.0001, + "step": 21030 + }, + { + "epoch": 8.552663684424562, + "grad_norm": 0.3284536370860346, + "learning_rate": 5.535682136404316e-06, + "loss": 0.0026, + "step": 21031 + }, + { + "epoch": 8.553070353802358, + "grad_norm": 1.2919924123464512, + "learning_rate": 5.5347755406165414e-06, + "loss": 0.0179, + "step": 21032 + }, + { + "epoch": 8.553477023180154, + "grad_norm": 12.99584014181909, + "learning_rate": 5.533868990666542e-06, + "loss": 0.032, + "step": 21033 + }, + { + "epoch": 8.55388369255795, + "grad_norm": 3.6991482598810204, + "learning_rate": 5.532962486563627e-06, + "loss": 0.0348, + "step": 21034 + }, + { + "epoch": 8.554290361935745, + "grad_norm": 0.016053718464920565, + "learning_rate": 5.532056028317095e-06, + "loss": 0.0002, + "step": 21035 + }, + { + "epoch": 8.554697031313543, + "grad_norm": 0.6207400547630033, + "learning_rate": 5.531149615936262e-06, + "loss": 0.0037, + "step": 21036 + }, + { + "epoch": 8.555103700691339, + "grad_norm": 0.24670624143664785, + "learning_rate": 5.530243249430426e-06, + "loss": 0.0028, + "step": 21037 + }, + { + "epoch": 8.555510370069134, + "grad_norm": 2.1317719196212757, + "learning_rate": 5.529336928808893e-06, + "loss": 0.0267, + "step": 21038 + }, + { + "epoch": 8.55591703944693, + "grad_norm": 0.07371067798708521, + "learning_rate": 5.528430654080966e-06, + "loss": 0.0007, + "step": 21039 + }, + { + "epoch": 8.556323708824726, + "grad_norm": 0.05639144184819865, + "learning_rate": 5.5275244252559505e-06, + "loss": 0.0005, + "step": 21040 + }, + { + "epoch": 8.556730378202522, + "grad_norm": 0.09873900079784713, + "learning_rate": 5.526618242343146e-06, + "loss": 0.0008, + "step": 21041 + }, + { + "epoch": 8.557137047580317, + "grad_norm": 0.020099104118131792, + "learning_rate": 5.525712105351853e-06, + "loss": 0.0002, + "step": 21042 + }, + { + "epoch": 
8.557543716958113, + "grad_norm": 1.056208710328117, + "learning_rate": 5.524806014291382e-06, + "loss": 0.0129, + "step": 21043 + }, + { + "epoch": 8.557950386335909, + "grad_norm": 0.01605018166503721, + "learning_rate": 5.523899969171029e-06, + "loss": 0.0001, + "step": 21044 + }, + { + "epoch": 8.558357055713705, + "grad_norm": 0.01588513825238456, + "learning_rate": 5.522993970000094e-06, + "loss": 0.0001, + "step": 21045 + }, + { + "epoch": 8.5587637250915, + "grad_norm": 0.00025317266459298174, + "learning_rate": 5.522088016787881e-06, + "loss": 0.0, + "step": 21046 + }, + { + "epoch": 8.559170394469296, + "grad_norm": 0.6540763928305546, + "learning_rate": 5.5211821095436876e-06, + "loss": 0.0059, + "step": 21047 + }, + { + "epoch": 8.559577063847092, + "grad_norm": 2.647817559027255, + "learning_rate": 5.52027624827681e-06, + "loss": 0.0279, + "step": 21048 + }, + { + "epoch": 8.559983733224888, + "grad_norm": 6.40077112363063, + "learning_rate": 5.519370432996556e-06, + "loss": 0.2293, + "step": 21049 + }, + { + "epoch": 8.560390402602684, + "grad_norm": 4.548488261411509, + "learning_rate": 5.518464663712223e-06, + "loss": 0.0297, + "step": 21050 + }, + { + "epoch": 8.56079707198048, + "grad_norm": 0.7119465153856016, + "learning_rate": 5.517558940433102e-06, + "loss": 0.0067, + "step": 21051 + }, + { + "epoch": 8.561203741358275, + "grad_norm": 0.012410763555914563, + "learning_rate": 5.516653263168495e-06, + "loss": 0.0001, + "step": 21052 + }, + { + "epoch": 8.56161041073607, + "grad_norm": 0.032398913127718816, + "learning_rate": 5.515747631927693e-06, + "loss": 0.0005, + "step": 21053 + }, + { + "epoch": 8.562017080113867, + "grad_norm": 0.025285882037553407, + "learning_rate": 5.514842046720004e-06, + "loss": 0.0003, + "step": 21054 + }, + { + "epoch": 8.562423749491664, + "grad_norm": 7.900023387252996, + "learning_rate": 5.513936507554718e-06, + "loss": 0.1301, + "step": 21055 + }, + { + "epoch": 8.56283041886946, + "grad_norm": 0.44022409573902893, + "learning_rate": 5.51303101444113e-06, + "loss": 0.0031, + "step": 21056 + }, + { + "epoch": 8.563237088247256, + "grad_norm": 0.035686956021074315, + "learning_rate": 5.51212556738854e-06, + "loss": 0.0002, + "step": 21057 + }, + { + "epoch": 8.563643757625051, + "grad_norm": 6.128999943036823, + "learning_rate": 5.511220166406237e-06, + "loss": 0.1654, + "step": 21058 + }, + { + "epoch": 8.564050427002847, + "grad_norm": 0.0010209015337168952, + "learning_rate": 5.51031481150352e-06, + "loss": 0.0, + "step": 21059 + }, + { + "epoch": 8.564457096380643, + "grad_norm": 0.13116598422490688, + "learning_rate": 5.509409502689676e-06, + "loss": 0.0014, + "step": 21060 + }, + { + "epoch": 8.564863765758439, + "grad_norm": 0.8071106970960567, + "learning_rate": 5.508504239974009e-06, + "loss": 0.009, + "step": 21061 + }, + { + "epoch": 8.565270435136235, + "grad_norm": 0.007146813525417925, + "learning_rate": 5.507599023365805e-06, + "loss": 0.0001, + "step": 21062 + }, + { + "epoch": 8.56567710451403, + "grad_norm": 2.5960696709140683, + "learning_rate": 5.5066938528743585e-06, + "loss": 0.0265, + "step": 21063 + }, + { + "epoch": 8.566083773891826, + "grad_norm": 0.013357537330388544, + "learning_rate": 5.5057887285089606e-06, + "loss": 0.0002, + "step": 21064 + }, + { + "epoch": 8.566490443269622, + "grad_norm": 0.04619227559806856, + "learning_rate": 5.504883650278904e-06, + "loss": 0.0004, + "step": 21065 + }, + { + "epoch": 8.566897112647418, + "grad_norm": 0.2550815980582259, + "learning_rate": 5.503978618193475e-06, + 
"loss": 0.001, + "step": 21066 + }, + { + "epoch": 8.567303782025213, + "grad_norm": 0.01168463719146468, + "learning_rate": 5.503073632261971e-06, + "loss": 0.0002, + "step": 21067 + }, + { + "epoch": 8.56771045140301, + "grad_norm": 0.1346684118318616, + "learning_rate": 5.502168692493681e-06, + "loss": 0.0017, + "step": 21068 + }, + { + "epoch": 8.568117120780805, + "grad_norm": 0.89305103418084, + "learning_rate": 5.501263798897893e-06, + "loss": 0.0071, + "step": 21069 + }, + { + "epoch": 8.5685237901586, + "grad_norm": 11.772732455137774, + "learning_rate": 5.500358951483895e-06, + "loss": 0.2676, + "step": 21070 + }, + { + "epoch": 8.568930459536396, + "grad_norm": 0.0002635275222955491, + "learning_rate": 5.499454150260979e-06, + "loss": 0.0, + "step": 21071 + }, + { + "epoch": 8.569337128914192, + "grad_norm": 0.008562966094277227, + "learning_rate": 5.498549395238429e-06, + "loss": 0.0001, + "step": 21072 + }, + { + "epoch": 8.569743798291988, + "grad_norm": 0.01957450217292327, + "learning_rate": 5.497644686425532e-06, + "loss": 0.0002, + "step": 21073 + }, + { + "epoch": 8.570150467669784, + "grad_norm": 4.5601987408739415, + "learning_rate": 5.4967400238315825e-06, + "loss": 0.0538, + "step": 21074 + }, + { + "epoch": 8.57055713704758, + "grad_norm": 0.5123520779916703, + "learning_rate": 5.495835407465864e-06, + "loss": 0.005, + "step": 21075 + }, + { + "epoch": 8.570963806425375, + "grad_norm": 8.07198278073273, + "learning_rate": 5.49493083733766e-06, + "loss": 0.22, + "step": 21076 + }, + { + "epoch": 8.571370475803173, + "grad_norm": 0.04291972541035799, + "learning_rate": 5.49402631345626e-06, + "loss": 0.0003, + "step": 21077 + }, + { + "epoch": 8.571777145180969, + "grad_norm": 12.45722115863245, + "learning_rate": 5.493121835830946e-06, + "loss": 0.179, + "step": 21078 + }, + { + "epoch": 8.572183814558764, + "grad_norm": 1.125139070900598, + "learning_rate": 5.492217404471001e-06, + "loss": 0.0094, + "step": 21079 + }, + { + "epoch": 8.57259048393656, + "grad_norm": 0.03497574262359294, + "learning_rate": 5.491313019385718e-06, + "loss": 0.0003, + "step": 21080 + }, + { + "epoch": 8.572997153314356, + "grad_norm": 0.014561699727049023, + "learning_rate": 5.490408680584376e-06, + "loss": 0.0001, + "step": 21081 + }, + { + "epoch": 8.573403822692152, + "grad_norm": 0.2005122201529511, + "learning_rate": 5.4895043880762575e-06, + "loss": 0.0018, + "step": 21082 + }, + { + "epoch": 8.573810492069947, + "grad_norm": 2.2632929873236214, + "learning_rate": 5.488600141870648e-06, + "loss": 0.0412, + "step": 21083 + }, + { + "epoch": 8.574217161447743, + "grad_norm": 0.023477099602062148, + "learning_rate": 5.487695941976828e-06, + "loss": 0.0003, + "step": 21084 + }, + { + "epoch": 8.574623830825539, + "grad_norm": 7.652692563644338, + "learning_rate": 5.486791788404079e-06, + "loss": 0.1985, + "step": 21085 + }, + { + "epoch": 8.575030500203335, + "grad_norm": 0.0030906091344928065, + "learning_rate": 5.485887681161682e-06, + "loss": 0.0, + "step": 21086 + }, + { + "epoch": 8.57543716958113, + "grad_norm": 0.03729185562837781, + "learning_rate": 5.484983620258921e-06, + "loss": 0.0005, + "step": 21087 + }, + { + "epoch": 8.575843838958926, + "grad_norm": 7.960717709781024, + "learning_rate": 5.484079605705077e-06, + "loss": 0.3582, + "step": 21088 + }, + { + "epoch": 8.576250508336722, + "grad_norm": 0.071475054914987, + "learning_rate": 5.483175637509428e-06, + "loss": 0.0008, + "step": 21089 + }, + { + "epoch": 8.576657177714518, + "grad_norm": 0.032051613405040515, + 
"learning_rate": 5.482271715681256e-06, + "loss": 0.0005, + "step": 21090 + }, + { + "epoch": 8.577063847092314, + "grad_norm": 1.1332885014453407, + "learning_rate": 5.481367840229836e-06, + "loss": 0.0103, + "step": 21091 + }, + { + "epoch": 8.57747051647011, + "grad_norm": 0.0025457099309007035, + "learning_rate": 5.480464011164448e-06, + "loss": 0.0, + "step": 21092 + }, + { + "epoch": 8.577877185847905, + "grad_norm": 0.025814157537064553, + "learning_rate": 5.4795602284943735e-06, + "loss": 0.0003, + "step": 21093 + }, + { + "epoch": 8.5782838552257, + "grad_norm": 3.4477535516103646, + "learning_rate": 5.478656492228891e-06, + "loss": 0.0633, + "step": 21094 + }, + { + "epoch": 8.578690524603497, + "grad_norm": 0.05216518779821795, + "learning_rate": 5.477752802377272e-06, + "loss": 0.0006, + "step": 21095 + }, + { + "epoch": 8.579097193981294, + "grad_norm": 3.0390614675677, + "learning_rate": 5.4768491589487985e-06, + "loss": 0.0318, + "step": 21096 + }, + { + "epoch": 8.57950386335909, + "grad_norm": 1.226327386116952, + "learning_rate": 5.475945561952744e-06, + "loss": 0.0063, + "step": 21097 + }, + { + "epoch": 8.579910532736886, + "grad_norm": 0.0609726453873487, + "learning_rate": 5.475042011398386e-06, + "loss": 0.0006, + "step": 21098 + }, + { + "epoch": 8.580317202114681, + "grad_norm": 0.9383196255441705, + "learning_rate": 5.474138507294995e-06, + "loss": 0.0066, + "step": 21099 + }, + { + "epoch": 8.580723871492477, + "grad_norm": 4.836800828090027, + "learning_rate": 5.473235049651853e-06, + "loss": 0.0538, + "step": 21100 + }, + { + "epoch": 8.581130540870273, + "grad_norm": 0.030954671742622494, + "learning_rate": 5.472331638478232e-06, + "loss": 0.0003, + "step": 21101 + }, + { + "epoch": 8.581537210248069, + "grad_norm": 0.19602576251305262, + "learning_rate": 5.471428273783406e-06, + "loss": 0.0021, + "step": 21102 + }, + { + "epoch": 8.581943879625864, + "grad_norm": 0.005892258458503837, + "learning_rate": 5.470524955576647e-06, + "loss": 0.0001, + "step": 21103 + }, + { + "epoch": 8.58235054900366, + "grad_norm": 0.6375006081643079, + "learning_rate": 5.46962168386723e-06, + "loss": 0.0056, + "step": 21104 + }, + { + "epoch": 8.582757218381456, + "grad_norm": 41.45483733090403, + "learning_rate": 5.468718458664422e-06, + "loss": 0.13, + "step": 21105 + }, + { + "epoch": 8.583163887759252, + "grad_norm": 4.400529599143403, + "learning_rate": 5.467815279977503e-06, + "loss": 0.0492, + "step": 21106 + }, + { + "epoch": 8.583570557137048, + "grad_norm": 5.180906836495728, + "learning_rate": 5.466912147815741e-06, + "loss": 0.0693, + "step": 21107 + }, + { + "epoch": 8.583977226514843, + "grad_norm": 0.8068137864823837, + "learning_rate": 5.466009062188409e-06, + "loss": 0.0105, + "step": 21108 + }, + { + "epoch": 8.584383895892639, + "grad_norm": 0.044831895461842505, + "learning_rate": 5.465106023104774e-06, + "loss": 0.0004, + "step": 21109 + }, + { + "epoch": 8.584790565270435, + "grad_norm": 1.1478763835185057, + "learning_rate": 5.464203030574109e-06, + "loss": 0.0088, + "step": 21110 + }, + { + "epoch": 8.58519723464823, + "grad_norm": 6.916127220962047, + "learning_rate": 5.4633000846056825e-06, + "loss": 0.1701, + "step": 21111 + }, + { + "epoch": 8.585603904026026, + "grad_norm": 0.1698562576494447, + "learning_rate": 5.462397185208761e-06, + "loss": 0.0018, + "step": 21112 + }, + { + "epoch": 8.586010573403822, + "grad_norm": 0.12960114223193742, + "learning_rate": 5.461494332392619e-06, + "loss": 0.0013, + "step": 21113 + }, + { + "epoch": 
8.586417242781618, + "grad_norm": 0.7250019629266645, + "learning_rate": 5.4605915261665206e-06, + "loss": 0.004, + "step": 21114 + }, + { + "epoch": 8.586823912159414, + "grad_norm": 0.056743731458542, + "learning_rate": 5.459688766539737e-06, + "loss": 0.001, + "step": 21115 + }, + { + "epoch": 8.58723058153721, + "grad_norm": 2.3622979029589177, + "learning_rate": 5.458786053521532e-06, + "loss": 0.0406, + "step": 21116 + }, + { + "epoch": 8.587637250915005, + "grad_norm": 3.0943018864744136, + "learning_rate": 5.457883387121175e-06, + "loss": 0.2069, + "step": 21117 + }, + { + "epoch": 8.588043920292803, + "grad_norm": 0.2845776221649903, + "learning_rate": 5.4569807673479305e-06, + "loss": 0.0031, + "step": 21118 + }, + { + "epoch": 8.588450589670598, + "grad_norm": 0.002175773990161392, + "learning_rate": 5.456078194211061e-06, + "loss": 0.0, + "step": 21119 + }, + { + "epoch": 8.588857259048394, + "grad_norm": 0.0802428960199419, + "learning_rate": 5.455175667719839e-06, + "loss": 0.0009, + "step": 21120 + }, + { + "epoch": 8.58926392842619, + "grad_norm": 4.013737344787954, + "learning_rate": 5.4542731878835255e-06, + "loss": 0.0489, + "step": 21121 + }, + { + "epoch": 8.589670597803986, + "grad_norm": 0.30971585271309443, + "learning_rate": 5.453370754711387e-06, + "loss": 0.0033, + "step": 21122 + }, + { + "epoch": 8.590077267181782, + "grad_norm": 0.0025521269089605783, + "learning_rate": 5.452468368212685e-06, + "loss": 0.0, + "step": 21123 + }, + { + "epoch": 8.590483936559577, + "grad_norm": 0.002877248106537634, + "learning_rate": 5.451566028396685e-06, + "loss": 0.0, + "step": 21124 + }, + { + "epoch": 8.590890605937373, + "grad_norm": 3.110507109478985, + "learning_rate": 5.450663735272644e-06, + "loss": 0.0751, + "step": 21125 + }, + { + "epoch": 8.591297275315169, + "grad_norm": 3.588374324873002, + "learning_rate": 5.449761488849834e-06, + "loss": 0.03, + "step": 21126 + }, + { + "epoch": 8.591703944692965, + "grad_norm": 5.89196202092148, + "learning_rate": 5.448859289137511e-06, + "loss": 0.1621, + "step": 21127 + }, + { + "epoch": 8.59211061407076, + "grad_norm": 0.9367410807593973, + "learning_rate": 5.44795713614494e-06, + "loss": 0.0096, + "step": 21128 + }, + { + "epoch": 8.592517283448556, + "grad_norm": 1.532648208675991, + "learning_rate": 5.4470550298813794e-06, + "loss": 0.0149, + "step": 21129 + }, + { + "epoch": 8.592923952826352, + "grad_norm": 0.07842333097481291, + "learning_rate": 5.44615297035609e-06, + "loss": 0.0013, + "step": 21130 + }, + { + "epoch": 8.593330622204148, + "grad_norm": 0.8971421343041385, + "learning_rate": 5.445250957578332e-06, + "loss": 0.0099, + "step": 21131 + }, + { + "epoch": 8.593737291581943, + "grad_norm": 0.07754682385617746, + "learning_rate": 5.444348991557362e-06, + "loss": 0.001, + "step": 21132 + }, + { + "epoch": 8.59414396095974, + "grad_norm": 0.09333005902059495, + "learning_rate": 5.443447072302446e-06, + "loss": 0.0015, + "step": 21133 + }, + { + "epoch": 8.594550630337535, + "grad_norm": 0.5655159604430959, + "learning_rate": 5.442545199822839e-06, + "loss": 0.0051, + "step": 21134 + }, + { + "epoch": 8.59495729971533, + "grad_norm": 0.2140919542033535, + "learning_rate": 5.441643374127798e-06, + "loss": 0.0021, + "step": 21135 + }, + { + "epoch": 8.595363969093126, + "grad_norm": 0.2536498938790814, + "learning_rate": 5.440741595226584e-06, + "loss": 0.0024, + "step": 21136 + }, + { + "epoch": 8.595770638470924, + "grad_norm": 0.006639422324980344, + "learning_rate": 5.439839863128451e-06, + "loss": 0.0001, + 
"step": 21137 + }, + { + "epoch": 8.59617730784872, + "grad_norm": 0.7281080935513925, + "learning_rate": 5.438938177842654e-06, + "loss": 0.0086, + "step": 21138 + }, + { + "epoch": 8.596583977226516, + "grad_norm": 0.024001106063856213, + "learning_rate": 5.438036539378455e-06, + "loss": 0.0003, + "step": 21139 + }, + { + "epoch": 8.596990646604311, + "grad_norm": 6.205277423821942, + "learning_rate": 5.4371349477451084e-06, + "loss": 0.1279, + "step": 21140 + }, + { + "epoch": 8.597397315982107, + "grad_norm": 0.00399672738583435, + "learning_rate": 5.436233402951866e-06, + "loss": 0.0, + "step": 21141 + }, + { + "epoch": 8.597803985359903, + "grad_norm": 0.00970777694686358, + "learning_rate": 5.435331905007985e-06, + "loss": 0.0001, + "step": 21142 + }, + { + "epoch": 8.598210654737699, + "grad_norm": 0.8724359949934178, + "learning_rate": 5.434430453922721e-06, + "loss": 0.0069, + "step": 21143 + }, + { + "epoch": 8.598617324115494, + "grad_norm": 0.8047218539661432, + "learning_rate": 5.433529049705324e-06, + "loss": 0.0106, + "step": 21144 + }, + { + "epoch": 8.59902399349329, + "grad_norm": 0.011112709074550269, + "learning_rate": 5.432627692365047e-06, + "loss": 0.0001, + "step": 21145 + }, + { + "epoch": 8.599430662871086, + "grad_norm": 0.42522679994911083, + "learning_rate": 5.431726381911149e-06, + "loss": 0.0039, + "step": 21146 + }, + { + "epoch": 8.599837332248882, + "grad_norm": 0.10393004993848522, + "learning_rate": 5.4308251183528794e-06, + "loss": 0.0013, + "step": 21147 + }, + { + "epoch": 8.600244001626677, + "grad_norm": 0.03357662278471989, + "learning_rate": 5.42992390169949e-06, + "loss": 0.0004, + "step": 21148 + }, + { + "epoch": 8.600650671004473, + "grad_norm": 6.783697433485477, + "learning_rate": 5.429022731960229e-06, + "loss": 0.1764, + "step": 21149 + }, + { + "epoch": 8.601057340382269, + "grad_norm": 0.05016062981811624, + "learning_rate": 5.428121609144352e-06, + "loss": 0.0004, + "step": 21150 + }, + { + "epoch": 8.601464009760065, + "grad_norm": 0.010125635304688302, + "learning_rate": 5.427220533261108e-06, + "loss": 0.0001, + "step": 21151 + }, + { + "epoch": 8.60187067913786, + "grad_norm": 0.13172012506390168, + "learning_rate": 5.426319504319748e-06, + "loss": 0.0015, + "step": 21152 + }, + { + "epoch": 8.602277348515656, + "grad_norm": 0.020834919530237323, + "learning_rate": 5.425418522329518e-06, + "loss": 0.0003, + "step": 21153 + }, + { + "epoch": 8.602684017893452, + "grad_norm": 0.013944100599049483, + "learning_rate": 5.424517587299669e-06, + "loss": 0.0002, + "step": 21154 + }, + { + "epoch": 8.603090687271248, + "grad_norm": 0.0989795202739612, + "learning_rate": 5.423616699239451e-06, + "loss": 0.0006, + "step": 21155 + }, + { + "epoch": 8.603497356649044, + "grad_norm": 0.5212522718476816, + "learning_rate": 5.4227158581581075e-06, + "loss": 0.0058, + "step": 21156 + }, + { + "epoch": 8.60390402602684, + "grad_norm": 0.14441476730071381, + "learning_rate": 5.421815064064891e-06, + "loss": 0.0011, + "step": 21157 + }, + { + "epoch": 8.604310695404635, + "grad_norm": 0.4896408809635884, + "learning_rate": 5.420914316969048e-06, + "loss": 0.0084, + "step": 21158 + }, + { + "epoch": 8.604717364782433, + "grad_norm": 0.01551783552601829, + "learning_rate": 5.420013616879824e-06, + "loss": 0.0002, + "step": 21159 + }, + { + "epoch": 8.605124034160228, + "grad_norm": 0.0027307915098255366, + "learning_rate": 5.419112963806468e-06, + "loss": 0.0, + "step": 21160 + }, + { + "epoch": 8.605530703538024, + "grad_norm": 0.5792185892703531, + 
"learning_rate": 5.418212357758219e-06, + "loss": 0.0047, + "step": 21161 + }, + { + "epoch": 8.60593737291582, + "grad_norm": 2.8887482488593674, + "learning_rate": 5.417311798744329e-06, + "loss": 0.0559, + "step": 21162 + }, + { + "epoch": 8.606344042293616, + "grad_norm": 2.1357611069102367, + "learning_rate": 5.416411286774035e-06, + "loss": 0.0222, + "step": 21163 + }, + { + "epoch": 8.606750711671411, + "grad_norm": 0.00040614083450723294, + "learning_rate": 5.415510821856589e-06, + "loss": 0.0, + "step": 21164 + }, + { + "epoch": 8.607157381049207, + "grad_norm": 0.10147849194722677, + "learning_rate": 5.414610404001232e-06, + "loss": 0.0007, + "step": 21165 + }, + { + "epoch": 8.607564050427003, + "grad_norm": 0.02172891238506897, + "learning_rate": 5.413710033217208e-06, + "loss": 0.0002, + "step": 21166 + }, + { + "epoch": 8.607970719804799, + "grad_norm": 2.519876280159088, + "learning_rate": 5.412809709513758e-06, + "loss": 0.0688, + "step": 21167 + }, + { + "epoch": 8.608377389182595, + "grad_norm": 0.03768295420669638, + "learning_rate": 5.411909432900124e-06, + "loss": 0.0004, + "step": 21168 + }, + { + "epoch": 8.60878405856039, + "grad_norm": 0.012371871322639541, + "learning_rate": 5.4110092033855465e-06, + "loss": 0.0002, + "step": 21169 + }, + { + "epoch": 8.609190727938186, + "grad_norm": 3.9716701305791235, + "learning_rate": 5.410109020979273e-06, + "loss": 0.0736, + "step": 21170 + }, + { + "epoch": 8.609597397315982, + "grad_norm": 3.8203461390556943, + "learning_rate": 5.40920888569054e-06, + "loss": 0.0489, + "step": 21171 + }, + { + "epoch": 8.610004066693778, + "grad_norm": 2.3352786049581393, + "learning_rate": 5.4083087975285875e-06, + "loss": 0.057, + "step": 21172 + }, + { + "epoch": 8.610410736071573, + "grad_norm": 1.1724955248552695, + "learning_rate": 5.4074087565026566e-06, + "loss": 0.0122, + "step": 21173 + }, + { + "epoch": 8.61081740544937, + "grad_norm": 1.0213267138469093, + "learning_rate": 5.406508762621987e-06, + "loss": 0.0068, + "step": 21174 + }, + { + "epoch": 8.611224074827165, + "grad_norm": 0.061156720857827945, + "learning_rate": 5.405608815895817e-06, + "loss": 0.0005, + "step": 21175 + }, + { + "epoch": 8.61163074420496, + "grad_norm": 0.00500651229962996, + "learning_rate": 5.40470891633338e-06, + "loss": 0.0001, + "step": 21176 + }, + { + "epoch": 8.612037413582756, + "grad_norm": 0.0866122632301995, + "learning_rate": 5.4038090639439236e-06, + "loss": 0.0014, + "step": 21177 + }, + { + "epoch": 8.612444082960554, + "grad_norm": 0.40854451717728474, + "learning_rate": 5.40290925873668e-06, + "loss": 0.0028, + "step": 21178 + }, + { + "epoch": 8.61285075233835, + "grad_norm": 2.3378652224280034, + "learning_rate": 5.4020095007208855e-06, + "loss": 0.0213, + "step": 21179 + }, + { + "epoch": 8.613257421716146, + "grad_norm": 0.2917554298019376, + "learning_rate": 5.401109789905778e-06, + "loss": 0.0024, + "step": 21180 + }, + { + "epoch": 8.613664091093941, + "grad_norm": 5.7817913626750315, + "learning_rate": 5.4002101263005934e-06, + "loss": 0.0827, + "step": 21181 + }, + { + "epoch": 8.614070760471737, + "grad_norm": 0.03694582928304047, + "learning_rate": 5.399310509914563e-06, + "loss": 0.0004, + "step": 21182 + }, + { + "epoch": 8.614477429849533, + "grad_norm": 0.05913279220361035, + "learning_rate": 5.398410940756928e-06, + "loss": 0.0009, + "step": 21183 + }, + { + "epoch": 8.614884099227329, + "grad_norm": 0.0006919397361870346, + "learning_rate": 5.397511418836923e-06, + "loss": 0.0, + "step": 21184 + }, + { + "epoch": 
8.615290768605124, + "grad_norm": 1.8454320774168267, + "learning_rate": 5.396611944163779e-06, + "loss": 0.0208, + "step": 21185 + }, + { + "epoch": 8.61569743798292, + "grad_norm": 5.571342887250653, + "learning_rate": 5.39571251674673e-06, + "loss": 0.0805, + "step": 21186 + }, + { + "epoch": 8.616104107360716, + "grad_norm": 0.0009430392538330085, + "learning_rate": 5.39481313659501e-06, + "loss": 0.0, + "step": 21187 + }, + { + "epoch": 8.616510776738512, + "grad_norm": 8.867404571262506, + "learning_rate": 5.39391380371785e-06, + "loss": 0.0926, + "step": 21188 + }, + { + "epoch": 8.616917446116307, + "grad_norm": 0.019487240722320585, + "learning_rate": 5.3930145181244785e-06, + "loss": 0.0002, + "step": 21189 + }, + { + "epoch": 8.617324115494103, + "grad_norm": 0.46253749285761736, + "learning_rate": 5.392115279824137e-06, + "loss": 0.0035, + "step": 21190 + }, + { + "epoch": 8.617730784871899, + "grad_norm": 0.09669174937396531, + "learning_rate": 5.39121608882605e-06, + "loss": 0.0015, + "step": 21191 + }, + { + "epoch": 8.618137454249695, + "grad_norm": 0.12803049133133948, + "learning_rate": 5.390316945139451e-06, + "loss": 0.0013, + "step": 21192 + }, + { + "epoch": 8.61854412362749, + "grad_norm": 0.36969208654727914, + "learning_rate": 5.389417848773567e-06, + "loss": 0.0041, + "step": 21193 + }, + { + "epoch": 8.618950793005286, + "grad_norm": 0.0031082897886019188, + "learning_rate": 5.38851879973763e-06, + "loss": 0.0, + "step": 21194 + }, + { + "epoch": 8.619357462383082, + "grad_norm": 2.7240118202081547, + "learning_rate": 5.387619798040864e-06, + "loss": 0.025, + "step": 21195 + }, + { + "epoch": 8.619764131760878, + "grad_norm": 0.0446608438115229, + "learning_rate": 5.386720843692507e-06, + "loss": 0.0007, + "step": 21196 + }, + { + "epoch": 8.620170801138674, + "grad_norm": 0.33386328280155697, + "learning_rate": 5.3858219367017805e-06, + "loss": 0.0045, + "step": 21197 + }, + { + "epoch": 8.62057747051647, + "grad_norm": 3.9119896165442953, + "learning_rate": 5.384923077077916e-06, + "loss": 0.069, + "step": 21198 + }, + { + "epoch": 8.620984139894265, + "grad_norm": 0.05635302976708319, + "learning_rate": 5.384024264830139e-06, + "loss": 0.0006, + "step": 21199 + }, + { + "epoch": 8.621390809272063, + "grad_norm": 0.10077875265175967, + "learning_rate": 5.383125499967674e-06, + "loss": 0.0009, + "step": 21200 + }, + { + "epoch": 8.621797478649858, + "grad_norm": 0.7621175750457958, + "learning_rate": 5.382226782499751e-06, + "loss": 0.0074, + "step": 21201 + }, + { + "epoch": 8.622204148027654, + "grad_norm": 0.00273645230688663, + "learning_rate": 5.38132811243559e-06, + "loss": 0.0, + "step": 21202 + }, + { + "epoch": 8.62261081740545, + "grad_norm": 0.0050483058145141925, + "learning_rate": 5.380429489784424e-06, + "loss": 0.0001, + "step": 21203 + }, + { + "epoch": 8.623017486783246, + "grad_norm": 0.0031579553544392293, + "learning_rate": 5.379530914555475e-06, + "loss": 0.0, + "step": 21204 + }, + { + "epoch": 8.623424156161041, + "grad_norm": 0.02130602303701597, + "learning_rate": 5.378632386757966e-06, + "loss": 0.0004, + "step": 21205 + }, + { + "epoch": 8.623830825538837, + "grad_norm": 2.390662192692495, + "learning_rate": 5.377733906401121e-06, + "loss": 0.0283, + "step": 21206 + }, + { + "epoch": 8.624237494916633, + "grad_norm": 0.021612503712656363, + "learning_rate": 5.376835473494164e-06, + "loss": 0.0003, + "step": 21207 + }, + { + "epoch": 8.624644164294429, + "grad_norm": 0.0213440996320673, + "learning_rate": 5.375937088046314e-06, + 
"loss": 0.0002, + "step": 21208 + }, + { + "epoch": 8.625050833672224, + "grad_norm": 0.01922055827822889, + "learning_rate": 5.375038750066801e-06, + "loss": 0.0001, + "step": 21209 + }, + { + "epoch": 8.62545750305002, + "grad_norm": 0.35536503675287806, + "learning_rate": 5.3741404595648425e-06, + "loss": 0.0034, + "step": 21210 + }, + { + "epoch": 8.625864172427816, + "grad_norm": 0.19652341198521775, + "learning_rate": 5.373242216549661e-06, + "loss": 0.0018, + "step": 21211 + }, + { + "epoch": 8.626270841805612, + "grad_norm": 0.007036884650778904, + "learning_rate": 5.372344021030478e-06, + "loss": 0.0001, + "step": 21212 + }, + { + "epoch": 8.626677511183408, + "grad_norm": 0.0015832628794840346, + "learning_rate": 5.37144587301651e-06, + "loss": 0.0, + "step": 21213 + }, + { + "epoch": 8.627084180561203, + "grad_norm": 0.005014648296945217, + "learning_rate": 5.37054777251698e-06, + "loss": 0.0, + "step": 21214 + }, + { + "epoch": 8.627490849938999, + "grad_norm": 0.2973469297978501, + "learning_rate": 5.369649719541106e-06, + "loss": 0.0031, + "step": 21215 + }, + { + "epoch": 8.627897519316795, + "grad_norm": 0.06061580602684528, + "learning_rate": 5.368751714098109e-06, + "loss": 0.0005, + "step": 21216 + }, + { + "epoch": 8.62830418869459, + "grad_norm": 0.04050087613192571, + "learning_rate": 5.367853756197207e-06, + "loss": 0.0004, + "step": 21217 + }, + { + "epoch": 8.628710858072386, + "grad_norm": 0.03613762673274423, + "learning_rate": 5.366955845847619e-06, + "loss": 0.0003, + "step": 21218 + }, + { + "epoch": 8.629117527450184, + "grad_norm": 0.6961686657977936, + "learning_rate": 5.366057983058559e-06, + "loss": 0.0064, + "step": 21219 + }, + { + "epoch": 8.62952419682798, + "grad_norm": 0.26101631400657294, + "learning_rate": 5.365160167839247e-06, + "loss": 0.0024, + "step": 21220 + }, + { + "epoch": 8.629930866205775, + "grad_norm": 8.061374335755689, + "learning_rate": 5.364262400198894e-06, + "loss": 0.1366, + "step": 21221 + }, + { + "epoch": 8.630337535583571, + "grad_norm": 0.06194443778108536, + "learning_rate": 5.3633646801467255e-06, + "loss": 0.0008, + "step": 21222 + }, + { + "epoch": 8.630744204961367, + "grad_norm": 0.6703472540976242, + "learning_rate": 5.3624670076919516e-06, + "loss": 0.0087, + "step": 21223 + }, + { + "epoch": 8.631150874339163, + "grad_norm": 0.01652629272302341, + "learning_rate": 5.3615693828437885e-06, + "loss": 0.0002, + "step": 21224 + }, + { + "epoch": 8.631557543716958, + "grad_norm": 0.03305277769234838, + "learning_rate": 5.3606718056114505e-06, + "loss": 0.0003, + "step": 21225 + }, + { + "epoch": 8.631964213094754, + "grad_norm": 0.24888262437275432, + "learning_rate": 5.359774276004151e-06, + "loss": 0.0017, + "step": 21226 + }, + { + "epoch": 8.63237088247255, + "grad_norm": 1.5209840991050876, + "learning_rate": 5.358876794031103e-06, + "loss": 0.0215, + "step": 21227 + }, + { + "epoch": 8.632777551850346, + "grad_norm": 2.8898763163892895, + "learning_rate": 5.3579793597015165e-06, + "loss": 0.0256, + "step": 21228 + }, + { + "epoch": 8.633184221228142, + "grad_norm": 0.8336121290900098, + "learning_rate": 5.357081973024612e-06, + "loss": 0.0076, + "step": 21229 + }, + { + "epoch": 8.633590890605937, + "grad_norm": 17.934683763917374, + "learning_rate": 5.356184634009598e-06, + "loss": 0.4197, + "step": 21230 + }, + { + "epoch": 8.633997559983733, + "grad_norm": 0.11555342768349469, + "learning_rate": 5.355287342665687e-06, + "loss": 0.0012, + "step": 21231 + }, + { + "epoch": 8.634404229361529, + "grad_norm": 
0.0021015180176323536, + "learning_rate": 5.3543900990020885e-06, + "loss": 0.0, + "step": 21232 + }, + { + "epoch": 8.634810898739325, + "grad_norm": 0.06632479787047992, + "learning_rate": 5.353492903028013e-06, + "loss": 0.0007, + "step": 21233 + }, + { + "epoch": 8.63521756811712, + "grad_norm": 0.4386365280009981, + "learning_rate": 5.352595754752672e-06, + "loss": 0.0051, + "step": 21234 + }, + { + "epoch": 8.635624237494916, + "grad_norm": 0.005058804472171339, + "learning_rate": 5.351698654185271e-06, + "loss": 0.0001, + "step": 21235 + }, + { + "epoch": 8.636030906872712, + "grad_norm": 0.3489680810737276, + "learning_rate": 5.350801601335026e-06, + "loss": 0.0027, + "step": 21236 + }, + { + "epoch": 8.636437576250508, + "grad_norm": 0.0006566487295631714, + "learning_rate": 5.349904596211143e-06, + "loss": 0.0, + "step": 21237 + }, + { + "epoch": 8.636844245628303, + "grad_norm": 0.000854063500028811, + "learning_rate": 5.34900763882283e-06, + "loss": 0.0, + "step": 21238 + }, + { + "epoch": 8.6372509150061, + "grad_norm": 0.014807587221173311, + "learning_rate": 5.348110729179293e-06, + "loss": 0.0002, + "step": 21239 + }, + { + "epoch": 8.637657584383895, + "grad_norm": 0.07241183672141918, + "learning_rate": 5.347213867289742e-06, + "loss": 0.0007, + "step": 21240 + }, + { + "epoch": 8.638064253761693, + "grad_norm": 2.7785027029695493, + "learning_rate": 5.3463170531633764e-06, + "loss": 0.0332, + "step": 21241 + }, + { + "epoch": 8.638470923139488, + "grad_norm": 1.4500421490126691, + "learning_rate": 5.345420286809415e-06, + "loss": 0.0131, + "step": 21242 + }, + { + "epoch": 8.638877592517284, + "grad_norm": 3.009166044858388, + "learning_rate": 5.344523568237056e-06, + "loss": 0.1096, + "step": 21243 + }, + { + "epoch": 8.63928426189508, + "grad_norm": 0.042767546184613946, + "learning_rate": 5.343626897455505e-06, + "loss": 0.0005, + "step": 21244 + }, + { + "epoch": 8.639690931272876, + "grad_norm": 0.024689009443956144, + "learning_rate": 5.3427302744739675e-06, + "loss": 0.0002, + "step": 21245 + }, + { + "epoch": 8.640097600650671, + "grad_norm": 0.16200218685443346, + "learning_rate": 5.3418336993016485e-06, + "loss": 0.0022, + "step": 21246 + }, + { + "epoch": 8.640504270028467, + "grad_norm": 0.08290091788053693, + "learning_rate": 5.34093717194775e-06, + "loss": 0.0006, + "step": 21247 + }, + { + "epoch": 8.640910939406263, + "grad_norm": 0.0028538143871429233, + "learning_rate": 5.340040692421473e-06, + "loss": 0.0, + "step": 21248 + }, + { + "epoch": 8.641317608784059, + "grad_norm": 0.008114154465049653, + "learning_rate": 5.339144260732027e-06, + "loss": 0.0001, + "step": 21249 + }, + { + "epoch": 8.641724278161854, + "grad_norm": 14.625014453864495, + "learning_rate": 5.338247876888616e-06, + "loss": 0.3897, + "step": 21250 + }, + { + "epoch": 8.64213094753965, + "grad_norm": 0.03010636381143001, + "learning_rate": 5.337351540900431e-06, + "loss": 0.0002, + "step": 21251 + }, + { + "epoch": 8.642537616917446, + "grad_norm": 0.09518186286434971, + "learning_rate": 5.33645525277668e-06, + "loss": 0.001, + "step": 21252 + }, + { + "epoch": 8.642944286295242, + "grad_norm": 0.015628678063440486, + "learning_rate": 5.335559012526559e-06, + "loss": 0.0003, + "step": 21253 + }, + { + "epoch": 8.643350955673037, + "grad_norm": 0.5888688739235296, + "learning_rate": 5.334662820159276e-06, + "loss": 0.0041, + "step": 21254 + }, + { + "epoch": 8.643757625050833, + "grad_norm": 0.009337147404887962, + "learning_rate": 5.333766675684028e-06, + "loss": 0.0001, + "step": 
21255 + }, + { + "epoch": 8.644164294428629, + "grad_norm": 0.7187090152932161, + "learning_rate": 5.332870579110013e-06, + "loss": 0.0065, + "step": 21256 + }, + { + "epoch": 8.644570963806425, + "grad_norm": 0.0061154584240568815, + "learning_rate": 5.331974530446431e-06, + "loss": 0.0001, + "step": 21257 + }, + { + "epoch": 8.64497763318422, + "grad_norm": 0.1560142276158705, + "learning_rate": 5.331078529702478e-06, + "loss": 0.0014, + "step": 21258 + }, + { + "epoch": 8.645384302562016, + "grad_norm": 1.3793243703468963, + "learning_rate": 5.330182576887351e-06, + "loss": 0.011, + "step": 21259 + }, + { + "epoch": 8.645790971939814, + "grad_norm": 0.024450533828484465, + "learning_rate": 5.329286672010254e-06, + "loss": 0.0002, + "step": 21260 + }, + { + "epoch": 8.64619764131761, + "grad_norm": 9.129779483766596, + "learning_rate": 5.328390815080381e-06, + "loss": 0.1257, + "step": 21261 + }, + { + "epoch": 8.646604310695405, + "grad_norm": 0.007074049837464983, + "learning_rate": 5.327495006106926e-06, + "loss": 0.0001, + "step": 21262 + }, + { + "epoch": 8.647010980073201, + "grad_norm": 3.927456599826792, + "learning_rate": 5.326599245099088e-06, + "loss": 0.0142, + "step": 21263 + }, + { + "epoch": 8.647417649450997, + "grad_norm": 0.25176437960434295, + "learning_rate": 5.32570353206606e-06, + "loss": 0.0027, + "step": 21264 + }, + { + "epoch": 8.647824318828793, + "grad_norm": 0.9217367300904373, + "learning_rate": 5.3248078670170365e-06, + "loss": 0.0088, + "step": 21265 + }, + { + "epoch": 8.648230988206588, + "grad_norm": 0.06764903132261799, + "learning_rate": 5.32391224996121e-06, + "loss": 0.0009, + "step": 21266 + }, + { + "epoch": 8.648637657584384, + "grad_norm": 0.00024294437731152023, + "learning_rate": 5.323016680907782e-06, + "loss": 0.0, + "step": 21267 + }, + { + "epoch": 8.64904432696218, + "grad_norm": 8.336769393196533, + "learning_rate": 5.3221211598659426e-06, + "loss": 0.0593, + "step": 21268 + }, + { + "epoch": 8.649450996339976, + "grad_norm": 1.1450040736913962, + "learning_rate": 5.321225686844882e-06, + "loss": 0.0125, + "step": 21269 + }, + { + "epoch": 8.649857665717771, + "grad_norm": 0.3302790281057644, + "learning_rate": 5.320330261853797e-06, + "loss": 0.002, + "step": 21270 + }, + { + "epoch": 8.650264335095567, + "grad_norm": 0.07583789632214441, + "learning_rate": 5.319434884901874e-06, + "loss": 0.001, + "step": 21271 + }, + { + "epoch": 8.650671004473363, + "grad_norm": 0.42342368044996925, + "learning_rate": 5.318539555998306e-06, + "loss": 0.0041, + "step": 21272 + }, + { + "epoch": 8.651077673851159, + "grad_norm": 8.919561415159617, + "learning_rate": 5.3176442751522875e-06, + "loss": 0.1006, + "step": 21273 + }, + { + "epoch": 8.651484343228955, + "grad_norm": 0.0934302638469115, + "learning_rate": 5.316749042373008e-06, + "loss": 0.001, + "step": 21274 + }, + { + "epoch": 8.65189101260675, + "grad_norm": 0.3046867663079683, + "learning_rate": 5.3158538576696575e-06, + "loss": 0.0035, + "step": 21275 + }, + { + "epoch": 8.652297681984546, + "grad_norm": 0.08230722323822412, + "learning_rate": 5.314958721051424e-06, + "loss": 0.0009, + "step": 21276 + }, + { + "epoch": 8.652704351362342, + "grad_norm": 0.02736319299856146, + "learning_rate": 5.314063632527497e-06, + "loss": 0.0003, + "step": 21277 + }, + { + "epoch": 8.653111020740138, + "grad_norm": 0.03968197930482077, + "learning_rate": 5.313168592107066e-06, + "loss": 0.0004, + "step": 21278 + }, + { + "epoch": 8.653517690117933, + "grad_norm": 0.006036985508422239, + 
"learning_rate": 5.312273599799313e-06, + "loss": 0.0001, + "step": 21279 + }, + { + "epoch": 8.65392435949573, + "grad_norm": 0.07664311181052516, + "learning_rate": 5.311378655613435e-06, + "loss": 0.0007, + "step": 21280 + }, + { + "epoch": 8.654331028873525, + "grad_norm": 0.5141441972407667, + "learning_rate": 5.310483759558616e-06, + "loss": 0.004, + "step": 21281 + }, + { + "epoch": 8.654737698251322, + "grad_norm": 2.331977738529966, + "learning_rate": 5.309588911644039e-06, + "loss": 0.0431, + "step": 21282 + }, + { + "epoch": 8.655144367629118, + "grad_norm": 7.94901537925022, + "learning_rate": 5.308694111878895e-06, + "loss": 0.451, + "step": 21283 + }, + { + "epoch": 8.655551037006914, + "grad_norm": 0.024107828675548323, + "learning_rate": 5.307799360272366e-06, + "loss": 0.0002, + "step": 21284 + }, + { + "epoch": 8.65595770638471, + "grad_norm": 0.04209512815898564, + "learning_rate": 5.306904656833634e-06, + "loss": 0.0006, + "step": 21285 + }, + { + "epoch": 8.656364375762506, + "grad_norm": 0.6141514135426291, + "learning_rate": 5.306010001571892e-06, + "loss": 0.0063, + "step": 21286 + }, + { + "epoch": 8.656771045140301, + "grad_norm": 5.2097254116230785, + "learning_rate": 5.305115394496319e-06, + "loss": 0.0663, + "step": 21287 + }, + { + "epoch": 8.657177714518097, + "grad_norm": 3.3891751066866576, + "learning_rate": 5.304220835616101e-06, + "loss": 0.0753, + "step": 21288 + }, + { + "epoch": 8.657584383895893, + "grad_norm": 0.4888004844698953, + "learning_rate": 5.303326324940417e-06, + "loss": 0.0058, + "step": 21289 + }, + { + "epoch": 8.657991053273689, + "grad_norm": 0.0025321448385057367, + "learning_rate": 5.302431862478454e-06, + "loss": 0.0, + "step": 21290 + }, + { + "epoch": 8.658397722651484, + "grad_norm": 6.797641211106054, + "learning_rate": 5.301537448239391e-06, + "loss": 0.0775, + "step": 21291 + }, + { + "epoch": 8.65880439202928, + "grad_norm": 0.056389489079679035, + "learning_rate": 5.300643082232409e-06, + "loss": 0.0006, + "step": 21292 + }, + { + "epoch": 8.659211061407076, + "grad_norm": 4.525205451163341, + "learning_rate": 5.299748764466692e-06, + "loss": 0.0512, + "step": 21293 + }, + { + "epoch": 8.659617730784872, + "grad_norm": 7.7949528169029785, + "learning_rate": 5.298854494951422e-06, + "loss": 0.0989, + "step": 21294 + }, + { + "epoch": 8.660024400162667, + "grad_norm": 0.5651033266245248, + "learning_rate": 5.297960273695775e-06, + "loss": 0.0032, + "step": 21295 + }, + { + "epoch": 8.660431069540463, + "grad_norm": 9.006240352488783, + "learning_rate": 5.297066100708932e-06, + "loss": 0.1473, + "step": 21296 + }, + { + "epoch": 8.660837738918259, + "grad_norm": 2.301437375855754, + "learning_rate": 5.296171976000073e-06, + "loss": 0.0278, + "step": 21297 + }, + { + "epoch": 8.661244408296055, + "grad_norm": 0.8456009426314108, + "learning_rate": 5.295277899578373e-06, + "loss": 0.0095, + "step": 21298 + }, + { + "epoch": 8.66165107767385, + "grad_norm": 0.23852271547789142, + "learning_rate": 5.294383871453016e-06, + "loss": 0.0019, + "step": 21299 + }, + { + "epoch": 8.662057747051646, + "grad_norm": 6.16702932632028, + "learning_rate": 5.293489891633177e-06, + "loss": 0.1332, + "step": 21300 + }, + { + "epoch": 8.662464416429444, + "grad_norm": 3.222160129686437, + "learning_rate": 5.292595960128033e-06, + "loss": 0.0676, + "step": 21301 + }, + { + "epoch": 8.66287108580724, + "grad_norm": 0.880676218816284, + "learning_rate": 5.291702076946761e-06, + "loss": 0.0099, + "step": 21302 + }, + { + "epoch": 8.663277755185035, + 
"grad_norm": 0.3784788698942995, + "learning_rate": 5.290808242098537e-06, + "loss": 0.0046, + "step": 21303 + }, + { + "epoch": 8.663684424562831, + "grad_norm": 2.238809376714198, + "learning_rate": 5.289914455592536e-06, + "loss": 0.0275, + "step": 21304 + }, + { + "epoch": 8.664091093940627, + "grad_norm": 0.04143243452552231, + "learning_rate": 5.289020717437931e-06, + "loss": 0.0003, + "step": 21305 + }, + { + "epoch": 8.664497763318423, + "grad_norm": 0.4305881455598677, + "learning_rate": 5.288127027643901e-06, + "loss": 0.0058, + "step": 21306 + }, + { + "epoch": 8.664904432696218, + "grad_norm": 5.198873351867242, + "learning_rate": 5.28723338621962e-06, + "loss": 0.0571, + "step": 21307 + }, + { + "epoch": 8.665311102074014, + "grad_norm": 0.0011431820578257897, + "learning_rate": 5.28633979317426e-06, + "loss": 0.0, + "step": 21308 + }, + { + "epoch": 8.66571777145181, + "grad_norm": 0.08772112627167074, + "learning_rate": 5.2854462485169945e-06, + "loss": 0.0012, + "step": 21309 + }, + { + "epoch": 8.666124440829606, + "grad_norm": 7.299918159955289, + "learning_rate": 5.284552752256995e-06, + "loss": 0.1126, + "step": 21310 + }, + { + "epoch": 8.666531110207401, + "grad_norm": 12.372713006203695, + "learning_rate": 5.283659304403432e-06, + "loss": 1.0626, + "step": 21311 + }, + { + "epoch": 8.666937779585197, + "grad_norm": 6.267602104001165, + "learning_rate": 5.282765904965483e-06, + "loss": 0.1109, + "step": 21312 + }, + { + "epoch": 8.667344448962993, + "grad_norm": 6.7887091873954875, + "learning_rate": 5.281872553952317e-06, + "loss": 0.0795, + "step": 21313 + }, + { + "epoch": 8.667751118340789, + "grad_norm": 0.1864103685303639, + "learning_rate": 5.280979251373104e-06, + "loss": 0.0014, + "step": 21314 + }, + { + "epoch": 8.668157787718584, + "grad_norm": 0.057849103908674364, + "learning_rate": 5.280085997237012e-06, + "loss": 0.0008, + "step": 21315 + }, + { + "epoch": 8.66856445709638, + "grad_norm": 0.28558703517562944, + "learning_rate": 5.279192791553215e-06, + "loss": 0.0026, + "step": 21316 + }, + { + "epoch": 8.668971126474176, + "grad_norm": 0.0009572253827104467, + "learning_rate": 5.278299634330879e-06, + "loss": 0.0, + "step": 21317 + }, + { + "epoch": 8.669377795851972, + "grad_norm": 0.08759727290225379, + "learning_rate": 5.277406525579169e-06, + "loss": 0.0004, + "step": 21318 + }, + { + "epoch": 8.669784465229768, + "grad_norm": 0.03595448051098814, + "learning_rate": 5.276513465307262e-06, + "loss": 0.0006, + "step": 21319 + }, + { + "epoch": 8.670191134607563, + "grad_norm": 0.31509851283303986, + "learning_rate": 5.275620453524324e-06, + "loss": 0.0046, + "step": 21320 + }, + { + "epoch": 8.670597803985359, + "grad_norm": 2.727927129880009, + "learning_rate": 5.274727490239517e-06, + "loss": 0.0341, + "step": 21321 + }, + { + "epoch": 8.671004473363155, + "grad_norm": 0.040075064072106856, + "learning_rate": 5.2738345754620115e-06, + "loss": 0.0004, + "step": 21322 + }, + { + "epoch": 8.671411142740952, + "grad_norm": 0.002186979072062615, + "learning_rate": 5.272941709200972e-06, + "loss": 0.0, + "step": 21323 + }, + { + "epoch": 8.671817812118748, + "grad_norm": 9.883461328173736, + "learning_rate": 5.2720488914655624e-06, + "loss": 0.207, + "step": 21324 + }, + { + "epoch": 8.672224481496544, + "grad_norm": 0.026296217313319686, + "learning_rate": 5.271156122264953e-06, + "loss": 0.0003, + "step": 21325 + }, + { + "epoch": 8.67263115087434, + "grad_norm": 1.5774627055774921, + "learning_rate": 5.270263401608307e-06, + "loss": 0.02, + "step": 
21326 + }, + { + "epoch": 8.673037820252135, + "grad_norm": 0.3154820461578236, + "learning_rate": 5.269370729504787e-06, + "loss": 0.0029, + "step": 21327 + }, + { + "epoch": 8.673444489629931, + "grad_norm": 0.05239553777881824, + "learning_rate": 5.268478105963557e-06, + "loss": 0.0004, + "step": 21328 + }, + { + "epoch": 8.673851159007727, + "grad_norm": 1.6381575947615734, + "learning_rate": 5.267585530993781e-06, + "loss": 0.015, + "step": 21329 + }, + { + "epoch": 8.674257828385523, + "grad_norm": 0.016144182692191696, + "learning_rate": 5.266693004604622e-06, + "loss": 0.0002, + "step": 21330 + }, + { + "epoch": 8.674664497763318, + "grad_norm": 0.12589747638094698, + "learning_rate": 5.265800526805236e-06, + "loss": 0.0017, + "step": 21331 + }, + { + "epoch": 8.675071167141114, + "grad_norm": 9.87537455829264, + "learning_rate": 5.264908097604795e-06, + "loss": 0.1533, + "step": 21332 + }, + { + "epoch": 8.67547783651891, + "grad_norm": 3.442568821937352, + "learning_rate": 5.2640157170124565e-06, + "loss": 0.0627, + "step": 21333 + }, + { + "epoch": 8.675884505896706, + "grad_norm": 6.80753924815372, + "learning_rate": 5.263123385037379e-06, + "loss": 0.0625, + "step": 21334 + }, + { + "epoch": 8.676291175274502, + "grad_norm": 0.0017697489297797309, + "learning_rate": 5.262231101688724e-06, + "loss": 0.0, + "step": 21335 + }, + { + "epoch": 8.676697844652297, + "grad_norm": 4.31913097761225, + "learning_rate": 5.261338866975652e-06, + "loss": 0.04, + "step": 21336 + }, + { + "epoch": 8.677104514030093, + "grad_norm": 0.7696691820747269, + "learning_rate": 5.260446680907321e-06, + "loss": 0.0097, + "step": 21337 + }, + { + "epoch": 8.677511183407889, + "grad_norm": 0.23069757196714638, + "learning_rate": 5.259554543492886e-06, + "loss": 0.0019, + "step": 21338 + }, + { + "epoch": 8.677917852785685, + "grad_norm": 3.129710484014171, + "learning_rate": 5.258662454741514e-06, + "loss": 0.0594, + "step": 21339 + }, + { + "epoch": 8.67832452216348, + "grad_norm": 13.0852260806155, + "learning_rate": 5.257770414662357e-06, + "loss": 0.2332, + "step": 21340 + }, + { + "epoch": 8.678731191541276, + "grad_norm": 7.917282833205222, + "learning_rate": 5.256878423264576e-06, + "loss": 0.2466, + "step": 21341 + }, + { + "epoch": 8.679137860919074, + "grad_norm": 0.05463159644608877, + "learning_rate": 5.2559864805573225e-06, + "loss": 0.0006, + "step": 21342 + }, + { + "epoch": 8.67954453029687, + "grad_norm": 0.0077369062200627866, + "learning_rate": 5.255094586549756e-06, + "loss": 0.0001, + "step": 21343 + }, + { + "epoch": 8.679951199674665, + "grad_norm": 9.981438528823153, + "learning_rate": 5.254202741251028e-06, + "loss": 0.0947, + "step": 21344 + }, + { + "epoch": 8.680357869052461, + "grad_norm": 5.042691711451445, + "learning_rate": 5.2533109446703004e-06, + "loss": 0.1111, + "step": 21345 + }, + { + "epoch": 8.680764538430257, + "grad_norm": 1.0352856583273933, + "learning_rate": 5.252419196816726e-06, + "loss": 0.0125, + "step": 21346 + }, + { + "epoch": 8.681171207808053, + "grad_norm": 0.02535639172721434, + "learning_rate": 5.2515274976994575e-06, + "loss": 0.0003, + "step": 21347 + }, + { + "epoch": 8.681577877185848, + "grad_norm": 3.1580043791673766, + "learning_rate": 5.25063584732765e-06, + "loss": 0.0705, + "step": 21348 + }, + { + "epoch": 8.681984546563644, + "grad_norm": 0.052454368283790916, + "learning_rate": 5.249744245710455e-06, + "loss": 0.0007, + "step": 21349 + }, + { + "epoch": 8.68239121594144, + "grad_norm": 10.607669465700246, + "learning_rate": 
5.248852692857026e-06, + "loss": 0.2365, + "step": 21350 + }, + { + "epoch": 8.682797885319236, + "grad_norm": 1.0136078836902478, + "learning_rate": 5.2479611887765156e-06, + "loss": 0.0102, + "step": 21351 + }, + { + "epoch": 8.683204554697031, + "grad_norm": 1.4177676750526285, + "learning_rate": 5.247069733478075e-06, + "loss": 0.0175, + "step": 21352 + }, + { + "epoch": 8.683611224074827, + "grad_norm": 0.0015949275844406856, + "learning_rate": 5.246178326970856e-06, + "loss": 0.0, + "step": 21353 + }, + { + "epoch": 8.684017893452623, + "grad_norm": 6.826455075197355, + "learning_rate": 5.245286969264008e-06, + "loss": 0.1019, + "step": 21354 + }, + { + "epoch": 8.684424562830419, + "grad_norm": 0.0012558814340304448, + "learning_rate": 5.244395660366683e-06, + "loss": 0.0, + "step": 21355 + }, + { + "epoch": 8.684831232208214, + "grad_norm": 4.538655372784648, + "learning_rate": 5.2435044002880265e-06, + "loss": 0.0611, + "step": 21356 + }, + { + "epoch": 8.68523790158601, + "grad_norm": 0.022781110752674204, + "learning_rate": 5.242613189037196e-06, + "loss": 0.0002, + "step": 21357 + }, + { + "epoch": 8.685644570963806, + "grad_norm": 0.01831228195899311, + "learning_rate": 5.241722026623333e-06, + "loss": 0.0002, + "step": 21358 + }, + { + "epoch": 8.686051240341602, + "grad_norm": 0.9322104633643472, + "learning_rate": 5.2408309130555904e-06, + "loss": 0.0105, + "step": 21359 + }, + { + "epoch": 8.686457909719397, + "grad_norm": 6.90249893675711, + "learning_rate": 5.239939848343112e-06, + "loss": 0.1538, + "step": 21360 + }, + { + "epoch": 8.686864579097193, + "grad_norm": 1.6269297901508848, + "learning_rate": 5.239048832495049e-06, + "loss": 0.0095, + "step": 21361 + }, + { + "epoch": 8.687271248474989, + "grad_norm": 0.40467400325746145, + "learning_rate": 5.238157865520539e-06, + "loss": 0.0044, + "step": 21362 + }, + { + "epoch": 8.687677917852785, + "grad_norm": 0.5588696114520846, + "learning_rate": 5.2372669474287404e-06, + "loss": 0.0059, + "step": 21363 + }, + { + "epoch": 8.688084587230582, + "grad_norm": 0.0013207314346820758, + "learning_rate": 5.236376078228794e-06, + "loss": 0.0, + "step": 21364 + }, + { + "epoch": 8.688491256608378, + "grad_norm": 7.8613643773350725, + "learning_rate": 5.235485257929844e-06, + "loss": 0.1683, + "step": 21365 + }, + { + "epoch": 8.688897925986174, + "grad_norm": 0.031882920449052256, + "learning_rate": 5.234594486541036e-06, + "loss": 0.0003, + "step": 21366 + }, + { + "epoch": 8.68930459536397, + "grad_norm": 5.195183532949765, + "learning_rate": 5.233703764071514e-06, + "loss": 0.1212, + "step": 21367 + }, + { + "epoch": 8.689711264741765, + "grad_norm": 3.313924129039819, + "learning_rate": 5.232813090530422e-06, + "loss": 0.0482, + "step": 21368 + }, + { + "epoch": 8.690117934119561, + "grad_norm": 3.1996581438981995, + "learning_rate": 5.2319224659269e-06, + "loss": 0.0293, + "step": 21369 + }, + { + "epoch": 8.690524603497357, + "grad_norm": 0.725705397555483, + "learning_rate": 5.231031890270095e-06, + "loss": 0.0087, + "step": 21370 + }, + { + "epoch": 8.690931272875153, + "grad_norm": 0.024747710007076006, + "learning_rate": 5.23014136356915e-06, + "loss": 0.0003, + "step": 21371 + }, + { + "epoch": 8.691337942252948, + "grad_norm": 0.060485218262149074, + "learning_rate": 5.229250885833204e-06, + "loss": 0.0006, + "step": 21372 + }, + { + "epoch": 8.691744611630744, + "grad_norm": 3.0783543651043783, + "learning_rate": 5.228360457071398e-06, + "loss": 0.0402, + "step": 21373 + }, + { + "epoch": 8.69215128100854, + 
"grad_norm": 0.008936372365611874, + "learning_rate": 5.227470077292874e-06, + "loss": 0.0001, + "step": 21374 + }, + { + "epoch": 8.692557950386336, + "grad_norm": 0.09699903639732385, + "learning_rate": 5.226579746506767e-06, + "loss": 0.0013, + "step": 21375 + }, + { + "epoch": 8.692964619764131, + "grad_norm": 5.628685501503387, + "learning_rate": 5.225689464722226e-06, + "loss": 0.0638, + "step": 21376 + }, + { + "epoch": 8.693371289141927, + "grad_norm": 3.5555709144609597, + "learning_rate": 5.224799231948385e-06, + "loss": 0.0889, + "step": 21377 + }, + { + "epoch": 8.693777958519723, + "grad_norm": 0.019453356794144627, + "learning_rate": 5.223909048194383e-06, + "loss": 0.0003, + "step": 21378 + }, + { + "epoch": 8.694184627897519, + "grad_norm": 0.28390770244016506, + "learning_rate": 5.223018913469359e-06, + "loss": 0.0035, + "step": 21379 + }, + { + "epoch": 8.694591297275315, + "grad_norm": 0.18573027952986854, + "learning_rate": 5.222128827782449e-06, + "loss": 0.0023, + "step": 21380 + }, + { + "epoch": 8.69499796665311, + "grad_norm": 9.838458828083427, + "learning_rate": 5.221238791142793e-06, + "loss": 0.2868, + "step": 21381 + }, + { + "epoch": 8.695404636030906, + "grad_norm": 0.0119456715352609, + "learning_rate": 5.220348803559521e-06, + "loss": 0.0001, + "step": 21382 + }, + { + "epoch": 8.695811305408704, + "grad_norm": 0.010721670326408148, + "learning_rate": 5.219458865041777e-06, + "loss": 0.0001, + "step": 21383 + }, + { + "epoch": 8.6962179747865, + "grad_norm": 1.7155053335960135, + "learning_rate": 5.218568975598696e-06, + "loss": 0.0077, + "step": 21384 + }, + { + "epoch": 8.696624644164295, + "grad_norm": 1.2805187661717288, + "learning_rate": 5.21767913523941e-06, + "loss": 0.0124, + "step": 21385 + }, + { + "epoch": 8.697031313542091, + "grad_norm": 9.084055383372322, + "learning_rate": 5.216789343973054e-06, + "loss": 0.1873, + "step": 21386 + }, + { + "epoch": 8.697437982919887, + "grad_norm": 0.050689068625717426, + "learning_rate": 5.215899601808762e-06, + "loss": 0.0005, + "step": 21387 + }, + { + "epoch": 8.697844652297682, + "grad_norm": 1.017341744020501, + "learning_rate": 5.215009908755666e-06, + "loss": 0.0045, + "step": 21388 + }, + { + "epoch": 8.698251321675478, + "grad_norm": 0.003731445948347093, + "learning_rate": 5.214120264822905e-06, + "loss": 0.0, + "step": 21389 + }, + { + "epoch": 8.698657991053274, + "grad_norm": 3.5282422572343783, + "learning_rate": 5.213230670019609e-06, + "loss": 0.0551, + "step": 21390 + }, + { + "epoch": 8.69906466043107, + "grad_norm": 0.05813756963316289, + "learning_rate": 5.212341124354907e-06, + "loss": 0.0008, + "step": 21391 + }, + { + "epoch": 8.699471329808866, + "grad_norm": 0.06629035209465185, + "learning_rate": 5.211451627837936e-06, + "loss": 0.0007, + "step": 21392 + }, + { + "epoch": 8.699877999186661, + "grad_norm": 0.12345469761496408, + "learning_rate": 5.210562180477821e-06, + "loss": 0.0013, + "step": 21393 + }, + { + "epoch": 8.700284668564457, + "grad_norm": 0.6107125855120096, + "learning_rate": 5.209672782283697e-06, + "loss": 0.0054, + "step": 21394 + }, + { + "epoch": 8.700691337942253, + "grad_norm": 0.09063883727386463, + "learning_rate": 5.208783433264689e-06, + "loss": 0.0009, + "step": 21395 + }, + { + "epoch": 8.701098007320049, + "grad_norm": 1.1424040237109003, + "learning_rate": 5.207894133429934e-06, + "loss": 0.0161, + "step": 21396 + }, + { + "epoch": 8.701504676697844, + "grad_norm": 0.13933395993115177, + "learning_rate": 5.207004882788558e-06, + "loss": 0.0013, + 
"step": 21397 + }, + { + "epoch": 8.70191134607564, + "grad_norm": 0.012320098933294098, + "learning_rate": 5.2061156813496895e-06, + "loss": 0.0001, + "step": 21398 + }, + { + "epoch": 8.702318015453436, + "grad_norm": 0.9396860715303903, + "learning_rate": 5.205226529122456e-06, + "loss": 0.0034, + "step": 21399 + }, + { + "epoch": 8.702724684831232, + "grad_norm": 1.2681755836720772, + "learning_rate": 5.204337426115984e-06, + "loss": 0.01, + "step": 21400 + }, + { + "epoch": 8.703131354209027, + "grad_norm": 0.8017481589941322, + "learning_rate": 5.203448372339399e-06, + "loss": 0.007, + "step": 21401 + }, + { + "epoch": 8.703538023586823, + "grad_norm": 6.5201140962680855, + "learning_rate": 5.202559367801836e-06, + "loss": 0.1254, + "step": 21402 + }, + { + "epoch": 8.703944692964619, + "grad_norm": 0.0004138505412980723, + "learning_rate": 5.201670412512414e-06, + "loss": 0.0, + "step": 21403 + }, + { + "epoch": 8.704351362342415, + "grad_norm": 1.6977856826852986, + "learning_rate": 5.200781506480259e-06, + "loss": 0.0278, + "step": 21404 + }, + { + "epoch": 8.704758031720212, + "grad_norm": 0.03784532434119281, + "learning_rate": 5.199892649714499e-06, + "loss": 0.0004, + "step": 21405 + }, + { + "epoch": 8.705164701098008, + "grad_norm": 5.250314999235039, + "learning_rate": 5.199003842224257e-06, + "loss": 0.0573, + "step": 21406 + }, + { + "epoch": 8.705571370475804, + "grad_norm": 21.15260838055119, + "learning_rate": 5.198115084018655e-06, + "loss": 0.7467, + "step": 21407 + }, + { + "epoch": 8.7059780398536, + "grad_norm": 1.3114790531688052, + "learning_rate": 5.197226375106816e-06, + "loss": 0.0149, + "step": 21408 + }, + { + "epoch": 8.706384709231395, + "grad_norm": 0.014522673662942133, + "learning_rate": 5.19633771549787e-06, + "loss": 0.0002, + "step": 21409 + }, + { + "epoch": 8.706791378609191, + "grad_norm": 1.041772030826294, + "learning_rate": 5.195449105200934e-06, + "loss": 0.0149, + "step": 21410 + }, + { + "epoch": 8.707198047986987, + "grad_norm": 0.009088342517111206, + "learning_rate": 5.19456054422513e-06, + "loss": 0.0001, + "step": 21411 + }, + { + "epoch": 8.707604717364783, + "grad_norm": 2.5424183174362547, + "learning_rate": 5.1936720325795816e-06, + "loss": 0.0309, + "step": 21412 + }, + { + "epoch": 8.708011386742578, + "grad_norm": 0.04537531142688942, + "learning_rate": 5.192783570273409e-06, + "loss": 0.0004, + "step": 21413 + }, + { + "epoch": 8.708418056120374, + "grad_norm": 2.0315582197363367, + "learning_rate": 5.191895157315728e-06, + "loss": 0.024, + "step": 21414 + }, + { + "epoch": 8.70882472549817, + "grad_norm": 3.9454439769215046, + "learning_rate": 5.191006793715667e-06, + "loss": 0.0754, + "step": 21415 + }, + { + "epoch": 8.709231394875966, + "grad_norm": 0.46026311599410985, + "learning_rate": 5.190118479482342e-06, + "loss": 0.0071, + "step": 21416 + }, + { + "epoch": 8.709638064253761, + "grad_norm": 0.012128962676777343, + "learning_rate": 5.189230214624872e-06, + "loss": 0.0001, + "step": 21417 + }, + { + "epoch": 8.710044733631557, + "grad_norm": 0.5294023883895472, + "learning_rate": 5.188341999152373e-06, + "loss": 0.005, + "step": 21418 + }, + { + "epoch": 8.710451403009353, + "grad_norm": 0.0927490936219819, + "learning_rate": 5.187453833073968e-06, + "loss": 0.0005, + "step": 21419 + }, + { + "epoch": 8.710858072387149, + "grad_norm": 0.2944205645338158, + "learning_rate": 5.186565716398769e-06, + "loss": 0.0019, + "step": 21420 + }, + { + "epoch": 8.711264741764944, + "grad_norm": 0.03566050374461913, + 
"learning_rate": 5.185677649135893e-06, + "loss": 0.0004, + "step": 21421 + }, + { + "epoch": 8.71167141114274, + "grad_norm": 3.05388244888973, + "learning_rate": 5.184789631294463e-06, + "loss": 0.0353, + "step": 21422 + }, + { + "epoch": 8.712078080520536, + "grad_norm": 4.884512002010716, + "learning_rate": 5.18390166288359e-06, + "loss": 0.0701, + "step": 21423 + }, + { + "epoch": 8.712484749898334, + "grad_norm": 0.03613911771280799, + "learning_rate": 5.183013743912391e-06, + "loss": 0.0003, + "step": 21424 + }, + { + "epoch": 8.71289141927613, + "grad_norm": 0.0029188253633278477, + "learning_rate": 5.182125874389979e-06, + "loss": 0.0, + "step": 21425 + }, + { + "epoch": 8.713298088653925, + "grad_norm": 0.16674422617786153, + "learning_rate": 5.1812380543254705e-06, + "loss": 0.0025, + "step": 21426 + }, + { + "epoch": 8.71370475803172, + "grad_norm": 13.602200458646726, + "learning_rate": 5.1803502837279754e-06, + "loss": 0.4531, + "step": 21427 + }, + { + "epoch": 8.714111427409517, + "grad_norm": 0.06501317978613418, + "learning_rate": 5.179462562606613e-06, + "loss": 0.0005, + "step": 21428 + }, + { + "epoch": 8.714518096787312, + "grad_norm": 0.03666642277615821, + "learning_rate": 5.178574890970494e-06, + "loss": 0.0005, + "step": 21429 + }, + { + "epoch": 8.714924766165108, + "grad_norm": 9.425148340753971, + "learning_rate": 5.177687268828731e-06, + "loss": 0.1136, + "step": 21430 + }, + { + "epoch": 8.715331435542904, + "grad_norm": 0.006646615514777335, + "learning_rate": 5.176799696190433e-06, + "loss": 0.0001, + "step": 21431 + }, + { + "epoch": 8.7157381049207, + "grad_norm": 0.06387590263813622, + "learning_rate": 5.1759121730647166e-06, + "loss": 0.0007, + "step": 21432 + }, + { + "epoch": 8.716144774298495, + "grad_norm": 0.5251542143560941, + "learning_rate": 5.175024699460688e-06, + "loss": 0.0044, + "step": 21433 + }, + { + "epoch": 8.716551443676291, + "grad_norm": 0.0025735466448540785, + "learning_rate": 5.174137275387455e-06, + "loss": 0.0, + "step": 21434 + }, + { + "epoch": 8.716958113054087, + "grad_norm": 4.3887742403669385, + "learning_rate": 5.173249900854136e-06, + "loss": 0.0543, + "step": 21435 + }, + { + "epoch": 8.717364782431883, + "grad_norm": 0.005492064250423456, + "learning_rate": 5.172362575869836e-06, + "loss": 0.0001, + "step": 21436 + }, + { + "epoch": 8.717771451809678, + "grad_norm": 0.030263657587164993, + "learning_rate": 5.171475300443665e-06, + "loss": 0.0003, + "step": 21437 + }, + { + "epoch": 8.718178121187474, + "grad_norm": 0.2118319926944195, + "learning_rate": 5.170588074584728e-06, + "loss": 0.0026, + "step": 21438 + }, + { + "epoch": 8.71858479056527, + "grad_norm": 0.02218627832135622, + "learning_rate": 5.169700898302137e-06, + "loss": 0.0003, + "step": 21439 + }, + { + "epoch": 8.718991459943066, + "grad_norm": 0.05435387602613706, + "learning_rate": 5.168813771604992e-06, + "loss": 0.0005, + "step": 21440 + }, + { + "epoch": 8.719398129320862, + "grad_norm": 1.6544400213419448, + "learning_rate": 5.16792669450241e-06, + "loss": 0.0254, + "step": 21441 + }, + { + "epoch": 8.719804798698657, + "grad_norm": 0.11198522922661015, + "learning_rate": 5.167039667003493e-06, + "loss": 0.0009, + "step": 21442 + }, + { + "epoch": 8.720211468076453, + "grad_norm": 0.35152535473064533, + "learning_rate": 5.166152689117345e-06, + "loss": 0.0049, + "step": 21443 + }, + { + "epoch": 8.720618137454249, + "grad_norm": 0.0022884626062792198, + "learning_rate": 5.165265760853072e-06, + "loss": 0.0, + "step": 21444 + }, + { + "epoch": 
8.721024806832045, + "grad_norm": 0.604765978386796, + "learning_rate": 5.164378882219781e-06, + "loss": 0.0089, + "step": 21445 + }, + { + "epoch": 8.721431476209842, + "grad_norm": 0.1468928392731672, + "learning_rate": 5.163492053226573e-06, + "loss": 0.0018, + "step": 21446 + }, + { + "epoch": 8.721838145587638, + "grad_norm": 0.06247606615155923, + "learning_rate": 5.162605273882549e-06, + "loss": 0.0006, + "step": 21447 + }, + { + "epoch": 8.722244814965434, + "grad_norm": 0.16400581489779548, + "learning_rate": 5.161718544196821e-06, + "loss": 0.002, + "step": 21448 + }, + { + "epoch": 8.72265148434323, + "grad_norm": 3.8942260805563502, + "learning_rate": 5.160831864178491e-06, + "loss": 0.0159, + "step": 21449 + }, + { + "epoch": 8.723058153721025, + "grad_norm": 3.33509962909165, + "learning_rate": 5.159945233836654e-06, + "loss": 0.0444, + "step": 21450 + }, + { + "epoch": 8.723464823098821, + "grad_norm": 0.2263041211649802, + "learning_rate": 5.159058653180416e-06, + "loss": 0.0022, + "step": 21451 + }, + { + "epoch": 8.723871492476617, + "grad_norm": 1.576311414006335, + "learning_rate": 5.158172122218872e-06, + "loss": 0.0219, + "step": 21452 + }, + { + "epoch": 8.724278161854413, + "grad_norm": 3.6953469605230254, + "learning_rate": 5.157285640961132e-06, + "loss": 0.0953, + "step": 21453 + }, + { + "epoch": 8.724684831232208, + "grad_norm": 0.0032670179432475544, + "learning_rate": 5.1563992094162916e-06, + "loss": 0.0, + "step": 21454 + }, + { + "epoch": 8.725091500610004, + "grad_norm": 0.05037392527674828, + "learning_rate": 5.155512827593452e-06, + "loss": 0.0006, + "step": 21455 + }, + { + "epoch": 8.7254981699878, + "grad_norm": 0.32312005475686395, + "learning_rate": 5.154626495501711e-06, + "loss": 0.005, + "step": 21456 + }, + { + "epoch": 8.725904839365596, + "grad_norm": 12.378597134684702, + "learning_rate": 5.153740213150169e-06, + "loss": 0.2394, + "step": 21457 + }, + { + "epoch": 8.726311508743391, + "grad_norm": 0.004284440535125811, + "learning_rate": 5.152853980547922e-06, + "loss": 0.0, + "step": 21458 + }, + { + "epoch": 8.726718178121187, + "grad_norm": 2.3194915582749833, + "learning_rate": 5.151967797704064e-06, + "loss": 0.03, + "step": 21459 + }, + { + "epoch": 8.727124847498983, + "grad_norm": 0.03532679726709393, + "learning_rate": 5.1510816646277e-06, + "loss": 0.0004, + "step": 21460 + }, + { + "epoch": 8.727531516876779, + "grad_norm": 0.04692599981781375, + "learning_rate": 5.150195581327924e-06, + "loss": 0.0006, + "step": 21461 + }, + { + "epoch": 8.727938186254574, + "grad_norm": 0.22106728835954015, + "learning_rate": 5.149309547813831e-06, + "loss": 0.0022, + "step": 21462 + }, + { + "epoch": 8.72834485563237, + "grad_norm": 3.968220596597956, + "learning_rate": 5.148423564094517e-06, + "loss": 0.0792, + "step": 21463 + }, + { + "epoch": 8.728751525010166, + "grad_norm": 0.05329012608332395, + "learning_rate": 5.147537630179076e-06, + "loss": 0.0007, + "step": 21464 + }, + { + "epoch": 8.729158194387963, + "grad_norm": 0.9580180680895517, + "learning_rate": 5.146651746076601e-06, + "loss": 0.0098, + "step": 21465 + }, + { + "epoch": 8.72956486376576, + "grad_norm": 0.5468829436449028, + "learning_rate": 5.145765911796191e-06, + "loss": 0.0038, + "step": 21466 + }, + { + "epoch": 8.729971533143555, + "grad_norm": 3.461197992027757, + "learning_rate": 5.144880127346937e-06, + "loss": 0.0355, + "step": 21467 + }, + { + "epoch": 8.73037820252135, + "grad_norm": 0.06291005850900486, + "learning_rate": 5.143994392737934e-06, + "loss": 0.0005, 
+ "step": 21468 + }, + { + "epoch": 8.730784871899147, + "grad_norm": 1.1607158987096222, + "learning_rate": 5.143108707978271e-06, + "loss": 0.0116, + "step": 21469 + }, + { + "epoch": 8.731191541276942, + "grad_norm": 1.3371049231942829, + "learning_rate": 5.142223073077041e-06, + "loss": 0.0153, + "step": 21470 + }, + { + "epoch": 8.731598210654738, + "grad_norm": 0.054136366738765, + "learning_rate": 5.141337488043337e-06, + "loss": 0.0003, + "step": 21471 + }, + { + "epoch": 8.732004880032534, + "grad_norm": 0.10249845011471585, + "learning_rate": 5.140451952886244e-06, + "loss": 0.001, + "step": 21472 + }, + { + "epoch": 8.73241154941033, + "grad_norm": 0.03450789792617832, + "learning_rate": 5.139566467614862e-06, + "loss": 0.0003, + "step": 21473 + }, + { + "epoch": 8.732818218788125, + "grad_norm": 0.017426189020332394, + "learning_rate": 5.1386810322382766e-06, + "loss": 0.0002, + "step": 21474 + }, + { + "epoch": 8.733224888165921, + "grad_norm": 6.566016554118939, + "learning_rate": 5.137795646765578e-06, + "loss": 0.2267, + "step": 21475 + }, + { + "epoch": 8.733631557543717, + "grad_norm": 2.109465913143491, + "learning_rate": 5.136910311205852e-06, + "loss": 0.0236, + "step": 21476 + }, + { + "epoch": 8.734038226921513, + "grad_norm": 2.6928172514228486, + "learning_rate": 5.13602502556819e-06, + "loss": 0.0375, + "step": 21477 + }, + { + "epoch": 8.734444896299308, + "grad_norm": 0.5094984868446606, + "learning_rate": 5.135139789861676e-06, + "loss": 0.0041, + "step": 21478 + }, + { + "epoch": 8.734851565677104, + "grad_norm": 0.43549849296790544, + "learning_rate": 5.134254604095403e-06, + "loss": 0.0045, + "step": 21479 + }, + { + "epoch": 8.7352582350549, + "grad_norm": 8.152553132287848, + "learning_rate": 5.1333694682784575e-06, + "loss": 0.1356, + "step": 21480 + }, + { + "epoch": 8.735664904432696, + "grad_norm": 0.15894655612280273, + "learning_rate": 5.132484382419922e-06, + "loss": 0.0019, + "step": 21481 + }, + { + "epoch": 8.736071573810491, + "grad_norm": 0.6703500224621188, + "learning_rate": 5.131599346528884e-06, + "loss": 0.0045, + "step": 21482 + }, + { + "epoch": 8.736478243188287, + "grad_norm": 0.014066979180889455, + "learning_rate": 5.13071436061443e-06, + "loss": 0.0002, + "step": 21483 + }, + { + "epoch": 8.736884912566083, + "grad_norm": 0.03325792488329312, + "learning_rate": 5.129829424685644e-06, + "loss": 0.0004, + "step": 21484 + }, + { + "epoch": 8.737291581943879, + "grad_norm": 0.45414711636555544, + "learning_rate": 5.128944538751604e-06, + "loss": 0.0051, + "step": 21485 + }, + { + "epoch": 8.737698251321675, + "grad_norm": 0.7201189487269938, + "learning_rate": 5.128059702821407e-06, + "loss": 0.0079, + "step": 21486 + }, + { + "epoch": 8.738104920699472, + "grad_norm": 0.13375112261855945, + "learning_rate": 5.127174916904126e-06, + "loss": 0.0017, + "step": 21487 + }, + { + "epoch": 8.738511590077268, + "grad_norm": 0.7000188718267791, + "learning_rate": 5.126290181008848e-06, + "loss": 0.0038, + "step": 21488 + }, + { + "epoch": 8.738918259455064, + "grad_norm": 3.0148594360420455, + "learning_rate": 5.125405495144656e-06, + "loss": 0.0276, + "step": 21489 + }, + { + "epoch": 8.73932492883286, + "grad_norm": 3.211872968413721, + "learning_rate": 5.124520859320628e-06, + "loss": 0.0307, + "step": 21490 + }, + { + "epoch": 8.739731598210655, + "grad_norm": 0.03890139923238181, + "learning_rate": 5.1236362735458426e-06, + "loss": 0.0007, + "step": 21491 + }, + { + "epoch": 8.740138267588451, + "grad_norm": 7.4405767133620735, + 
"learning_rate": 5.12275173782939e-06, + "loss": 0.0632, + "step": 21492 + }, + { + "epoch": 8.740544936966247, + "grad_norm": 1.6213232804474322, + "learning_rate": 5.121867252180346e-06, + "loss": 0.0233, + "step": 21493 + }, + { + "epoch": 8.740951606344042, + "grad_norm": 0.0183577037556097, + "learning_rate": 5.1209828166077894e-06, + "loss": 0.0002, + "step": 21494 + }, + { + "epoch": 8.741358275721838, + "grad_norm": 1.9210357827818394, + "learning_rate": 5.120098431120799e-06, + "loss": 0.011, + "step": 21495 + }, + { + "epoch": 8.741764945099634, + "grad_norm": 0.14302223015014787, + "learning_rate": 5.119214095728455e-06, + "loss": 0.0013, + "step": 21496 + }, + { + "epoch": 8.74217161447743, + "grad_norm": 0.07285797425768518, + "learning_rate": 5.118329810439835e-06, + "loss": 0.0007, + "step": 21497 + }, + { + "epoch": 8.742578283855226, + "grad_norm": 0.2132580799206795, + "learning_rate": 5.117445575264012e-06, + "loss": 0.0021, + "step": 21498 + }, + { + "epoch": 8.742984953233021, + "grad_norm": 0.037500917342114526, + "learning_rate": 5.116561390210071e-06, + "loss": 0.0004, + "step": 21499 + }, + { + "epoch": 8.743391622610817, + "grad_norm": 3.597459920922075, + "learning_rate": 5.115677255287086e-06, + "loss": 0.0466, + "step": 21500 + }, + { + "epoch": 8.743798291988613, + "grad_norm": 7.19272458934167, + "learning_rate": 5.114793170504131e-06, + "loss": 0.22, + "step": 21501 + }, + { + "epoch": 8.744204961366409, + "grad_norm": 0.06215064327605993, + "learning_rate": 5.113909135870284e-06, + "loss": 0.0006, + "step": 21502 + }, + { + "epoch": 8.744611630744204, + "grad_norm": 1.3190838908682785, + "learning_rate": 5.1130251513946185e-06, + "loss": 0.0113, + "step": 21503 + }, + { + "epoch": 8.745018300122, + "grad_norm": 0.02343256878122368, + "learning_rate": 5.112141217086205e-06, + "loss": 0.0002, + "step": 21504 + }, + { + "epoch": 8.745424969499796, + "grad_norm": 0.02228979340640503, + "learning_rate": 5.111257332954127e-06, + "loss": 0.0002, + "step": 21505 + }, + { + "epoch": 8.745831638877593, + "grad_norm": 6.42898545105735, + "learning_rate": 5.110373499007451e-06, + "loss": 0.0707, + "step": 21506 + }, + { + "epoch": 8.74623830825539, + "grad_norm": 0.06310138496780666, + "learning_rate": 5.109489715255253e-06, + "loss": 0.0007, + "step": 21507 + }, + { + "epoch": 8.746644977633185, + "grad_norm": 0.008512089173190811, + "learning_rate": 5.108605981706604e-06, + "loss": 0.0001, + "step": 21508 + }, + { + "epoch": 8.74705164701098, + "grad_norm": 0.23952247064385998, + "learning_rate": 5.107722298370577e-06, + "loss": 0.002, + "step": 21509 + }, + { + "epoch": 8.747458316388776, + "grad_norm": 4.505313424352887, + "learning_rate": 5.106838665256244e-06, + "loss": 0.0316, + "step": 21510 + }, + { + "epoch": 8.747864985766572, + "grad_norm": 0.011481436040887517, + "learning_rate": 5.105955082372669e-06, + "loss": 0.0001, + "step": 21511 + }, + { + "epoch": 8.748271655144368, + "grad_norm": 3.075583890510709, + "learning_rate": 5.1050715497289325e-06, + "loss": 0.0393, + "step": 21512 + }, + { + "epoch": 8.748678324522164, + "grad_norm": 0.004546049913340913, + "learning_rate": 5.1041880673341005e-06, + "loss": 0.0001, + "step": 21513 + }, + { + "epoch": 8.74908499389996, + "grad_norm": 0.4778895146078557, + "learning_rate": 5.103304635197242e-06, + "loss": 0.0034, + "step": 21514 + }, + { + "epoch": 8.749491663277755, + "grad_norm": 1.3332303277165194, + "learning_rate": 5.102421253327425e-06, + "loss": 0.0106, + "step": 21515 + }, + { + "epoch": 
8.749898332655551, + "grad_norm": 0.017955265770418432, + "learning_rate": 5.1015379217337194e-06, + "loss": 0.0002, + "step": 21516 + }, + { + "epoch": 8.750305002033347, + "grad_norm": 0.003070910431955073, + "learning_rate": 5.100654640425189e-06, + "loss": 0.0, + "step": 21517 + }, + { + "epoch": 8.750711671411143, + "grad_norm": 0.11607989147626552, + "learning_rate": 5.099771409410907e-06, + "loss": 0.0013, + "step": 21518 + }, + { + "epoch": 8.751118340788938, + "grad_norm": 0.2731457173387671, + "learning_rate": 5.0988882286999386e-06, + "loss": 0.0028, + "step": 21519 + }, + { + "epoch": 8.751525010166734, + "grad_norm": 0.025961564109744345, + "learning_rate": 5.0980050983013496e-06, + "loss": 0.0004, + "step": 21520 + }, + { + "epoch": 8.75193167954453, + "grad_norm": 0.05927444678852836, + "learning_rate": 5.097122018224205e-06, + "loss": 0.0006, + "step": 21521 + }, + { + "epoch": 8.752338348922326, + "grad_norm": 0.03754477973770186, + "learning_rate": 5.096238988477571e-06, + "loss": 0.0005, + "step": 21522 + }, + { + "epoch": 8.752745018300121, + "grad_norm": 0.128000124503078, + "learning_rate": 5.095356009070512e-06, + "loss": 0.0018, + "step": 21523 + }, + { + "epoch": 8.753151687677917, + "grad_norm": 0.42216304772847985, + "learning_rate": 5.094473080012085e-06, + "loss": 0.0047, + "step": 21524 + }, + { + "epoch": 8.753558357055713, + "grad_norm": 0.02428622884652016, + "learning_rate": 5.093590201311368e-06, + "loss": 0.0003, + "step": 21525 + }, + { + "epoch": 8.753965026433509, + "grad_norm": 8.248332741205447, + "learning_rate": 5.092707372977417e-06, + "loss": 0.1454, + "step": 21526 + }, + { + "epoch": 8.754371695811304, + "grad_norm": 0.018813699195872148, + "learning_rate": 5.0918245950192925e-06, + "loss": 0.0003, + "step": 21527 + }, + { + "epoch": 8.754778365189102, + "grad_norm": 0.004478543541669783, + "learning_rate": 5.09094186744606e-06, + "loss": 0.0001, + "step": 21528 + }, + { + "epoch": 8.755185034566898, + "grad_norm": 1.707326083422489, + "learning_rate": 5.090059190266779e-06, + "loss": 0.0519, + "step": 21529 + }, + { + "epoch": 8.755591703944694, + "grad_norm": 2.1024680283436283, + "learning_rate": 5.0891765634905084e-06, + "loss": 0.0221, + "step": 21530 + }, + { + "epoch": 8.75599837332249, + "grad_norm": 0.022135021942304017, + "learning_rate": 5.088293987126316e-06, + "loss": 0.0002, + "step": 21531 + }, + { + "epoch": 8.756405042700285, + "grad_norm": 0.013307956309693607, + "learning_rate": 5.087411461183258e-06, + "loss": 0.0001, + "step": 21532 + }, + { + "epoch": 8.75681171207808, + "grad_norm": 0.3091311999469593, + "learning_rate": 5.0865289856703935e-06, + "loss": 0.0026, + "step": 21533 + }, + { + "epoch": 8.757218381455877, + "grad_norm": 5.092701499043191, + "learning_rate": 5.085646560596782e-06, + "loss": 0.0743, + "step": 21534 + }, + { + "epoch": 8.757625050833672, + "grad_norm": 0.4528949054897652, + "learning_rate": 5.08476418597148e-06, + "loss": 0.005, + "step": 21535 + }, + { + "epoch": 8.758031720211468, + "grad_norm": 1.273165812084636, + "learning_rate": 5.08388186180355e-06, + "loss": 0.0097, + "step": 21536 + }, + { + "epoch": 8.758438389589264, + "grad_norm": 4.2732767527298305, + "learning_rate": 5.082999588102042e-06, + "loss": 0.0396, + "step": 21537 + }, + { + "epoch": 8.75884505896706, + "grad_norm": 5.743436339549631, + "learning_rate": 5.0821173648760225e-06, + "loss": 0.0733, + "step": 21538 + }, + { + "epoch": 8.759251728344855, + "grad_norm": 0.5492940979904753, + "learning_rate": 5.081235192134541e-06, 
+ "loss": 0.0061, + "step": 21539 + }, + { + "epoch": 8.759658397722651, + "grad_norm": 0.16626757281867208, + "learning_rate": 5.080353069886658e-06, + "loss": 0.001, + "step": 21540 + }, + { + "epoch": 8.760065067100447, + "grad_norm": 0.4358543379334258, + "learning_rate": 5.079470998141427e-06, + "loss": 0.0048, + "step": 21541 + }, + { + "epoch": 8.760471736478243, + "grad_norm": 0.00887554978323153, + "learning_rate": 5.078588976907902e-06, + "loss": 0.0001, + "step": 21542 + }, + { + "epoch": 8.760878405856038, + "grad_norm": 0.14237946373750937, + "learning_rate": 5.077707006195134e-06, + "loss": 0.0009, + "step": 21543 + }, + { + "epoch": 8.761285075233834, + "grad_norm": 0.0020418128620497598, + "learning_rate": 5.0768250860121846e-06, + "loss": 0.0, + "step": 21544 + }, + { + "epoch": 8.76169174461163, + "grad_norm": 2.9380502517546234, + "learning_rate": 5.0759432163681045e-06, + "loss": 0.0393, + "step": 21545 + }, + { + "epoch": 8.762098413989426, + "grad_norm": 0.763200828564484, + "learning_rate": 5.075061397271945e-06, + "loss": 0.0075, + "step": 21546 + }, + { + "epoch": 8.762505083367223, + "grad_norm": 0.013458672802980133, + "learning_rate": 5.074179628732758e-06, + "loss": 0.0001, + "step": 21547 + }, + { + "epoch": 8.762911752745019, + "grad_norm": 1.482613311030577, + "learning_rate": 5.073297910759598e-06, + "loss": 0.0072, + "step": 21548 + }, + { + "epoch": 8.763318422122815, + "grad_norm": 0.33736283382196613, + "learning_rate": 5.072416243361513e-06, + "loss": 0.0026, + "step": 21549 + }, + { + "epoch": 8.76372509150061, + "grad_norm": 0.005976765607459793, + "learning_rate": 5.0715346265475565e-06, + "loss": 0.0001, + "step": 21550 + }, + { + "epoch": 8.764131760878406, + "grad_norm": 1.701073768674135, + "learning_rate": 5.070653060326776e-06, + "loss": 0.0257, + "step": 21551 + }, + { + "epoch": 8.764538430256202, + "grad_norm": 0.005062482079172276, + "learning_rate": 5.069771544708224e-06, + "loss": 0.0001, + "step": 21552 + }, + { + "epoch": 8.764945099633998, + "grad_norm": 0.11514507550610018, + "learning_rate": 5.068890079700948e-06, + "loss": 0.0006, + "step": 21553 + }, + { + "epoch": 8.765351769011794, + "grad_norm": 0.012340069945997019, + "learning_rate": 5.068008665313997e-06, + "loss": 0.0001, + "step": 21554 + }, + { + "epoch": 8.76575843838959, + "grad_norm": 0.11024364075712166, + "learning_rate": 5.067127301556415e-06, + "loss": 0.0012, + "step": 21555 + }, + { + "epoch": 8.766165107767385, + "grad_norm": 6.0209105139616295, + "learning_rate": 5.066245988437257e-06, + "loss": 0.2157, + "step": 21556 + }, + { + "epoch": 8.766571777145181, + "grad_norm": 0.049002200442488646, + "learning_rate": 5.065364725965569e-06, + "loss": 0.0006, + "step": 21557 + }, + { + "epoch": 8.766978446522977, + "grad_norm": 0.2349939718074164, + "learning_rate": 5.0644835141503954e-06, + "loss": 0.0025, + "step": 21558 + }, + { + "epoch": 8.767385115900773, + "grad_norm": 1.0932676661273402, + "learning_rate": 5.0636023530007815e-06, + "loss": 0.0096, + "step": 21559 + }, + { + "epoch": 8.767791785278568, + "grad_norm": 0.08922716592957257, + "learning_rate": 5.062721242525774e-06, + "loss": 0.001, + "step": 21560 + }, + { + "epoch": 8.768198454656364, + "grad_norm": 1.6335575783658576, + "learning_rate": 5.061840182734419e-06, + "loss": 0.0397, + "step": 21561 + }, + { + "epoch": 8.76860512403416, + "grad_norm": 0.39109754612579517, + "learning_rate": 5.060959173635755e-06, + "loss": 0.0035, + "step": 21562 + }, + { + "epoch": 8.769011793411956, + "grad_norm": 
0.013989620259361506, + "learning_rate": 5.060078215238835e-06, + "loss": 0.0001, + "step": 21563 + }, + { + "epoch": 8.769418462789751, + "grad_norm": 0.8162903411105589, + "learning_rate": 5.059197307552698e-06, + "loss": 0.0076, + "step": 21564 + }, + { + "epoch": 8.769825132167547, + "grad_norm": 1.2160429549121476, + "learning_rate": 5.058316450586387e-06, + "loss": 0.0135, + "step": 21565 + }, + { + "epoch": 8.770231801545343, + "grad_norm": 0.013748800119581742, + "learning_rate": 5.057435644348945e-06, + "loss": 0.0001, + "step": 21566 + }, + { + "epoch": 8.770638470923139, + "grad_norm": 0.0429154699601875, + "learning_rate": 5.056554888849415e-06, + "loss": 0.0005, + "step": 21567 + }, + { + "epoch": 8.771045140300934, + "grad_norm": 5.596329774769388, + "learning_rate": 5.055674184096831e-06, + "loss": 0.055, + "step": 21568 + }, + { + "epoch": 8.771451809678732, + "grad_norm": 5.770188676392348, + "learning_rate": 5.054793530100246e-06, + "loss": 0.0785, + "step": 21569 + }, + { + "epoch": 8.771858479056528, + "grad_norm": 6.826657478990105, + "learning_rate": 5.053912926868693e-06, + "loss": 0.1324, + "step": 21570 + }, + { + "epoch": 8.772265148434323, + "grad_norm": 0.029799104754319385, + "learning_rate": 5.053032374411213e-06, + "loss": 0.0003, + "step": 21571 + }, + { + "epoch": 8.77267181781212, + "grad_norm": 2.3580492428511564, + "learning_rate": 5.052151872736845e-06, + "loss": 0.03, + "step": 21572 + }, + { + "epoch": 8.773078487189915, + "grad_norm": 2.005480314756116, + "learning_rate": 5.051271421854629e-06, + "loss": 0.0237, + "step": 21573 + }, + { + "epoch": 8.77348515656771, + "grad_norm": 5.174691197392103, + "learning_rate": 5.050391021773603e-06, + "loss": 0.0614, + "step": 21574 + }, + { + "epoch": 8.773891825945507, + "grad_norm": 0.8643495749059934, + "learning_rate": 5.0495106725028e-06, + "loss": 0.0095, + "step": 21575 + }, + { + "epoch": 8.774298495323302, + "grad_norm": 7.404657897459394, + "learning_rate": 5.048630374051265e-06, + "loss": 0.0478, + "step": 21576 + }, + { + "epoch": 8.774705164701098, + "grad_norm": 0.739913433216616, + "learning_rate": 5.04775012642803e-06, + "loss": 0.0107, + "step": 21577 + }, + { + "epoch": 8.775111834078894, + "grad_norm": 6.601779084913252, + "learning_rate": 5.046869929642135e-06, + "loss": 0.1003, + "step": 21578 + }, + { + "epoch": 8.77551850345669, + "grad_norm": 0.0239305328752138, + "learning_rate": 5.045989783702612e-06, + "loss": 0.0003, + "step": 21579 + }, + { + "epoch": 8.775925172834485, + "grad_norm": 2.5476562100058495, + "learning_rate": 5.045109688618497e-06, + "loss": 0.0404, + "step": 21580 + }, + { + "epoch": 8.776331842212281, + "grad_norm": 2.122079536086154, + "learning_rate": 5.044229644398821e-06, + "loss": 0.0225, + "step": 21581 + }, + { + "epoch": 8.776738511590077, + "grad_norm": 4.440000533768791, + "learning_rate": 5.043349651052625e-06, + "loss": 0.0204, + "step": 21582 + }, + { + "epoch": 8.777145180967873, + "grad_norm": 0.006984346518332397, + "learning_rate": 5.04246970858894e-06, + "loss": 0.0001, + "step": 21583 + }, + { + "epoch": 8.777551850345668, + "grad_norm": 1.8528083483389217, + "learning_rate": 5.041589817016798e-06, + "loss": 0.0224, + "step": 21584 + }, + { + "epoch": 8.777958519723464, + "grad_norm": 0.02196904522568188, + "learning_rate": 5.0407099763452325e-06, + "loss": 0.0002, + "step": 21585 + }, + { + "epoch": 8.77836518910126, + "grad_norm": 2.07629906370131, + "learning_rate": 5.039830186583276e-06, + "loss": 0.0238, + "step": 21586 + }, + { + "epoch": 
8.778771858479056, + "grad_norm": 0.4479782242540706, + "learning_rate": 5.0389504477399585e-06, + "loss": 0.0059, + "step": 21587 + }, + { + "epoch": 8.779178527856853, + "grad_norm": 0.058374182371191095, + "learning_rate": 5.0380707598243075e-06, + "loss": 0.0006, + "step": 21588 + }, + { + "epoch": 8.779585197234649, + "grad_norm": 0.012639646284871449, + "learning_rate": 5.03719112284536e-06, + "loss": 0.0002, + "step": 21589 + }, + { + "epoch": 8.779991866612445, + "grad_norm": 15.50648096167056, + "learning_rate": 5.036311536812145e-06, + "loss": 0.1367, + "step": 21590 + }, + { + "epoch": 8.78039853599024, + "grad_norm": 0.2782768808402991, + "learning_rate": 5.0354320017336885e-06, + "loss": 0.0027, + "step": 21591 + }, + { + "epoch": 8.780805205368036, + "grad_norm": 0.004469817197754813, + "learning_rate": 5.034552517619023e-06, + "loss": 0.0001, + "step": 21592 + }, + { + "epoch": 8.781211874745832, + "grad_norm": 0.014366065942601683, + "learning_rate": 5.033673084477175e-06, + "loss": 0.0002, + "step": 21593 + }, + { + "epoch": 8.781618544123628, + "grad_norm": 0.28629017283966607, + "learning_rate": 5.032793702317167e-06, + "loss": 0.003, + "step": 21594 + }, + { + "epoch": 8.782025213501424, + "grad_norm": 0.4078395150717397, + "learning_rate": 5.0319143711480366e-06, + "loss": 0.0031, + "step": 21595 + }, + { + "epoch": 8.78243188287922, + "grad_norm": 0.003961418604521402, + "learning_rate": 5.031035090978806e-06, + "loss": 0.0001, + "step": 21596 + }, + { + "epoch": 8.782838552257015, + "grad_norm": 0.017962834737383155, + "learning_rate": 5.030155861818499e-06, + "loss": 0.0003, + "step": 21597 + }, + { + "epoch": 8.783245221634811, + "grad_norm": 0.0034836638603372434, + "learning_rate": 5.029276683676145e-06, + "loss": 0.0, + "step": 21598 + }, + { + "epoch": 8.783651891012607, + "grad_norm": 1.4454906944182133, + "learning_rate": 5.028397556560767e-06, + "loss": 0.04, + "step": 21599 + }, + { + "epoch": 8.784058560390402, + "grad_norm": 0.014752205950215085, + "learning_rate": 5.027518480481391e-06, + "loss": 0.0001, + "step": 21600 + }, + { + "epoch": 8.784465229768198, + "grad_norm": 0.09700293101924536, + "learning_rate": 5.026639455447035e-06, + "loss": 0.001, + "step": 21601 + }, + { + "epoch": 8.784871899145994, + "grad_norm": 2.7217971540642143, + "learning_rate": 5.0257604814667324e-06, + "loss": 0.0128, + "step": 21602 + }, + { + "epoch": 8.78527856852379, + "grad_norm": 3.4319288316407, + "learning_rate": 5.024881558549503e-06, + "loss": 0.0096, + "step": 21603 + }, + { + "epoch": 8.785685237901586, + "grad_norm": 3.5688821368444392, + "learning_rate": 5.0240026867043675e-06, + "loss": 0.0198, + "step": 21604 + }, + { + "epoch": 8.786091907279381, + "grad_norm": 0.036162982786662294, + "learning_rate": 5.023123865940349e-06, + "loss": 0.0002, + "step": 21605 + }, + { + "epoch": 8.786498576657177, + "grad_norm": 10.293719342495303, + "learning_rate": 5.022245096266468e-06, + "loss": 0.164, + "step": 21606 + }, + { + "epoch": 8.786905246034973, + "grad_norm": 0.0011811369046205496, + "learning_rate": 5.021366377691743e-06, + "loss": 0.0, + "step": 21607 + }, + { + "epoch": 8.787311915412769, + "grad_norm": 0.02517618041217611, + "learning_rate": 5.0204877102252025e-06, + "loss": 0.0002, + "step": 21608 + }, + { + "epoch": 8.787718584790564, + "grad_norm": 0.004524515291587518, + "learning_rate": 5.019609093875861e-06, + "loss": 0.0001, + "step": 21609 + }, + { + "epoch": 8.788125254168362, + "grad_norm": 7.949962894653753, + "learning_rate": 
5.0187305286527385e-06, + "loss": 0.1384, + "step": 21610 + }, + { + "epoch": 8.788531923546158, + "grad_norm": 1.1378287415172303, + "learning_rate": 5.0178520145648546e-06, + "loss": 0.0092, + "step": 21611 + }, + { + "epoch": 8.788938592923953, + "grad_norm": 7.1925470570381345, + "learning_rate": 5.016973551621227e-06, + "loss": 0.0967, + "step": 21612 + }, + { + "epoch": 8.78934526230175, + "grad_norm": 0.27341707234434814, + "learning_rate": 5.0160951398308745e-06, + "loss": 0.0029, + "step": 21613 + }, + { + "epoch": 8.789751931679545, + "grad_norm": 1.4439198249979925, + "learning_rate": 5.015216779202809e-06, + "loss": 0.0136, + "step": 21614 + }, + { + "epoch": 8.79015860105734, + "grad_norm": 0.6749020417533531, + "learning_rate": 5.0143384697460566e-06, + "loss": 0.0069, + "step": 21615 + }, + { + "epoch": 8.790565270435136, + "grad_norm": 0.014036116459047808, + "learning_rate": 5.01346021146963e-06, + "loss": 0.0002, + "step": 21616 + }, + { + "epoch": 8.790971939812932, + "grad_norm": 3.6917541176004383, + "learning_rate": 5.0125820043825426e-06, + "loss": 0.0793, + "step": 21617 + }, + { + "epoch": 8.791378609190728, + "grad_norm": 0.07789127537913837, + "learning_rate": 5.011703848493809e-06, + "loss": 0.0009, + "step": 21618 + }, + { + "epoch": 8.791785278568524, + "grad_norm": 0.0006584293661768392, + "learning_rate": 5.010825743812449e-06, + "loss": 0.0, + "step": 21619 + }, + { + "epoch": 8.79219194794632, + "grad_norm": 0.07242759417463772, + "learning_rate": 5.00994769034747e-06, + "loss": 0.0008, + "step": 21620 + }, + { + "epoch": 8.792598617324115, + "grad_norm": 2.5105836691941623, + "learning_rate": 5.009069688107892e-06, + "loss": 0.0277, + "step": 21621 + }, + { + "epoch": 8.793005286701911, + "grad_norm": 0.020681541359367814, + "learning_rate": 5.008191737102725e-06, + "loss": 0.0003, + "step": 21622 + }, + { + "epoch": 8.793411956079707, + "grad_norm": 0.8465350747630973, + "learning_rate": 5.007313837340984e-06, + "loss": 0.0061, + "step": 21623 + }, + { + "epoch": 8.793818625457503, + "grad_norm": 0.10653478482244037, + "learning_rate": 5.006435988831677e-06, + "loss": 0.0009, + "step": 21624 + }, + { + "epoch": 8.794225294835298, + "grad_norm": 0.0006232151106825262, + "learning_rate": 5.005558191583819e-06, + "loss": 0.0, + "step": 21625 + }, + { + "epoch": 8.794631964213094, + "grad_norm": 5.8370790459846225, + "learning_rate": 5.00468044560642e-06, + "loss": 0.0785, + "step": 21626 + }, + { + "epoch": 8.79503863359089, + "grad_norm": 0.14538692350634805, + "learning_rate": 5.003802750908486e-06, + "loss": 0.0011, + "step": 21627 + }, + { + "epoch": 8.795445302968686, + "grad_norm": 0.024486048794147713, + "learning_rate": 5.002925107499035e-06, + "loss": 0.0002, + "step": 21628 + }, + { + "epoch": 8.795851972346483, + "grad_norm": 0.6176174061787005, + "learning_rate": 5.002047515387074e-06, + "loss": 0.005, + "step": 21629 + }, + { + "epoch": 8.796258641724279, + "grad_norm": 0.07233991205070961, + "learning_rate": 5.00116997458161e-06, + "loss": 0.0008, + "step": 21630 + }, + { + "epoch": 8.796665311102075, + "grad_norm": 0.9206697525410371, + "learning_rate": 5.0002924850916515e-06, + "loss": 0.0082, + "step": 21631 + }, + { + "epoch": 8.79707198047987, + "grad_norm": 0.025043681250690702, + "learning_rate": 4.999415046926207e-06, + "loss": 0.0001, + "step": 21632 + }, + { + "epoch": 8.797478649857666, + "grad_norm": 9.823962388381442, + "learning_rate": 4.99853766009428e-06, + "loss": 0.1529, + "step": 21633 + }, + { + "epoch": 8.797885319235462, 
+ "grad_norm": 0.0036822530175077716, + "learning_rate": 4.997660324604885e-06, + "loss": 0.0, + "step": 21634 + }, + { + "epoch": 8.798291988613258, + "grad_norm": 0.12919179084811389, + "learning_rate": 4.996783040467025e-06, + "loss": 0.0015, + "step": 21635 + }, + { + "epoch": 8.798698657991054, + "grad_norm": 2.781645777655641, + "learning_rate": 4.9959058076897045e-06, + "loss": 0.025, + "step": 21636 + }, + { + "epoch": 8.79910532736885, + "grad_norm": 2.2092163223345773, + "learning_rate": 4.995028626281929e-06, + "loss": 0.104, + "step": 21637 + }, + { + "epoch": 8.799511996746645, + "grad_norm": 0.0006691467214449292, + "learning_rate": 4.994151496252705e-06, + "loss": 0.0, + "step": 21638 + }, + { + "epoch": 8.79991866612444, + "grad_norm": 0.5275109799123731, + "learning_rate": 4.993274417611035e-06, + "loss": 0.0035, + "step": 21639 + }, + { + "epoch": 8.800325335502237, + "grad_norm": 6.003205019232876, + "learning_rate": 4.992397390365918e-06, + "loss": 0.1094, + "step": 21640 + }, + { + "epoch": 8.800732004880032, + "grad_norm": 0.37768194332935845, + "learning_rate": 4.991520414526366e-06, + "loss": 0.0023, + "step": 21641 + }, + { + "epoch": 8.801138674257828, + "grad_norm": 9.369586415089868, + "learning_rate": 4.990643490101379e-06, + "loss": 0.2517, + "step": 21642 + }, + { + "epoch": 8.801545343635624, + "grad_norm": 0.0018158528414512001, + "learning_rate": 4.989766617099956e-06, + "loss": 0.0, + "step": 21643 + }, + { + "epoch": 8.80195201301342, + "grad_norm": 0.0002641836913583348, + "learning_rate": 4.988889795531101e-06, + "loss": 0.0, + "step": 21644 + }, + { + "epoch": 8.802358682391215, + "grad_norm": 10.929454621191969, + "learning_rate": 4.988013025403815e-06, + "loss": 0.5016, + "step": 21645 + }, + { + "epoch": 8.802765351769011, + "grad_norm": 0.021160157052170774, + "learning_rate": 4.9871363067270925e-06, + "loss": 0.0003, + "step": 21646 + }, + { + "epoch": 8.803172021146807, + "grad_norm": 0.0008249625681155737, + "learning_rate": 4.986259639509944e-06, + "loss": 0.0, + "step": 21647 + }, + { + "epoch": 8.803578690524603, + "grad_norm": 0.15366437454977636, + "learning_rate": 4.985383023761368e-06, + "loss": 0.0024, + "step": 21648 + }, + { + "epoch": 8.803985359902398, + "grad_norm": 4.894532591461096, + "learning_rate": 4.9845064594903534e-06, + "loss": 0.0416, + "step": 21649 + }, + { + "epoch": 8.804392029280194, + "grad_norm": 0.007463079780645073, + "learning_rate": 4.9836299467059046e-06, + "loss": 0.0001, + "step": 21650 + }, + { + "epoch": 8.804798698657992, + "grad_norm": 6.816741467297108, + "learning_rate": 4.98275348541702e-06, + "loss": 0.1442, + "step": 21651 + }, + { + "epoch": 8.805205368035788, + "grad_norm": 0.06004093158463951, + "learning_rate": 4.981877075632692e-06, + "loss": 0.0008, + "step": 21652 + }, + { + "epoch": 8.805612037413583, + "grad_norm": 0.11675798945304951, + "learning_rate": 4.981000717361925e-06, + "loss": 0.0008, + "step": 21653 + }, + { + "epoch": 8.806018706791379, + "grad_norm": 0.0038716469823664944, + "learning_rate": 4.980124410613711e-06, + "loss": 0.0, + "step": 21654 + }, + { + "epoch": 8.806425376169175, + "grad_norm": 2.943526293371349, + "learning_rate": 4.9792481553970475e-06, + "loss": 0.0406, + "step": 21655 + }, + { + "epoch": 8.80683204554697, + "grad_norm": 8.167128941404654, + "learning_rate": 4.978371951720928e-06, + "loss": 0.1046, + "step": 21656 + }, + { + "epoch": 8.807238714924766, + "grad_norm": 0.042973289350564055, + "learning_rate": 4.977495799594348e-06, + "loss": 0.0003, + 
"step": 21657 + }, + { + "epoch": 8.807645384302562, + "grad_norm": 7.322718016161375, + "learning_rate": 4.976619699026296e-06, + "loss": 0.0884, + "step": 21658 + }, + { + "epoch": 8.808052053680358, + "grad_norm": 5.463797350480634, + "learning_rate": 4.975743650025776e-06, + "loss": 0.0477, + "step": 21659 + }, + { + "epoch": 8.808458723058154, + "grad_norm": 0.029038755323666187, + "learning_rate": 4.974867652601777e-06, + "loss": 0.0003, + "step": 21660 + }, + { + "epoch": 8.80886539243595, + "grad_norm": 0.13764732702000346, + "learning_rate": 4.973991706763289e-06, + "loss": 0.0011, + "step": 21661 + }, + { + "epoch": 8.809272061813745, + "grad_norm": 9.329095086736677, + "learning_rate": 4.973115812519307e-06, + "loss": 0.1223, + "step": 21662 + }, + { + "epoch": 8.809678731191541, + "grad_norm": 0.18329825226991495, + "learning_rate": 4.9722399698788205e-06, + "loss": 0.0019, + "step": 21663 + }, + { + "epoch": 8.810085400569337, + "grad_norm": 0.09332758552411212, + "learning_rate": 4.9713641788508215e-06, + "loss": 0.0009, + "step": 21664 + }, + { + "epoch": 8.810492069947133, + "grad_norm": 0.36280149692224883, + "learning_rate": 4.970488439444296e-06, + "loss": 0.0023, + "step": 21665 + }, + { + "epoch": 8.810898739324928, + "grad_norm": 0.5541257777843748, + "learning_rate": 4.969612751668242e-06, + "loss": 0.0048, + "step": 21666 + }, + { + "epoch": 8.811305408702724, + "grad_norm": 0.3687465473153327, + "learning_rate": 4.968737115531644e-06, + "loss": 0.0024, + "step": 21667 + }, + { + "epoch": 8.81171207808052, + "grad_norm": 1.0195936689964022, + "learning_rate": 4.967861531043493e-06, + "loss": 0.0106, + "step": 21668 + }, + { + "epoch": 8.812118747458316, + "grad_norm": 1.5282583274398065, + "learning_rate": 4.966985998212776e-06, + "loss": 0.0112, + "step": 21669 + }, + { + "epoch": 8.812525416836113, + "grad_norm": 0.005208986565094996, + "learning_rate": 4.966110517048479e-06, + "loss": 0.0001, + "step": 21670 + }, + { + "epoch": 8.812932086213909, + "grad_norm": 0.24812962439515363, + "learning_rate": 4.965235087559589e-06, + "loss": 0.0017, + "step": 21671 + }, + { + "epoch": 8.813338755591705, + "grad_norm": 1.9536941754579176, + "learning_rate": 4.964359709755097e-06, + "loss": 0.0217, + "step": 21672 + }, + { + "epoch": 8.8137454249695, + "grad_norm": 0.7928647249971256, + "learning_rate": 4.963484383643988e-06, + "loss": 0.0068, + "step": 21673 + }, + { + "epoch": 8.814152094347296, + "grad_norm": 0.05558999999048874, + "learning_rate": 4.962609109235248e-06, + "loss": 0.0005, + "step": 21674 + }, + { + "epoch": 8.814558763725092, + "grad_norm": 0.027109632489463725, + "learning_rate": 4.961733886537858e-06, + "loss": 0.0002, + "step": 21675 + }, + { + "epoch": 8.814965433102888, + "grad_norm": 0.021922477623001032, + "learning_rate": 4.960858715560808e-06, + "loss": 0.0002, + "step": 21676 + }, + { + "epoch": 8.815372102480683, + "grad_norm": 0.004011419387707703, + "learning_rate": 4.959983596313077e-06, + "loss": 0.0, + "step": 21677 + }, + { + "epoch": 8.81577877185848, + "grad_norm": 8.81586203418736, + "learning_rate": 4.959108528803649e-06, + "loss": 0.2294, + "step": 21678 + }, + { + "epoch": 8.816185441236275, + "grad_norm": 0.005017458590657996, + "learning_rate": 4.9582335130415115e-06, + "loss": 0.0001, + "step": 21679 + }, + { + "epoch": 8.81659211061407, + "grad_norm": 0.6011349895130839, + "learning_rate": 4.957358549035645e-06, + "loss": 0.0046, + "step": 21680 + }, + { + "epoch": 8.816998779991867, + "grad_norm": 2.308665366088663, + 
"learning_rate": 4.956483636795032e-06, + "loss": 0.0617, + "step": 21681 + }, + { + "epoch": 8.817405449369662, + "grad_norm": 12.898961674617599, + "learning_rate": 4.955608776328651e-06, + "loss": 0.3747, + "step": 21682 + }, + { + "epoch": 8.817812118747458, + "grad_norm": 0.0058887288596789844, + "learning_rate": 4.954733967645485e-06, + "loss": 0.0001, + "step": 21683 + }, + { + "epoch": 8.818218788125254, + "grad_norm": 0.0636457321039617, + "learning_rate": 4.953859210754512e-06, + "loss": 0.0006, + "step": 21684 + }, + { + "epoch": 8.81862545750305, + "grad_norm": 9.203878661676876, + "learning_rate": 4.952984505664715e-06, + "loss": 0.2011, + "step": 21685 + }, + { + "epoch": 8.819032126880845, + "grad_norm": 0.11098085145355535, + "learning_rate": 4.952109852385074e-06, + "loss": 0.0004, + "step": 21686 + }, + { + "epoch": 8.819438796258641, + "grad_norm": 11.81003545035788, + "learning_rate": 4.951235250924565e-06, + "loss": 0.2019, + "step": 21687 + }, + { + "epoch": 8.819845465636437, + "grad_norm": 8.345281795648066, + "learning_rate": 4.9503607012921685e-06, + "loss": 0.084, + "step": 21688 + }, + { + "epoch": 8.820252135014233, + "grad_norm": 1.7653054620025153, + "learning_rate": 4.949486203496859e-06, + "loss": 0.0171, + "step": 21689 + }, + { + "epoch": 8.820658804392028, + "grad_norm": 0.023271731006388796, + "learning_rate": 4.948611757547617e-06, + "loss": 0.0003, + "step": 21690 + }, + { + "epoch": 8.821065473769824, + "grad_norm": 1.7990911799583342, + "learning_rate": 4.9477373634534135e-06, + "loss": 0.0253, + "step": 21691 + }, + { + "epoch": 8.821472143147622, + "grad_norm": 3.6012081588793112, + "learning_rate": 4.946863021223233e-06, + "loss": 0.0732, + "step": 21692 + }, + { + "epoch": 8.821878812525418, + "grad_norm": 0.005253163425794245, + "learning_rate": 4.945988730866045e-06, + "loss": 0.0001, + "step": 21693 + }, + { + "epoch": 8.822285481903213, + "grad_norm": 0.6151197938904123, + "learning_rate": 4.945114492390828e-06, + "loss": 0.0078, + "step": 21694 + }, + { + "epoch": 8.822692151281009, + "grad_norm": 0.16601216557173124, + "learning_rate": 4.944240305806553e-06, + "loss": 0.0021, + "step": 21695 + }, + { + "epoch": 8.823098820658805, + "grad_norm": 0.04054590900246668, + "learning_rate": 4.943366171122198e-06, + "loss": 0.0003, + "step": 21696 + }, + { + "epoch": 8.8235054900366, + "grad_norm": 4.029745158236539, + "learning_rate": 4.9424920883467285e-06, + "loss": 0.0774, + "step": 21697 + }, + { + "epoch": 8.823912159414396, + "grad_norm": 0.004761613496063091, + "learning_rate": 4.9416180574891285e-06, + "loss": 0.0001, + "step": 21698 + }, + { + "epoch": 8.824318828792192, + "grad_norm": 0.00013099552221630933, + "learning_rate": 4.940744078558364e-06, + "loss": 0.0, + "step": 21699 + }, + { + "epoch": 8.824725498169988, + "grad_norm": 11.845759383356548, + "learning_rate": 4.939870151563409e-06, + "loss": 0.1732, + "step": 21700 + }, + { + "epoch": 8.825132167547784, + "grad_norm": 1.9602182665566639, + "learning_rate": 4.938996276513232e-06, + "loss": 0.0326, + "step": 21701 + }, + { + "epoch": 8.82553883692558, + "grad_norm": 11.168452304113107, + "learning_rate": 4.938122453416806e-06, + "loss": 0.1004, + "step": 21702 + }, + { + "epoch": 8.825945506303375, + "grad_norm": 0.016404614664623958, + "learning_rate": 4.9372486822831e-06, + "loss": 0.0002, + "step": 21703 + }, + { + "epoch": 8.826352175681171, + "grad_norm": 4.7312494274804235, + "learning_rate": 4.9363749631210814e-06, + "loss": 0.0464, + "step": 21704 + }, + { + "epoch": 
8.826758845058967, + "grad_norm": 0.14035093731068504, + "learning_rate": 4.935501295939725e-06, + "loss": 0.0017, + "step": 21705 + }, + { + "epoch": 8.827165514436762, + "grad_norm": 10.878020931634843, + "learning_rate": 4.934627680747998e-06, + "loss": 0.3254, + "step": 21706 + }, + { + "epoch": 8.827572183814558, + "grad_norm": 4.304001343712305, + "learning_rate": 4.933754117554866e-06, + "loss": 0.0358, + "step": 21707 + }, + { + "epoch": 8.827978853192354, + "grad_norm": 0.022491577805128626, + "learning_rate": 4.932880606369299e-06, + "loss": 0.0002, + "step": 21708 + }, + { + "epoch": 8.82838552257015, + "grad_norm": 0.0009479748426897918, + "learning_rate": 4.9320071472002625e-06, + "loss": 0.0, + "step": 21709 + }, + { + "epoch": 8.828792191947946, + "grad_norm": 0.055539127091595565, + "learning_rate": 4.931133740056719e-06, + "loss": 0.0008, + "step": 21710 + }, + { + "epoch": 8.829198861325743, + "grad_norm": 6.891983627868383, + "learning_rate": 4.930260384947643e-06, + "loss": 0.0449, + "step": 21711 + }, + { + "epoch": 8.829605530703539, + "grad_norm": 7.703553185449482, + "learning_rate": 4.929387081881996e-06, + "loss": 0.1918, + "step": 21712 + }, + { + "epoch": 8.830012200081335, + "grad_norm": 4.207526909159999, + "learning_rate": 4.928513830868742e-06, + "loss": 0.0632, + "step": 21713 + }, + { + "epoch": 8.83041886945913, + "grad_norm": 0.4748030795259063, + "learning_rate": 4.927640631916846e-06, + "loss": 0.0042, + "step": 21714 + }, + { + "epoch": 8.830825538836926, + "grad_norm": 0.04791216265426266, + "learning_rate": 4.926767485035271e-06, + "loss": 0.0004, + "step": 21715 + }, + { + "epoch": 8.831232208214722, + "grad_norm": 0.024238885748722027, + "learning_rate": 4.925894390232982e-06, + "loss": 0.0002, + "step": 21716 + }, + { + "epoch": 8.831638877592518, + "grad_norm": 4.919426376488931, + "learning_rate": 4.925021347518937e-06, + "loss": 0.0611, + "step": 21717 + }, + { + "epoch": 8.832045546970313, + "grad_norm": 3.064691652175684, + "learning_rate": 4.924148356902107e-06, + "loss": 0.0718, + "step": 21718 + }, + { + "epoch": 8.83245221634811, + "grad_norm": 0.004268972478430047, + "learning_rate": 4.9232754183914465e-06, + "loss": 0.0001, + "step": 21719 + }, + { + "epoch": 8.832858885725905, + "grad_norm": 0.6367940595231335, + "learning_rate": 4.92240253199592e-06, + "loss": 0.0062, + "step": 21720 + }, + { + "epoch": 8.8332655551037, + "grad_norm": 0.0696118468770099, + "learning_rate": 4.921529697724487e-06, + "loss": 0.0006, + "step": 21721 + }, + { + "epoch": 8.833672224481496, + "grad_norm": 0.5189808081211258, + "learning_rate": 4.920656915586108e-06, + "loss": 0.0058, + "step": 21722 + }, + { + "epoch": 8.834078893859292, + "grad_norm": 0.2779215710456998, + "learning_rate": 4.919784185589738e-06, + "loss": 0.0029, + "step": 21723 + }, + { + "epoch": 8.834485563237088, + "grad_norm": 0.4326275258222902, + "learning_rate": 4.918911507744343e-06, + "loss": 0.0045, + "step": 21724 + }, + { + "epoch": 8.834892232614884, + "grad_norm": 5.962485200610677, + "learning_rate": 4.91803888205888e-06, + "loss": 0.1005, + "step": 21725 + }, + { + "epoch": 8.83529890199268, + "grad_norm": 0.030088069674880726, + "learning_rate": 4.917166308542305e-06, + "loss": 0.0003, + "step": 21726 + }, + { + "epoch": 8.835705571370475, + "grad_norm": 3.6312773834811187, + "learning_rate": 4.916293787203575e-06, + "loss": 0.0641, + "step": 21727 + }, + { + "epoch": 8.836112240748271, + "grad_norm": 0.08693598940626679, + "learning_rate": 4.9154213180516495e-06, + 
"loss": 0.0011, + "step": 21728 + }, + { + "epoch": 8.836518910126067, + "grad_norm": 7.119971548392612, + "learning_rate": 4.914548901095483e-06, + "loss": 0.0885, + "step": 21729 + }, + { + "epoch": 8.836925579503863, + "grad_norm": 0.09762141874551114, + "learning_rate": 4.913676536344026e-06, + "loss": 0.0014, + "step": 21730 + }, + { + "epoch": 8.837332248881658, + "grad_norm": 0.060654951583217286, + "learning_rate": 4.9128042238062445e-06, + "loss": 0.0005, + "step": 21731 + }, + { + "epoch": 8.837738918259454, + "grad_norm": 0.005181627427122162, + "learning_rate": 4.911931963491088e-06, + "loss": 0.0001, + "step": 21732 + }, + { + "epoch": 8.838145587637252, + "grad_norm": 0.08123967578869568, + "learning_rate": 4.91105975540751e-06, + "loss": 0.0006, + "step": 21733 + }, + { + "epoch": 8.838552257015047, + "grad_norm": 0.006045594547417904, + "learning_rate": 4.910187599564466e-06, + "loss": 0.0001, + "step": 21734 + }, + { + "epoch": 8.838958926392843, + "grad_norm": 6.4694830221726285, + "learning_rate": 4.909315495970907e-06, + "loss": 0.0582, + "step": 21735 + }, + { + "epoch": 8.839365595770639, + "grad_norm": 1.5321472426580443, + "learning_rate": 4.908443444635783e-06, + "loss": 0.0172, + "step": 21736 + }, + { + "epoch": 8.839772265148435, + "grad_norm": 0.00105883355918658, + "learning_rate": 4.907571445568053e-06, + "loss": 0.0, + "step": 21737 + }, + { + "epoch": 8.84017893452623, + "grad_norm": 0.1363808830677111, + "learning_rate": 4.906699498776666e-06, + "loss": 0.0014, + "step": 21738 + }, + { + "epoch": 8.840585603904026, + "grad_norm": 0.22901447114404774, + "learning_rate": 4.905827604270571e-06, + "loss": 0.0022, + "step": 21739 + }, + { + "epoch": 8.840992273281822, + "grad_norm": 1.0669243826389019, + "learning_rate": 4.90495576205872e-06, + "loss": 0.0073, + "step": 21740 + }, + { + "epoch": 8.841398942659618, + "grad_norm": 0.015428557307615779, + "learning_rate": 4.904083972150062e-06, + "loss": 0.0001, + "step": 21741 + }, + { + "epoch": 8.841805612037414, + "grad_norm": 0.1373125215296845, + "learning_rate": 4.903212234553547e-06, + "loss": 0.0017, + "step": 21742 + }, + { + "epoch": 8.84221228141521, + "grad_norm": 4.545416811168707, + "learning_rate": 4.902340549278119e-06, + "loss": 0.0444, + "step": 21743 + }, + { + "epoch": 8.842618950793005, + "grad_norm": 7.250941429695275, + "learning_rate": 4.901468916332736e-06, + "loss": 0.3107, + "step": 21744 + }, + { + "epoch": 8.8430256201708, + "grad_norm": 0.0009752107756877576, + "learning_rate": 4.90059733572634e-06, + "loss": 0.0, + "step": 21745 + }, + { + "epoch": 8.843432289548597, + "grad_norm": 0.15422993946073382, + "learning_rate": 4.899725807467879e-06, + "loss": 0.0016, + "step": 21746 + }, + { + "epoch": 8.843838958926392, + "grad_norm": 0.74669935071518, + "learning_rate": 4.898854331566299e-06, + "loss": 0.0084, + "step": 21747 + }, + { + "epoch": 8.844245628304188, + "grad_norm": 0.36705649103263116, + "learning_rate": 4.897982908030547e-06, + "loss": 0.0053, + "step": 21748 + }, + { + "epoch": 8.844652297681984, + "grad_norm": 0.3813539452131914, + "learning_rate": 4.897111536869569e-06, + "loss": 0.0039, + "step": 21749 + }, + { + "epoch": 8.84505896705978, + "grad_norm": 0.34088854366987065, + "learning_rate": 4.896240218092309e-06, + "loss": 0.0037, + "step": 21750 + }, + { + "epoch": 8.845465636437575, + "grad_norm": 0.3772492573531632, + "learning_rate": 4.895368951707711e-06, + "loss": 0.004, + "step": 21751 + }, + { + "epoch": 8.845872305815373, + "grad_norm": 1.4968187914193345, 
+ "learning_rate": 4.89449773772472e-06, + "loss": 0.0154, + "step": 21752 + }, + { + "epoch": 8.846278975193169, + "grad_norm": 0.0003606156664498914, + "learning_rate": 4.8936265761522795e-06, + "loss": 0.0, + "step": 21753 + }, + { + "epoch": 8.846685644570965, + "grad_norm": 1.9903524495856482, + "learning_rate": 4.892755466999331e-06, + "loss": 0.018, + "step": 21754 + }, + { + "epoch": 8.84709231394876, + "grad_norm": 3.1157419762776852, + "learning_rate": 4.891884410274816e-06, + "loss": 0.0598, + "step": 21755 + }, + { + "epoch": 8.847498983326556, + "grad_norm": 0.01659938783577229, + "learning_rate": 4.8910134059876824e-06, + "loss": 0.0001, + "step": 21756 + }, + { + "epoch": 8.847905652704352, + "grad_norm": 0.047089483033031086, + "learning_rate": 4.890142454146865e-06, + "loss": 0.0005, + "step": 21757 + }, + { + "epoch": 8.848312322082148, + "grad_norm": 0.0009052323020731563, + "learning_rate": 4.88927155476131e-06, + "loss": 0.0, + "step": 21758 + }, + { + "epoch": 8.848718991459943, + "grad_norm": 0.1874399643955477, + "learning_rate": 4.888400707839953e-06, + "loss": 0.0019, + "step": 21759 + }, + { + "epoch": 8.849125660837739, + "grad_norm": 1.4165178716665414, + "learning_rate": 4.887529913391736e-06, + "loss": 0.0244, + "step": 21760 + }, + { + "epoch": 8.849532330215535, + "grad_norm": 0.002874876624389365, + "learning_rate": 4.886659171425593e-06, + "loss": 0.0, + "step": 21761 + }, + { + "epoch": 8.84993899959333, + "grad_norm": 6.361801559685056, + "learning_rate": 4.885788481950472e-06, + "loss": 0.0491, + "step": 21762 + }, + { + "epoch": 8.850345668971126, + "grad_norm": 1.8851060436042764, + "learning_rate": 4.8849178449753055e-06, + "loss": 0.0193, + "step": 21763 + }, + { + "epoch": 8.850752338348922, + "grad_norm": 1.5780914488674722, + "learning_rate": 4.884047260509031e-06, + "loss": 0.0042, + "step": 21764 + }, + { + "epoch": 8.851159007726718, + "grad_norm": 0.8648264024837403, + "learning_rate": 4.883176728560587e-06, + "loss": 0.0117, + "step": 21765 + }, + { + "epoch": 8.851565677104514, + "grad_norm": 2.0154168947719366, + "learning_rate": 4.882306249138909e-06, + "loss": 0.0171, + "step": 21766 + }, + { + "epoch": 8.85197234648231, + "grad_norm": 0.0038607149219092585, + "learning_rate": 4.8814358222529335e-06, + "loss": 0.0, + "step": 21767 + }, + { + "epoch": 8.852379015860105, + "grad_norm": 4.632687112143079, + "learning_rate": 4.8805654479115904e-06, + "loss": 0.0327, + "step": 21768 + }, + { + "epoch": 8.852785685237901, + "grad_norm": 14.979633184652634, + "learning_rate": 4.879695126123825e-06, + "loss": 0.165, + "step": 21769 + }, + { + "epoch": 8.853192354615697, + "grad_norm": 2.2706171610352626, + "learning_rate": 4.878824856898565e-06, + "loss": 0.0217, + "step": 21770 + }, + { + "epoch": 8.853599023993493, + "grad_norm": 1.0599382447836463, + "learning_rate": 4.8779546402447454e-06, + "loss": 0.0048, + "step": 21771 + }, + { + "epoch": 8.854005693371288, + "grad_norm": 0.2554504218923055, + "learning_rate": 4.877084476171299e-06, + "loss": 0.0021, + "step": 21772 + }, + { + "epoch": 8.854412362749084, + "grad_norm": 1.8825611235690156, + "learning_rate": 4.87621436468716e-06, + "loss": 0.017, + "step": 21773 + }, + { + "epoch": 8.854819032126882, + "grad_norm": 0.0015870628741817873, + "learning_rate": 4.875344305801254e-06, + "loss": 0.0, + "step": 21774 + }, + { + "epoch": 8.855225701504677, + "grad_norm": 2.6943918708340777, + "learning_rate": 4.8744742995225225e-06, + "loss": 0.0327, + "step": 21775 + }, + { + "epoch": 
8.855632370882473, + "grad_norm": 5.474674602804513, + "learning_rate": 4.873604345859891e-06, + "loss": 0.0565, + "step": 21776 + }, + { + "epoch": 8.856039040260269, + "grad_norm": 1.2426820882485428, + "learning_rate": 4.872734444822293e-06, + "loss": 0.0186, + "step": 21777 + }, + { + "epoch": 8.856445709638065, + "grad_norm": 7.049722004416272, + "learning_rate": 4.871864596418654e-06, + "loss": 0.1365, + "step": 21778 + }, + { + "epoch": 8.85685237901586, + "grad_norm": 0.1541757229349042, + "learning_rate": 4.870994800657907e-06, + "loss": 0.0022, + "step": 21779 + }, + { + "epoch": 8.857259048393656, + "grad_norm": 0.3922493506525999, + "learning_rate": 4.87012505754898e-06, + "loss": 0.0048, + "step": 21780 + }, + { + "epoch": 8.857665717771452, + "grad_norm": 0.14362594305160878, + "learning_rate": 4.869255367100797e-06, + "loss": 0.0013, + "step": 21781 + }, + { + "epoch": 8.858072387149248, + "grad_norm": 0.8104255947471752, + "learning_rate": 4.8683857293222936e-06, + "loss": 0.0055, + "step": 21782 + }, + { + "epoch": 8.858479056527043, + "grad_norm": 0.7912278260126172, + "learning_rate": 4.867516144222392e-06, + "loss": 0.0072, + "step": 21783 + }, + { + "epoch": 8.85888572590484, + "grad_norm": 0.04293234124557715, + "learning_rate": 4.8666466118100234e-06, + "loss": 0.0005, + "step": 21784 + }, + { + "epoch": 8.859292395282635, + "grad_norm": 1.3561033035901462, + "learning_rate": 4.865777132094109e-06, + "loss": 0.0122, + "step": 21785 + }, + { + "epoch": 8.85969906466043, + "grad_norm": 4.537661711786538, + "learning_rate": 4.864907705083577e-06, + "loss": 0.075, + "step": 21786 + }, + { + "epoch": 8.860105734038227, + "grad_norm": 0.11765109349948043, + "learning_rate": 4.864038330787347e-06, + "loss": 0.0009, + "step": 21787 + }, + { + "epoch": 8.860512403416022, + "grad_norm": 0.2229857933102257, + "learning_rate": 4.8631690092143535e-06, + "loss": 0.0022, + "step": 21788 + }, + { + "epoch": 8.860919072793818, + "grad_norm": 0.0974992196618707, + "learning_rate": 4.862299740373514e-06, + "loss": 0.0014, + "step": 21789 + }, + { + "epoch": 8.861325742171614, + "grad_norm": 1.148548826475934, + "learning_rate": 4.861430524273755e-06, + "loss": 0.0108, + "step": 21790 + }, + { + "epoch": 8.86173241154941, + "grad_norm": 0.8022012619684319, + "learning_rate": 4.860561360923996e-06, + "loss": 0.0104, + "step": 21791 + }, + { + "epoch": 8.862139080927205, + "grad_norm": 0.0002654087054018961, + "learning_rate": 4.859692250333163e-06, + "loss": 0.0, + "step": 21792 + }, + { + "epoch": 8.862545750305003, + "grad_norm": 0.08965903235331725, + "learning_rate": 4.858823192510176e-06, + "loss": 0.0008, + "step": 21793 + }, + { + "epoch": 8.862952419682799, + "grad_norm": 1.202371660428191, + "learning_rate": 4.857954187463951e-06, + "loss": 0.0141, + "step": 21794 + }, + { + "epoch": 8.863359089060594, + "grad_norm": 0.004366203315637873, + "learning_rate": 4.85708523520342e-06, + "loss": 0.0001, + "step": 21795 + }, + { + "epoch": 8.86376575843839, + "grad_norm": 0.6204169652054778, + "learning_rate": 4.856216335737496e-06, + "loss": 0.0083, + "step": 21796 + }, + { + "epoch": 8.864172427816186, + "grad_norm": 0.0073419006637965205, + "learning_rate": 4.855347489075101e-06, + "loss": 0.0001, + "step": 21797 + }, + { + "epoch": 8.864579097193982, + "grad_norm": 1.1288234367239587, + "learning_rate": 4.854478695225153e-06, + "loss": 0.0076, + "step": 21798 + }, + { + "epoch": 8.864985766571778, + "grad_norm": 5.798651707343577, + "learning_rate": 4.853609954196571e-06, + "loss": 
0.1104, + "step": 21799 + }, + { + "epoch": 8.865392435949573, + "grad_norm": 11.498247717996495, + "learning_rate": 4.852741265998268e-06, + "loss": 0.5262, + "step": 21800 + }, + { + "epoch": 8.865799105327369, + "grad_norm": 0.005023681394159076, + "learning_rate": 4.851872630639172e-06, + "loss": 0.0, + "step": 21801 + }, + { + "epoch": 8.866205774705165, + "grad_norm": 0.008560649726517851, + "learning_rate": 4.851004048128193e-06, + "loss": 0.0001, + "step": 21802 + }, + { + "epoch": 8.86661244408296, + "grad_norm": 4.371897429251244, + "learning_rate": 4.850135518474249e-06, + "loss": 0.0629, + "step": 21803 + }, + { + "epoch": 8.867019113460756, + "grad_norm": 6.0233993405371535, + "learning_rate": 4.849267041686257e-06, + "loss": 0.1096, + "step": 21804 + }, + { + "epoch": 8.867425782838552, + "grad_norm": 0.002673203857583893, + "learning_rate": 4.84839861777313e-06, + "loss": 0.0, + "step": 21805 + }, + { + "epoch": 8.867832452216348, + "grad_norm": 2.0454690204891475, + "learning_rate": 4.847530246743784e-06, + "loss": 0.0224, + "step": 21806 + }, + { + "epoch": 8.868239121594144, + "grad_norm": 4.292006976695614, + "learning_rate": 4.84666192860713e-06, + "loss": 0.0574, + "step": 21807 + }, + { + "epoch": 8.86864579097194, + "grad_norm": 13.891961426054397, + "learning_rate": 4.8457936633720895e-06, + "loss": 0.3718, + "step": 21808 + }, + { + "epoch": 8.869052460349735, + "grad_norm": 0.035502019978720226, + "learning_rate": 4.8449254510475695e-06, + "loss": 0.0003, + "step": 21809 + }, + { + "epoch": 8.869459129727531, + "grad_norm": 0.08617998961170835, + "learning_rate": 4.844057291642486e-06, + "loss": 0.0004, + "step": 21810 + }, + { + "epoch": 8.869865799105327, + "grad_norm": 1.6404530134199284, + "learning_rate": 4.843189185165748e-06, + "loss": 0.0168, + "step": 21811 + }, + { + "epoch": 8.870272468483122, + "grad_norm": 4.552623860970633, + "learning_rate": 4.842321131626267e-06, + "loss": 0.0507, + "step": 21812 + }, + { + "epoch": 8.870679137860918, + "grad_norm": 0.16234817020478298, + "learning_rate": 4.8414531310329535e-06, + "loss": 0.0019, + "step": 21813 + }, + { + "epoch": 8.871085807238714, + "grad_norm": 0.18023549484804854, + "learning_rate": 4.840585183394724e-06, + "loss": 0.0023, + "step": 21814 + }, + { + "epoch": 8.871492476616512, + "grad_norm": 0.01882522409359901, + "learning_rate": 4.839717288720483e-06, + "loss": 0.0002, + "step": 21815 + }, + { + "epoch": 8.871899145994307, + "grad_norm": 0.005953537984899281, + "learning_rate": 4.838849447019142e-06, + "loss": 0.0001, + "step": 21816 + }, + { + "epoch": 8.872305815372103, + "grad_norm": 0.09986998405773076, + "learning_rate": 4.8379816582996085e-06, + "loss": 0.0008, + "step": 21817 + }, + { + "epoch": 8.872712484749899, + "grad_norm": 0.03793830510860431, + "learning_rate": 4.837113922570792e-06, + "loss": 0.0002, + "step": 21818 + }, + { + "epoch": 8.873119154127695, + "grad_norm": 0.45736098831269645, + "learning_rate": 4.8362462398415985e-06, + "loss": 0.0016, + "step": 21819 + }, + { + "epoch": 8.87352582350549, + "grad_norm": 0.03286380399002052, + "learning_rate": 4.835378610120933e-06, + "loss": 0.0004, + "step": 21820 + }, + { + "epoch": 8.873932492883286, + "grad_norm": 0.010059141804869746, + "learning_rate": 4.834511033417709e-06, + "loss": 0.0001, + "step": 21821 + }, + { + "epoch": 8.874339162261082, + "grad_norm": 0.05123672391840158, + "learning_rate": 4.833643509740829e-06, + "loss": 0.0004, + "step": 21822 + }, + { + "epoch": 8.874745831638878, + "grad_norm": 
0.24602360739117418, + "learning_rate": 4.832776039099198e-06, + "loss": 0.0031, + "step": 21823 + }, + { + "epoch": 8.875152501016673, + "grad_norm": 13.490555315520343, + "learning_rate": 4.831908621501723e-06, + "loss": 0.4383, + "step": 21824 + }, + { + "epoch": 8.87555917039447, + "grad_norm": 7.690710552536356, + "learning_rate": 4.831041256957306e-06, + "loss": 0.2155, + "step": 21825 + }, + { + "epoch": 8.875965839772265, + "grad_norm": 0.19830602214685825, + "learning_rate": 4.830173945474849e-06, + "loss": 0.0022, + "step": 21826 + }, + { + "epoch": 8.87637250915006, + "grad_norm": 0.028118278201650627, + "learning_rate": 4.829306687063261e-06, + "loss": 0.0003, + "step": 21827 + }, + { + "epoch": 8.876779178527856, + "grad_norm": 0.0941966341495649, + "learning_rate": 4.828439481731443e-06, + "loss": 0.0009, + "step": 21828 + }, + { + "epoch": 8.877185847905652, + "grad_norm": 0.033116624488843074, + "learning_rate": 4.827572329488296e-06, + "loss": 0.0005, + "step": 21829 + }, + { + "epoch": 8.877592517283448, + "grad_norm": 2.230555369002989, + "learning_rate": 4.826705230342722e-06, + "loss": 0.0265, + "step": 21830 + }, + { + "epoch": 8.877999186661244, + "grad_norm": 0.0005367234898525499, + "learning_rate": 4.825838184303624e-06, + "loss": 0.0, + "step": 21831 + }, + { + "epoch": 8.87840585603904, + "grad_norm": 5.704626940523597, + "learning_rate": 4.824971191379899e-06, + "loss": 0.0503, + "step": 21832 + }, + { + "epoch": 8.878812525416835, + "grad_norm": 4.903186695608476, + "learning_rate": 4.824104251580447e-06, + "loss": 0.0387, + "step": 21833 + }, + { + "epoch": 8.879219194794633, + "grad_norm": 0.5669929380375767, + "learning_rate": 4.823237364914174e-06, + "loss": 0.0035, + "step": 21834 + }, + { + "epoch": 8.879625864172429, + "grad_norm": 0.00882046721856213, + "learning_rate": 4.822370531389974e-06, + "loss": 0.0001, + "step": 21835 + }, + { + "epoch": 8.880032533550224, + "grad_norm": 5.194984174850599, + "learning_rate": 4.821503751016746e-06, + "loss": 0.0668, + "step": 21836 + }, + { + "epoch": 8.88043920292802, + "grad_norm": 0.3619882030154554, + "learning_rate": 4.820637023803388e-06, + "loss": 0.0026, + "step": 21837 + }, + { + "epoch": 8.880845872305816, + "grad_norm": 12.770128479460997, + "learning_rate": 4.819770349758799e-06, + "loss": 0.2507, + "step": 21838 + }, + { + "epoch": 8.881252541683612, + "grad_norm": 1.170143465775529, + "learning_rate": 4.81890372889187e-06, + "loss": 0.0132, + "step": 21839 + }, + { + "epoch": 8.881659211061407, + "grad_norm": 0.2544088504267471, + "learning_rate": 4.8180371612115065e-06, + "loss": 0.0022, + "step": 21840 + }, + { + "epoch": 8.882065880439203, + "grad_norm": 0.9878176642690663, + "learning_rate": 4.817170646726597e-06, + "loss": 0.0112, + "step": 21841 + }, + { + "epoch": 8.882472549816999, + "grad_norm": 0.027659629963264258, + "learning_rate": 4.816304185446043e-06, + "loss": 0.0002, + "step": 21842 + }, + { + "epoch": 8.882879219194795, + "grad_norm": 1.3019718918297605, + "learning_rate": 4.815437777378733e-06, + "loss": 0.0178, + "step": 21843 + }, + { + "epoch": 8.88328588857259, + "grad_norm": 0.020403405029165777, + "learning_rate": 4.814571422533564e-06, + "loss": 0.0002, + "step": 21844 + }, + { + "epoch": 8.883692557950386, + "grad_norm": 0.02274840500006708, + "learning_rate": 4.813705120919428e-06, + "loss": 0.0004, + "step": 21845 + }, + { + "epoch": 8.884099227328182, + "grad_norm": 1.9587598884901156, + "learning_rate": 4.812838872545217e-06, + "loss": 0.018, + "step": 21846 + }, + { 
+ "epoch": 8.884505896705978, + "grad_norm": 0.006126778175202825, + "learning_rate": 4.811972677419829e-06, + "loss": 0.0001, + "step": 21847 + }, + { + "epoch": 8.884912566083774, + "grad_norm": 10.140231990398792, + "learning_rate": 4.811106535552157e-06, + "loss": 0.7578, + "step": 21848 + }, + { + "epoch": 8.88531923546157, + "grad_norm": 0.3962106662096279, + "learning_rate": 4.810240446951083e-06, + "loss": 0.0035, + "step": 21849 + }, + { + "epoch": 8.885725904839365, + "grad_norm": 0.01611132967147766, + "learning_rate": 4.809374411625502e-06, + "loss": 0.0001, + "step": 21850 + }, + { + "epoch": 8.88613257421716, + "grad_norm": 0.13344009707647486, + "learning_rate": 4.808508429584303e-06, + "loss": 0.0008, + "step": 21851 + }, + { + "epoch": 8.886539243594957, + "grad_norm": 0.1162199943491191, + "learning_rate": 4.8076425008363814e-06, + "loss": 0.0011, + "step": 21852 + }, + { + "epoch": 8.886945912972752, + "grad_norm": 0.17582522105954576, + "learning_rate": 4.8067766253906225e-06, + "loss": 0.0018, + "step": 21853 + }, + { + "epoch": 8.887352582350548, + "grad_norm": 0.06957881299674365, + "learning_rate": 4.805910803255915e-06, + "loss": 0.0007, + "step": 21854 + }, + { + "epoch": 8.887759251728344, + "grad_norm": 1.7960040192115212, + "learning_rate": 4.805045034441147e-06, + "loss": 0.0147, + "step": 21855 + }, + { + "epoch": 8.888165921106141, + "grad_norm": 0.20101931606562967, + "learning_rate": 4.804179318955207e-06, + "loss": 0.0019, + "step": 21856 + }, + { + "epoch": 8.888572590483937, + "grad_norm": 0.04255148713285751, + "learning_rate": 4.803313656806982e-06, + "loss": 0.0004, + "step": 21857 + }, + { + "epoch": 8.888979259861733, + "grad_norm": 0.6613789045636694, + "learning_rate": 4.802448048005351e-06, + "loss": 0.0094, + "step": 21858 + }, + { + "epoch": 8.889385929239529, + "grad_norm": 0.06953191337816696, + "learning_rate": 4.801582492559213e-06, + "loss": 0.0004, + "step": 21859 + }, + { + "epoch": 8.889792598617325, + "grad_norm": 4.0135662155734755, + "learning_rate": 4.800716990477447e-06, + "loss": 0.0753, + "step": 21860 + }, + { + "epoch": 8.89019926799512, + "grad_norm": 0.4373068024042488, + "learning_rate": 4.799851541768937e-06, + "loss": 0.0057, + "step": 21861 + }, + { + "epoch": 8.890605937372916, + "grad_norm": 0.9541631072828803, + "learning_rate": 4.798986146442568e-06, + "loss": 0.0043, + "step": 21862 + }, + { + "epoch": 8.891012606750712, + "grad_norm": 0.47874398578752225, + "learning_rate": 4.798120804507225e-06, + "loss": 0.0051, + "step": 21863 + }, + { + "epoch": 8.891419276128508, + "grad_norm": 0.07419708859856296, + "learning_rate": 4.797255515971785e-06, + "loss": 0.0007, + "step": 21864 + }, + { + "epoch": 8.891825945506303, + "grad_norm": 0.0031543409257572025, + "learning_rate": 4.796390280845139e-06, + "loss": 0.0, + "step": 21865 + }, + { + "epoch": 8.892232614884099, + "grad_norm": 0.008537340633777831, + "learning_rate": 4.795525099136167e-06, + "loss": 0.0001, + "step": 21866 + }, + { + "epoch": 8.892639284261895, + "grad_norm": 0.06002007578331093, + "learning_rate": 4.7946599708537485e-06, + "loss": 0.0007, + "step": 21867 + }, + { + "epoch": 8.89304595363969, + "grad_norm": 1.2803525885894116, + "learning_rate": 4.793794896006765e-06, + "loss": 0.0146, + "step": 21868 + }, + { + "epoch": 8.893452623017486, + "grad_norm": 0.1052450606056048, + "learning_rate": 4.792929874604097e-06, + "loss": 0.0006, + "step": 21869 + }, + { + "epoch": 8.893859292395282, + "grad_norm": 4.0293206987587356, + "learning_rate": 
4.792064906654626e-06, + "loss": 0.0529, + "step": 21870 + }, + { + "epoch": 8.894265961773078, + "grad_norm": 1.4942101958695504, + "learning_rate": 4.7911999921672246e-06, + "loss": 0.0173, + "step": 21871 + }, + { + "epoch": 8.894672631150874, + "grad_norm": 0.7036831445179073, + "learning_rate": 4.7903351311507805e-06, + "loss": 0.0044, + "step": 21872 + }, + { + "epoch": 8.89507930052867, + "grad_norm": 3.38240252211841, + "learning_rate": 4.78947032361417e-06, + "loss": 0.0287, + "step": 21873 + }, + { + "epoch": 8.895485969906465, + "grad_norm": 0.35377113497687446, + "learning_rate": 4.788605569566267e-06, + "loss": 0.0045, + "step": 21874 + }, + { + "epoch": 8.895892639284263, + "grad_norm": 0.014450792679326082, + "learning_rate": 4.787740869015952e-06, + "loss": 0.0001, + "step": 21875 + }, + { + "epoch": 8.896299308662059, + "grad_norm": 1.8321684075445415, + "learning_rate": 4.7868762219721e-06, + "loss": 0.024, + "step": 21876 + }, + { + "epoch": 8.896705978039854, + "grad_norm": 1.7714200572005883, + "learning_rate": 4.786011628443583e-06, + "loss": 0.02, + "step": 21877 + }, + { + "epoch": 8.89711264741765, + "grad_norm": 1.1432899625571866, + "learning_rate": 4.785147088439285e-06, + "loss": 0.0124, + "step": 21878 + }, + { + "epoch": 8.897519316795446, + "grad_norm": 0.011411398213937074, + "learning_rate": 4.784282601968076e-06, + "loss": 0.0001, + "step": 21879 + }, + { + "epoch": 8.897925986173242, + "grad_norm": 2.059362973199242, + "learning_rate": 4.783418169038831e-06, + "loss": 0.019, + "step": 21880 + }, + { + "epoch": 8.898332655551037, + "grad_norm": 0.0028735487684054145, + "learning_rate": 4.7825537896604245e-06, + "loss": 0.0, + "step": 21881 + }, + { + "epoch": 8.898739324928833, + "grad_norm": 0.0401465468638462, + "learning_rate": 4.7816894638417296e-06, + "loss": 0.0005, + "step": 21882 + }, + { + "epoch": 8.899145994306629, + "grad_norm": 0.01977916702610059, + "learning_rate": 4.780825191591618e-06, + "loss": 0.0001, + "step": 21883 + }, + { + "epoch": 8.899552663684425, + "grad_norm": 0.3874677466445019, + "learning_rate": 4.77996097291896e-06, + "loss": 0.0024, + "step": 21884 + }, + { + "epoch": 8.89995933306222, + "grad_norm": 0.0708575840195812, + "learning_rate": 4.779096807832633e-06, + "loss": 0.0007, + "step": 21885 + }, + { + "epoch": 8.900366002440016, + "grad_norm": 0.27272826094516356, + "learning_rate": 4.778232696341504e-06, + "loss": 0.0028, + "step": 21886 + }, + { + "epoch": 8.900772671817812, + "grad_norm": 0.008197683293880586, + "learning_rate": 4.777368638454445e-06, + "loss": 0.0, + "step": 21887 + }, + { + "epoch": 8.901179341195608, + "grad_norm": 0.06821138428603565, + "learning_rate": 4.776504634180325e-06, + "loss": 0.0008, + "step": 21888 + }, + { + "epoch": 8.901586010573403, + "grad_norm": 0.5758052064826285, + "learning_rate": 4.775640683528015e-06, + "loss": 0.0039, + "step": 21889 + }, + { + "epoch": 8.9019926799512, + "grad_norm": 0.0005463465854628629, + "learning_rate": 4.774776786506379e-06, + "loss": 0.0, + "step": 21890 + }, + { + "epoch": 8.902399349328995, + "grad_norm": 2.312374232642185, + "learning_rate": 4.773912943124291e-06, + "loss": 0.0185, + "step": 21891 + }, + { + "epoch": 8.90280601870679, + "grad_norm": 14.134460751654814, + "learning_rate": 4.773049153390618e-06, + "loss": 0.3977, + "step": 21892 + }, + { + "epoch": 8.903212688084587, + "grad_norm": 0.2967813030474364, + "learning_rate": 4.772185417314227e-06, + "loss": 0.0021, + "step": 21893 + }, + { + "epoch": 8.903619357462382, + "grad_norm": 
0.0005331328754000126, + "learning_rate": 4.771321734903983e-06, + "loss": 0.0, + "step": 21894 + }, + { + "epoch": 8.904026026840178, + "grad_norm": 4.8877525461525355, + "learning_rate": 4.7704581061687524e-06, + "loss": 0.0628, + "step": 21895 + }, + { + "epoch": 8.904432696217976, + "grad_norm": 0.4907207436192186, + "learning_rate": 4.769594531117402e-06, + "loss": 0.0056, + "step": 21896 + }, + { + "epoch": 8.904839365595771, + "grad_norm": 0.0032069560847112007, + "learning_rate": 4.768731009758792e-06, + "loss": 0.0, + "step": 21897 + }, + { + "epoch": 8.905246034973567, + "grad_norm": 0.5965600890859117, + "learning_rate": 4.767867542101795e-06, + "loss": 0.0036, + "step": 21898 + }, + { + "epoch": 8.905652704351363, + "grad_norm": 0.5331101665235902, + "learning_rate": 4.767004128155271e-06, + "loss": 0.005, + "step": 21899 + }, + { + "epoch": 8.906059373729159, + "grad_norm": 1.8973700471053832, + "learning_rate": 4.766140767928083e-06, + "loss": 0.0242, + "step": 21900 + }, + { + "epoch": 8.906466043106954, + "grad_norm": 1.307556862201684, + "learning_rate": 4.765277461429094e-06, + "loss": 0.0119, + "step": 21901 + }, + { + "epoch": 8.90687271248475, + "grad_norm": 0.7272905647804231, + "learning_rate": 4.7644142086671676e-06, + "loss": 0.0054, + "step": 21902 + }, + { + "epoch": 8.907279381862546, + "grad_norm": 0.16949063061514938, + "learning_rate": 4.763551009651159e-06, + "loss": 0.0021, + "step": 21903 + }, + { + "epoch": 8.907686051240342, + "grad_norm": 5.848924778599733, + "learning_rate": 4.76268786438994e-06, + "loss": 0.0379, + "step": 21904 + }, + { + "epoch": 8.908092720618138, + "grad_norm": 9.788863500093083, + "learning_rate": 4.761824772892365e-06, + "loss": 0.1031, + "step": 21905 + }, + { + "epoch": 8.908499389995933, + "grad_norm": 4.89427464161471, + "learning_rate": 4.760961735167295e-06, + "loss": 0.1303, + "step": 21906 + }, + { + "epoch": 8.908906059373729, + "grad_norm": 0.07252794864137299, + "learning_rate": 4.760098751223591e-06, + "loss": 0.0006, + "step": 21907 + }, + { + "epoch": 8.909312728751525, + "grad_norm": 6.727279692971608, + "learning_rate": 4.759235821070109e-06, + "loss": 0.3054, + "step": 21908 + }, + { + "epoch": 8.90971939812932, + "grad_norm": 0.014749032064443716, + "learning_rate": 4.758372944715709e-06, + "loss": 0.0002, + "step": 21909 + }, + { + "epoch": 8.910126067507116, + "grad_norm": 0.052980406995800366, + "learning_rate": 4.7575101221692455e-06, + "loss": 0.0007, + "step": 21910 + }, + { + "epoch": 8.910532736884912, + "grad_norm": 1.0276749066632902, + "learning_rate": 4.756647353439583e-06, + "loss": 0.0094, + "step": 21911 + }, + { + "epoch": 8.910939406262708, + "grad_norm": 0.021624771886910102, + "learning_rate": 4.7557846385355745e-06, + "loss": 0.0001, + "step": 21912 + }, + { + "epoch": 8.911346075640504, + "grad_norm": 0.19767228827066555, + "learning_rate": 4.754921977466075e-06, + "loss": 0.0021, + "step": 21913 + }, + { + "epoch": 8.9117527450183, + "grad_norm": 0.2492694995063032, + "learning_rate": 4.754059370239942e-06, + "loss": 0.0024, + "step": 21914 + }, + { + "epoch": 8.912159414396095, + "grad_norm": 0.012117930093610866, + "learning_rate": 4.753196816866029e-06, + "loss": 0.0001, + "step": 21915 + }, + { + "epoch": 8.912566083773893, + "grad_norm": 0.0022444120531348827, + "learning_rate": 4.752334317353189e-06, + "loss": 0.0, + "step": 21916 + }, + { + "epoch": 8.912972753151688, + "grad_norm": 8.41533881082218, + "learning_rate": 4.751471871710281e-06, + "loss": 0.0805, + "step": 21917 + }, + 
{ + "epoch": 8.913379422529484, + "grad_norm": 0.13479282157206213, + "learning_rate": 4.750609479946155e-06, + "loss": 0.0013, + "step": 21918 + }, + { + "epoch": 8.91378609190728, + "grad_norm": 2.8550464644681437, + "learning_rate": 4.7497471420696665e-06, + "loss": 0.0316, + "step": 21919 + }, + { + "epoch": 8.914192761285076, + "grad_norm": 0.04622643126529464, + "learning_rate": 4.7488848580896655e-06, + "loss": 0.0003, + "step": 21920 + }, + { + "epoch": 8.914599430662872, + "grad_norm": 0.10158118953042783, + "learning_rate": 4.748022628015003e-06, + "loss": 0.0009, + "step": 21921 + }, + { + "epoch": 8.915006100040667, + "grad_norm": 5.304626408310859, + "learning_rate": 4.747160451854532e-06, + "loss": 0.0723, + "step": 21922 + }, + { + "epoch": 8.915412769418463, + "grad_norm": 0.022733145200115545, + "learning_rate": 4.7462983296171e-06, + "loss": 0.0001, + "step": 21923 + }, + { + "epoch": 8.915819438796259, + "grad_norm": 0.8561980952704427, + "learning_rate": 4.745436261311562e-06, + "loss": 0.0071, + "step": 21924 + }, + { + "epoch": 8.916226108174055, + "grad_norm": 0.010746338578970303, + "learning_rate": 4.744574246946767e-06, + "loss": 0.0001, + "step": 21925 + }, + { + "epoch": 8.91663277755185, + "grad_norm": 0.014126152735700618, + "learning_rate": 4.743712286531561e-06, + "loss": 0.0002, + "step": 21926 + }, + { + "epoch": 8.917039446929646, + "grad_norm": 0.0615971136345623, + "learning_rate": 4.742850380074795e-06, + "loss": 0.0008, + "step": 21927 + }, + { + "epoch": 8.917446116307442, + "grad_norm": 0.7725876313165514, + "learning_rate": 4.741988527585315e-06, + "loss": 0.0056, + "step": 21928 + }, + { + "epoch": 8.917852785685238, + "grad_norm": 22.110589869520602, + "learning_rate": 4.741126729071966e-06, + "loss": 0.9554, + "step": 21929 + }, + { + "epoch": 8.918259455063033, + "grad_norm": 0.008277743912679224, + "learning_rate": 4.7402649845436e-06, + "loss": 0.0001, + "step": 21930 + }, + { + "epoch": 8.91866612444083, + "grad_norm": 0.30456520574595874, + "learning_rate": 4.739403294009062e-06, + "loss": 0.0027, + "step": 21931 + }, + { + "epoch": 8.919072793818625, + "grad_norm": 0.1863770064029875, + "learning_rate": 4.738541657477197e-06, + "loss": 0.0013, + "step": 21932 + }, + { + "epoch": 8.91947946319642, + "grad_norm": 0.4652002934438206, + "learning_rate": 4.73768007495685e-06, + "loss": 0.0051, + "step": 21933 + }, + { + "epoch": 8.919886132574216, + "grad_norm": 6.649572220305339, + "learning_rate": 4.736818546456865e-06, + "loss": 0.398, + "step": 21934 + }, + { + "epoch": 8.920292801952012, + "grad_norm": 0.13019533014897933, + "learning_rate": 4.735957071986087e-06, + "loss": 0.0016, + "step": 21935 + }, + { + "epoch": 8.920699471329808, + "grad_norm": 0.060061515927800374, + "learning_rate": 4.735095651553355e-06, + "loss": 0.0007, + "step": 21936 + }, + { + "epoch": 8.921106140707606, + "grad_norm": 1.380713132318367, + "learning_rate": 4.7342342851675195e-06, + "loss": 0.0228, + "step": 21937 + }, + { + "epoch": 8.921512810085401, + "grad_norm": 0.7079752311115743, + "learning_rate": 4.733372972837419e-06, + "loss": 0.0084, + "step": 21938 + }, + { + "epoch": 8.921919479463197, + "grad_norm": 5.177723440470278, + "learning_rate": 4.732511714571897e-06, + "loss": 0.1531, + "step": 21939 + }, + { + "epoch": 8.922326148840993, + "grad_norm": 0.13992727700398083, + "learning_rate": 4.731650510379792e-06, + "loss": 0.0016, + "step": 21940 + }, + { + "epoch": 8.922732818218789, + "grad_norm": 5.677655788079682, + "learning_rate": 
4.730789360269946e-06, + "loss": 0.1589, + "step": 21941 + }, + { + "epoch": 8.923139487596584, + "grad_norm": 0.04036529567091559, + "learning_rate": 4.7299282642511955e-06, + "loss": 0.0005, + "step": 21942 + }, + { + "epoch": 8.92354615697438, + "grad_norm": 0.020807460955558986, + "learning_rate": 4.7290672223323885e-06, + "loss": 0.0003, + "step": 21943 + }, + { + "epoch": 8.923952826352176, + "grad_norm": 0.013672884525234524, + "learning_rate": 4.728206234522358e-06, + "loss": 0.0002, + "step": 21944 + }, + { + "epoch": 8.924359495729972, + "grad_norm": 0.03646542176999404, + "learning_rate": 4.727345300829944e-06, + "loss": 0.0005, + "step": 21945 + }, + { + "epoch": 8.924766165107767, + "grad_norm": 0.32547297725035496, + "learning_rate": 4.726484421263985e-06, + "loss": 0.0036, + "step": 21946 + }, + { + "epoch": 8.925172834485563, + "grad_norm": 0.007507296580589625, + "learning_rate": 4.725623595833317e-06, + "loss": 0.0001, + "step": 21947 + }, + { + "epoch": 8.925579503863359, + "grad_norm": 6.028314615787971, + "learning_rate": 4.724762824546777e-06, + "loss": 0.0652, + "step": 21948 + }, + { + "epoch": 8.925986173241155, + "grad_norm": 1.8418998320353435, + "learning_rate": 4.723902107413202e-06, + "loss": 0.0228, + "step": 21949 + }, + { + "epoch": 8.92639284261895, + "grad_norm": 0.008919750730448258, + "learning_rate": 4.723041444441427e-06, + "loss": 0.0001, + "step": 21950 + }, + { + "epoch": 8.926799511996746, + "grad_norm": 2.74297961724806, + "learning_rate": 4.722180835640288e-06, + "loss": 0.03, + "step": 21951 + }, + { + "epoch": 8.927206181374542, + "grad_norm": 0.4201582720409807, + "learning_rate": 4.721320281018619e-06, + "loss": 0.0047, + "step": 21952 + }, + { + "epoch": 8.927612850752338, + "grad_norm": 0.9647840970581447, + "learning_rate": 4.7204597805852525e-06, + "loss": 0.0114, + "step": 21953 + }, + { + "epoch": 8.928019520130134, + "grad_norm": 1.574046122266736, + "learning_rate": 4.719599334349021e-06, + "loss": 0.0197, + "step": 21954 + }, + { + "epoch": 8.92842618950793, + "grad_norm": 1.1989799307920836, + "learning_rate": 4.718738942318763e-06, + "loss": 0.0097, + "step": 21955 + }, + { + "epoch": 8.928832858885725, + "grad_norm": 7.9832427315882795, + "learning_rate": 4.717878604503309e-06, + "loss": 0.2953, + "step": 21956 + }, + { + "epoch": 8.929239528263523, + "grad_norm": 0.6706785882896116, + "learning_rate": 4.717018320911488e-06, + "loss": 0.0091, + "step": 21957 + }, + { + "epoch": 8.929646197641318, + "grad_norm": 0.12405923991805644, + "learning_rate": 4.716158091552135e-06, + "loss": 0.0017, + "step": 21958 + }, + { + "epoch": 8.930052867019114, + "grad_norm": 5.26618681048549, + "learning_rate": 4.7152979164340765e-06, + "loss": 0.0798, + "step": 21959 + }, + { + "epoch": 8.93045953639691, + "grad_norm": 0.05178816347374275, + "learning_rate": 4.714437795566142e-06, + "loss": 0.0005, + "step": 21960 + }, + { + "epoch": 8.930866205774706, + "grad_norm": 0.0007036505217464952, + "learning_rate": 4.713577728957166e-06, + "loss": 0.0, + "step": 21961 + }, + { + "epoch": 8.931272875152501, + "grad_norm": 9.426674311536882, + "learning_rate": 4.712717716615977e-06, + "loss": 0.2172, + "step": 21962 + }, + { + "epoch": 8.931679544530297, + "grad_norm": 0.07884272122718504, + "learning_rate": 4.711857758551401e-06, + "loss": 0.001, + "step": 21963 + }, + { + "epoch": 8.932086213908093, + "grad_norm": 0.7286142722178877, + "learning_rate": 4.710997854772266e-06, + "loss": 0.01, + "step": 21964 + }, + { + "epoch": 8.932492883285889, + 
"grad_norm": 4.20606521377504, + "learning_rate": 4.7101380052874005e-06, + "loss": 0.0869, + "step": 21965 + }, + { + "epoch": 8.932899552663685, + "grad_norm": 0.2419857760712477, + "learning_rate": 4.70927821010563e-06, + "loss": 0.0024, + "step": 21966 + }, + { + "epoch": 8.93330622204148, + "grad_norm": 0.9779852288734866, + "learning_rate": 4.708418469235778e-06, + "loss": 0.0073, + "step": 21967 + }, + { + "epoch": 8.933712891419276, + "grad_norm": 0.03004818449397469, + "learning_rate": 4.707558782686677e-06, + "loss": 0.0003, + "step": 21968 + }, + { + "epoch": 8.934119560797072, + "grad_norm": 0.2104209333820767, + "learning_rate": 4.70669915046715e-06, + "loss": 0.0021, + "step": 21969 + }, + { + "epoch": 8.934526230174868, + "grad_norm": 0.009272579747194335, + "learning_rate": 4.70583957258602e-06, + "loss": 0.0001, + "step": 21970 + }, + { + "epoch": 8.934932899552663, + "grad_norm": 0.7854842179628894, + "learning_rate": 4.70498004905211e-06, + "loss": 0.0099, + "step": 21971 + }, + { + "epoch": 8.935339568930459, + "grad_norm": 0.15273660644223483, + "learning_rate": 4.704120579874245e-06, + "loss": 0.0023, + "step": 21972 + }, + { + "epoch": 8.935746238308255, + "grad_norm": 2.11406353301256, + "learning_rate": 4.703261165061249e-06, + "loss": 0.0377, + "step": 21973 + }, + { + "epoch": 8.93615290768605, + "grad_norm": 0.28988481602805205, + "learning_rate": 4.702401804621938e-06, + "loss": 0.0025, + "step": 21974 + }, + { + "epoch": 8.936559577063846, + "grad_norm": 2.102857437382456, + "learning_rate": 4.7015424985651425e-06, + "loss": 0.0183, + "step": 21975 + }, + { + "epoch": 8.936966246441642, + "grad_norm": 0.21492121653495933, + "learning_rate": 4.700683246899681e-06, + "loss": 0.002, + "step": 21976 + }, + { + "epoch": 8.937372915819438, + "grad_norm": 1.2359107084616707, + "learning_rate": 4.6998240496343715e-06, + "loss": 0.0177, + "step": 21977 + }, + { + "epoch": 8.937779585197235, + "grad_norm": 1.823794403257702, + "learning_rate": 4.698964906778036e-06, + "loss": 0.0232, + "step": 21978 + }, + { + "epoch": 8.938186254575031, + "grad_norm": 0.19252669615376541, + "learning_rate": 4.698105818339495e-06, + "loss": 0.0028, + "step": 21979 + }, + { + "epoch": 8.938592923952827, + "grad_norm": 5.249106034449613, + "learning_rate": 4.697246784327561e-06, + "loss": 0.0655, + "step": 21980 + }, + { + "epoch": 8.938999593330623, + "grad_norm": 0.4011957857716339, + "learning_rate": 4.696387804751062e-06, + "loss": 0.004, + "step": 21981 + }, + { + "epoch": 8.939406262708419, + "grad_norm": 0.006297675597343795, + "learning_rate": 4.695528879618811e-06, + "loss": 0.0001, + "step": 21982 + }, + { + "epoch": 8.939812932086214, + "grad_norm": 0.20322953302813324, + "learning_rate": 4.694670008939627e-06, + "loss": 0.0025, + "step": 21983 + }, + { + "epoch": 8.94021960146401, + "grad_norm": 0.23991758785139164, + "learning_rate": 4.693811192722324e-06, + "loss": 0.0028, + "step": 21984 + }, + { + "epoch": 8.940626270841806, + "grad_norm": 1.4196559100716302, + "learning_rate": 4.69295243097572e-06, + "loss": 0.0237, + "step": 21985 + }, + { + "epoch": 8.941032940219602, + "grad_norm": 8.762455630343146, + "learning_rate": 4.692093723708631e-06, + "loss": 0.077, + "step": 21986 + }, + { + "epoch": 8.941439609597397, + "grad_norm": 0.5075858604190117, + "learning_rate": 4.691235070929867e-06, + "loss": 0.0048, + "step": 21987 + }, + { + "epoch": 8.941846278975193, + "grad_norm": 0.009747385462232527, + "learning_rate": 4.690376472648251e-06, + "loss": 0.0001, + "step": 
21988 + }, + { + "epoch": 8.942252948352989, + "grad_norm": 0.32436482959702967, + "learning_rate": 4.689517928872591e-06, + "loss": 0.0035, + "step": 21989 + }, + { + "epoch": 8.942659617730785, + "grad_norm": 0.28336350898745993, + "learning_rate": 4.688659439611704e-06, + "loss": 0.0025, + "step": 21990 + }, + { + "epoch": 8.94306628710858, + "grad_norm": 0.00017881581653950103, + "learning_rate": 4.687801004874401e-06, + "loss": 0.0, + "step": 21991 + }, + { + "epoch": 8.943472956486376, + "grad_norm": 0.13939700816377326, + "learning_rate": 4.6869426246694936e-06, + "loss": 0.0018, + "step": 21992 + }, + { + "epoch": 8.943879625864172, + "grad_norm": 0.32025180872612813, + "learning_rate": 4.686084299005791e-06, + "loss": 0.0045, + "step": 21993 + }, + { + "epoch": 8.944286295241968, + "grad_norm": 0.0064289980625912456, + "learning_rate": 4.6852260278921105e-06, + "loss": 0.0, + "step": 21994 + }, + { + "epoch": 8.944692964619763, + "grad_norm": 0.15121433882830368, + "learning_rate": 4.68436781133726e-06, + "loss": 0.0007, + "step": 21995 + }, + { + "epoch": 8.94509963399756, + "grad_norm": 0.6320248063403519, + "learning_rate": 4.683509649350049e-06, + "loss": 0.0073, + "step": 21996 + }, + { + "epoch": 8.945506303375355, + "grad_norm": 1.4564121132728685, + "learning_rate": 4.682651541939288e-06, + "loss": 0.0138, + "step": 21997 + }, + { + "epoch": 8.945912972753153, + "grad_norm": 0.24736886587097698, + "learning_rate": 4.681793489113784e-06, + "loss": 0.0014, + "step": 21998 + }, + { + "epoch": 8.946319642130948, + "grad_norm": 0.0013061615702130642, + "learning_rate": 4.680935490882347e-06, + "loss": 0.0, + "step": 21999 + }, + { + "epoch": 8.946726311508744, + "grad_norm": 1.1752586965948422, + "learning_rate": 4.680077547253781e-06, + "loss": 0.0096, + "step": 22000 + }, + { + "epoch": 8.94713298088654, + "grad_norm": 15.577556328069395, + "learning_rate": 4.6792196582369e-06, + "loss": 0.577, + "step": 22001 + }, + { + "epoch": 8.947539650264336, + "grad_norm": 8.06260606651853, + "learning_rate": 4.678361823840506e-06, + "loss": 0.1552, + "step": 22002 + }, + { + "epoch": 8.947946319642131, + "grad_norm": 6.472170386211414, + "learning_rate": 4.677504044073408e-06, + "loss": 0.123, + "step": 22003 + }, + { + "epoch": 8.948352989019927, + "grad_norm": 1.1152160229337886, + "learning_rate": 4.676646318944408e-06, + "loss": 0.0172, + "step": 22004 + }, + { + "epoch": 8.948759658397723, + "grad_norm": 0.026284226658060694, + "learning_rate": 4.675788648462313e-06, + "loss": 0.0002, + "step": 22005 + }, + { + "epoch": 8.949166327775519, + "grad_norm": 0.008124615457214804, + "learning_rate": 4.674931032635924e-06, + "loss": 0.0001, + "step": 22006 + }, + { + "epoch": 8.949572997153314, + "grad_norm": 0.0620127237993282, + "learning_rate": 4.674073471474052e-06, + "loss": 0.0011, + "step": 22007 + }, + { + "epoch": 8.94997966653111, + "grad_norm": 0.34060658215833894, + "learning_rate": 4.673215964985496e-06, + "loss": 0.0039, + "step": 22008 + }, + { + "epoch": 8.950386335908906, + "grad_norm": 0.06460898729778583, + "learning_rate": 4.672358513179059e-06, + "loss": 0.001, + "step": 22009 + }, + { + "epoch": 8.950793005286702, + "grad_norm": 0.005463985262041016, + "learning_rate": 4.671501116063543e-06, + "loss": 0.0001, + "step": 22010 + }, + { + "epoch": 8.951199674664498, + "grad_norm": 0.16201114677838196, + "learning_rate": 4.670643773647749e-06, + "loss": 0.0014, + "step": 22011 + }, + { + "epoch": 8.951606344042293, + "grad_norm": 0.03273865692460557, + "learning_rate": 
4.669786485940481e-06, + "loss": 0.0003, + "step": 22012 + }, + { + "epoch": 8.952013013420089, + "grad_norm": 0.08166722856382114, + "learning_rate": 4.668929252950531e-06, + "loss": 0.0009, + "step": 22013 + }, + { + "epoch": 8.952419682797885, + "grad_norm": 0.04432237650880403, + "learning_rate": 4.668072074686711e-06, + "loss": 0.0003, + "step": 22014 + }, + { + "epoch": 8.95282635217568, + "grad_norm": 0.02236640862006047, + "learning_rate": 4.667214951157815e-06, + "loss": 0.0002, + "step": 22015 + }, + { + "epoch": 8.953233021553476, + "grad_norm": 19.683336102948843, + "learning_rate": 4.66635788237264e-06, + "loss": 0.4087, + "step": 22016 + }, + { + "epoch": 8.953639690931272, + "grad_norm": 0.00609683232920771, + "learning_rate": 4.665500868339986e-06, + "loss": 0.0, + "step": 22017 + }, + { + "epoch": 8.954046360309068, + "grad_norm": 0.41825736850001727, + "learning_rate": 4.664643909068651e-06, + "loss": 0.0059, + "step": 22018 + }, + { + "epoch": 8.954453029686865, + "grad_norm": 2.512348073914868, + "learning_rate": 4.663787004567427e-06, + "loss": 0.0608, + "step": 22019 + }, + { + "epoch": 8.954859699064661, + "grad_norm": 0.5692247176603606, + "learning_rate": 4.662930154845119e-06, + "loss": 0.0068, + "step": 22020 + }, + { + "epoch": 8.955266368442457, + "grad_norm": 0.49517000486857543, + "learning_rate": 4.662073359910519e-06, + "loss": 0.0063, + "step": 22021 + }, + { + "epoch": 8.955673037820253, + "grad_norm": 3.558406014625174, + "learning_rate": 4.661216619772423e-06, + "loss": 0.0375, + "step": 22022 + }, + { + "epoch": 8.956079707198048, + "grad_norm": 0.01338175760151674, + "learning_rate": 4.660359934439625e-06, + "loss": 0.0001, + "step": 22023 + }, + { + "epoch": 8.956486376575844, + "grad_norm": 0.00460369919422177, + "learning_rate": 4.65950330392092e-06, + "loss": 0.0001, + "step": 22024 + }, + { + "epoch": 8.95689304595364, + "grad_norm": 0.2679626269060616, + "learning_rate": 4.658646728225101e-06, + "loss": 0.0022, + "step": 22025 + }, + { + "epoch": 8.957299715331436, + "grad_norm": 1.2283912212490302, + "learning_rate": 4.657790207360958e-06, + "loss": 0.0087, + "step": 22026 + }, + { + "epoch": 8.957706384709232, + "grad_norm": 7.081201115436956, + "learning_rate": 4.656933741337292e-06, + "loss": 0.306, + "step": 22027 + }, + { + "epoch": 8.958113054087027, + "grad_norm": 0.6642064791769339, + "learning_rate": 4.656077330162888e-06, + "loss": 0.0072, + "step": 22028 + }, + { + "epoch": 8.958519723464823, + "grad_norm": 5.251580815473419, + "learning_rate": 4.655220973846541e-06, + "loss": 0.0669, + "step": 22029 + }, + { + "epoch": 8.958926392842619, + "grad_norm": 0.00572279268472899, + "learning_rate": 4.654364672397041e-06, + "loss": 0.0001, + "step": 22030 + }, + { + "epoch": 8.959333062220415, + "grad_norm": 0.022751891795736744, + "learning_rate": 4.653508425823178e-06, + "loss": 0.0002, + "step": 22031 + }, + { + "epoch": 8.95973973159821, + "grad_norm": 0.7764917497530434, + "learning_rate": 4.652652234133738e-06, + "loss": 0.0089, + "step": 22032 + }, + { + "epoch": 8.960146400976006, + "grad_norm": 0.0053326486205428954, + "learning_rate": 4.651796097337518e-06, + "loss": 0.0001, + "step": 22033 + }, + { + "epoch": 8.960553070353802, + "grad_norm": 0.1271456047491179, + "learning_rate": 4.650940015443301e-06, + "loss": 0.0016, + "step": 22034 + }, + { + "epoch": 8.960959739731598, + "grad_norm": 0.4948713558292337, + "learning_rate": 4.650083988459879e-06, + "loss": 0.0062, + "step": 22035 + }, + { + "epoch": 8.961366409109393, + 
"grad_norm": 0.006782746365060755, + "learning_rate": 4.6492280163960365e-06, + "loss": 0.0001, + "step": 22036 + }, + { + "epoch": 8.96177307848719, + "grad_norm": 0.05801541372368341, + "learning_rate": 4.648372099260562e-06, + "loss": 0.0009, + "step": 22037 + }, + { + "epoch": 8.962179747864985, + "grad_norm": 0.24712154300555902, + "learning_rate": 4.64751623706224e-06, + "loss": 0.004, + "step": 22038 + }, + { + "epoch": 8.962586417242782, + "grad_norm": 0.6352293097178612, + "learning_rate": 4.646660429809855e-06, + "loss": 0.0056, + "step": 22039 + }, + { + "epoch": 8.962993086620578, + "grad_norm": 7.203823223970777, + "learning_rate": 4.645804677512199e-06, + "loss": 0.1258, + "step": 22040 + }, + { + "epoch": 8.963399755998374, + "grad_norm": 0.5220571948848137, + "learning_rate": 4.644948980178051e-06, + "loss": 0.0041, + "step": 22041 + }, + { + "epoch": 8.96380642537617, + "grad_norm": 0.04397568675051408, + "learning_rate": 4.644093337816199e-06, + "loss": 0.0006, + "step": 22042 + }, + { + "epoch": 8.964213094753966, + "grad_norm": 14.419757801265899, + "learning_rate": 4.643237750435423e-06, + "loss": 0.4721, + "step": 22043 + }, + { + "epoch": 8.964619764131761, + "grad_norm": 0.037026889670281435, + "learning_rate": 4.642382218044508e-06, + "loss": 0.0006, + "step": 22044 + }, + { + "epoch": 8.965026433509557, + "grad_norm": 1.0023713197266366, + "learning_rate": 4.641526740652233e-06, + "loss": 0.0122, + "step": 22045 + }, + { + "epoch": 8.965433102887353, + "grad_norm": 0.017975075282875598, + "learning_rate": 4.640671318267386e-06, + "loss": 0.0002, + "step": 22046 + }, + { + "epoch": 8.965839772265149, + "grad_norm": 12.091252462127933, + "learning_rate": 4.639815950898748e-06, + "loss": 0.08, + "step": 22047 + }, + { + "epoch": 8.966246441642944, + "grad_norm": 0.24352019893496485, + "learning_rate": 4.638960638555094e-06, + "loss": 0.0028, + "step": 22048 + }, + { + "epoch": 8.96665311102074, + "grad_norm": 7.703726297204665, + "learning_rate": 4.638105381245207e-06, + "loss": 0.1048, + "step": 22049 + }, + { + "epoch": 8.967059780398536, + "grad_norm": 2.9575971185736, + "learning_rate": 4.637250178977863e-06, + "loss": 0.0329, + "step": 22050 + }, + { + "epoch": 8.967466449776332, + "grad_norm": 1.1246444401867575, + "learning_rate": 4.63639503176185e-06, + "loss": 0.0056, + "step": 22051 + }, + { + "epoch": 8.967873119154127, + "grad_norm": 10.675946681900516, + "learning_rate": 4.6355399396059395e-06, + "loss": 0.5105, + "step": 22052 + }, + { + "epoch": 8.968279788531923, + "grad_norm": 0.2797145210046656, + "learning_rate": 4.634684902518913e-06, + "loss": 0.0042, + "step": 22053 + }, + { + "epoch": 8.968686457909719, + "grad_norm": 0.00213654900227726, + "learning_rate": 4.633829920509546e-06, + "loss": 0.0, + "step": 22054 + }, + { + "epoch": 8.969093127287515, + "grad_norm": 0.8267793656754063, + "learning_rate": 4.632974993586615e-06, + "loss": 0.0131, + "step": 22055 + }, + { + "epoch": 8.96949979666531, + "grad_norm": 1.8263537437322084, + "learning_rate": 4.632120121758897e-06, + "loss": 0.028, + "step": 22056 + }, + { + "epoch": 8.969906466043106, + "grad_norm": 6.319904434902234, + "learning_rate": 4.631265305035164e-06, + "loss": 0.1094, + "step": 22057 + }, + { + "epoch": 8.970313135420902, + "grad_norm": 2.719340582087909, + "learning_rate": 4.630410543424199e-06, + "loss": 0.0357, + "step": 22058 + }, + { + "epoch": 8.970719804798698, + "grad_norm": 0.11271087234852091, + "learning_rate": 4.62955583693477e-06, + "loss": 0.0013, + "step": 22059 + 
}, + { + "epoch": 8.971126474176495, + "grad_norm": 2.826473059222347, + "learning_rate": 4.628701185575656e-06, + "loss": 0.0222, + "step": 22060 + }, + { + "epoch": 8.971533143554291, + "grad_norm": 0.02988499320586651, + "learning_rate": 4.627846589355625e-06, + "loss": 0.0003, + "step": 22061 + }, + { + "epoch": 8.971939812932087, + "grad_norm": 0.581895619466565, + "learning_rate": 4.626992048283455e-06, + "loss": 0.008, + "step": 22062 + }, + { + "epoch": 8.972346482309883, + "grad_norm": 3.88952650917761, + "learning_rate": 4.626137562367911e-06, + "loss": 0.0728, + "step": 22063 + }, + { + "epoch": 8.972753151687678, + "grad_norm": 0.08440267254755332, + "learning_rate": 4.625283131617773e-06, + "loss": 0.0013, + "step": 22064 + }, + { + "epoch": 8.973159821065474, + "grad_norm": 4.952130992620213, + "learning_rate": 4.6244287560418085e-06, + "loss": 0.3376, + "step": 22065 + }, + { + "epoch": 8.97356649044327, + "grad_norm": 5.046233053686845, + "learning_rate": 4.623574435648789e-06, + "loss": 0.0691, + "step": 22066 + }, + { + "epoch": 8.973973159821066, + "grad_norm": 0.015519317650365995, + "learning_rate": 4.622720170447483e-06, + "loss": 0.0002, + "step": 22067 + }, + { + "epoch": 8.974379829198861, + "grad_norm": 0.23634472438373064, + "learning_rate": 4.6218659604466624e-06, + "loss": 0.0033, + "step": 22068 + }, + { + "epoch": 8.974786498576657, + "grad_norm": 4.497467408681057, + "learning_rate": 4.621011805655093e-06, + "loss": 0.0567, + "step": 22069 + }, + { + "epoch": 8.975193167954453, + "grad_norm": 0.2808949096547275, + "learning_rate": 4.620157706081542e-06, + "loss": 0.0033, + "step": 22070 + }, + { + "epoch": 8.975599837332249, + "grad_norm": 1.8434608490856836, + "learning_rate": 4.619303661734782e-06, + "loss": 0.0238, + "step": 22071 + }, + { + "epoch": 8.976006506710045, + "grad_norm": 10.810887065521156, + "learning_rate": 4.618449672623578e-06, + "loss": 0.2131, + "step": 22072 + }, + { + "epoch": 8.97641317608784, + "grad_norm": 0.32876790669140116, + "learning_rate": 4.617595738756699e-06, + "loss": 0.0058, + "step": 22073 + }, + { + "epoch": 8.976819845465636, + "grad_norm": 4.529846609384836, + "learning_rate": 4.616741860142906e-06, + "loss": 0.0703, + "step": 22074 + }, + { + "epoch": 8.977226514843432, + "grad_norm": 8.65029220240834, + "learning_rate": 4.615888036790969e-06, + "loss": 0.3421, + "step": 22075 + }, + { + "epoch": 8.977633184221228, + "grad_norm": 0.047250840478843, + "learning_rate": 4.615034268709646e-06, + "loss": 0.0007, + "step": 22076 + }, + { + "epoch": 8.978039853599023, + "grad_norm": 0.0009333272432715057, + "learning_rate": 4.61418055590771e-06, + "loss": 0.0, + "step": 22077 + }, + { + "epoch": 8.978446522976819, + "grad_norm": 1.6251747002765533, + "learning_rate": 4.6133268983939215e-06, + "loss": 0.0126, + "step": 22078 + }, + { + "epoch": 8.978853192354615, + "grad_norm": 3.32462490583738, + "learning_rate": 4.612473296177044e-06, + "loss": 0.0285, + "step": 22079 + }, + { + "epoch": 8.979259861732412, + "grad_norm": 2.3556743693891273, + "learning_rate": 4.61161974926584e-06, + "loss": 0.1731, + "step": 22080 + }, + { + "epoch": 8.979666531110208, + "grad_norm": 0.02524903026929948, + "learning_rate": 4.61076625766907e-06, + "loss": 0.0003, + "step": 22081 + }, + { + "epoch": 8.980073200488004, + "grad_norm": 0.025743956748151418, + "learning_rate": 4.609912821395498e-06, + "loss": 0.0003, + "step": 22082 + }, + { + "epoch": 8.9804798698658, + "grad_norm": 0.01797339001464994, + "learning_rate": 
4.609059440453879e-06, + "loss": 0.0002, + "step": 22083 + }, + { + "epoch": 8.980886539243595, + "grad_norm": 0.00032646258542510425, + "learning_rate": 4.608206114852981e-06, + "loss": 0.0, + "step": 22084 + }, + { + "epoch": 8.981293208621391, + "grad_norm": 0.019466786172670194, + "learning_rate": 4.6073528446015626e-06, + "loss": 0.0003, + "step": 22085 + }, + { + "epoch": 8.981699877999187, + "grad_norm": 0.4158614677638374, + "learning_rate": 4.6064996297083805e-06, + "loss": 0.0037, + "step": 22086 + }, + { + "epoch": 8.982106547376983, + "grad_norm": 0.9127799643433665, + "learning_rate": 4.605646470182195e-06, + "loss": 0.0116, + "step": 22087 + }, + { + "epoch": 8.982513216754779, + "grad_norm": 1.158390911900235, + "learning_rate": 4.604793366031762e-06, + "loss": 0.0125, + "step": 22088 + }, + { + "epoch": 8.982919886132574, + "grad_norm": 0.0293185909088203, + "learning_rate": 4.603940317265842e-06, + "loss": 0.0004, + "step": 22089 + }, + { + "epoch": 8.98332655551037, + "grad_norm": 0.033521942571349814, + "learning_rate": 4.603087323893185e-06, + "loss": 0.0003, + "step": 22090 + }, + { + "epoch": 8.983733224888166, + "grad_norm": 0.07185388211950772, + "learning_rate": 4.602234385922557e-06, + "loss": 0.0009, + "step": 22091 + }, + { + "epoch": 8.984139894265962, + "grad_norm": 3.9508883557917467, + "learning_rate": 4.60138150336271e-06, + "loss": 0.0413, + "step": 22092 + }, + { + "epoch": 8.984546563643757, + "grad_norm": 8.040776560289382, + "learning_rate": 4.600528676222399e-06, + "loss": 0.1371, + "step": 22093 + }, + { + "epoch": 8.984953233021553, + "grad_norm": 0.033219085626682764, + "learning_rate": 4.599675904510379e-06, + "loss": 0.0005, + "step": 22094 + }, + { + "epoch": 8.985359902399349, + "grad_norm": 0.16418800999626132, + "learning_rate": 4.598823188235403e-06, + "loss": 0.0021, + "step": 22095 + }, + { + "epoch": 8.985766571777145, + "grad_norm": 0.07453377862344858, + "learning_rate": 4.597970527406223e-06, + "loss": 0.001, + "step": 22096 + }, + { + "epoch": 8.98617324115494, + "grad_norm": 0.016058000798185253, + "learning_rate": 4.597117922031597e-06, + "loss": 0.0002, + "step": 22097 + }, + { + "epoch": 8.986579910532736, + "grad_norm": 3.8204132096484207, + "learning_rate": 4.596265372120273e-06, + "loss": 0.0723, + "step": 22098 + }, + { + "epoch": 8.986986579910532, + "grad_norm": 2.5110799206828722, + "learning_rate": 4.595412877681007e-06, + "loss": 0.0238, + "step": 22099 + }, + { + "epoch": 8.987393249288328, + "grad_norm": 9.131332175513661, + "learning_rate": 4.594560438722547e-06, + "loss": 0.2244, + "step": 22100 + }, + { + "epoch": 8.987799918666125, + "grad_norm": 3.946710397203758, + "learning_rate": 4.593708055253645e-06, + "loss": 0.053, + "step": 22101 + }, + { + "epoch": 8.988206588043921, + "grad_norm": 0.48719914370255407, + "learning_rate": 4.592855727283051e-06, + "loss": 0.0042, + "step": 22102 + }, + { + "epoch": 8.988613257421717, + "grad_norm": 1.4624672307030249, + "learning_rate": 4.592003454819511e-06, + "loss": 0.0153, + "step": 22103 + }, + { + "epoch": 8.989019926799513, + "grad_norm": 5.504178132854234, + "learning_rate": 4.591151237871779e-06, + "loss": 0.0796, + "step": 22104 + }, + { + "epoch": 8.989426596177308, + "grad_norm": 0.044428215345569094, + "learning_rate": 4.590299076448603e-06, + "loss": 0.0002, + "step": 22105 + }, + { + "epoch": 8.989833265555104, + "grad_norm": 0.040666968468605805, + "learning_rate": 4.58944697055873e-06, + "loss": 0.0005, + "step": 22106 + }, + { + "epoch": 8.9902399349329, + 
"grad_norm": 0.09730443778180595, + "learning_rate": 4.588594920210906e-06, + "loss": 0.0016, + "step": 22107 + }, + { + "epoch": 8.990646604310696, + "grad_norm": 5.720018010125179, + "learning_rate": 4.58774292541388e-06, + "loss": 0.1026, + "step": 22108 + }, + { + "epoch": 8.991053273688491, + "grad_norm": 0.01495243454272571, + "learning_rate": 4.586890986176392e-06, + "loss": 0.0001, + "step": 22109 + }, + { + "epoch": 8.991459943066287, + "grad_norm": 4.4388233508863815, + "learning_rate": 4.586039102507196e-06, + "loss": 0.0653, + "step": 22110 + }, + { + "epoch": 8.991866612444083, + "grad_norm": 3.706320254293648, + "learning_rate": 4.585187274415034e-06, + "loss": 0.0648, + "step": 22111 + }, + { + "epoch": 8.992273281821879, + "grad_norm": 0.7695268970521602, + "learning_rate": 4.584335501908649e-06, + "loss": 0.0083, + "step": 22112 + }, + { + "epoch": 8.992679951199674, + "grad_norm": 0.06746918279429724, + "learning_rate": 4.583483784996787e-06, + "loss": 0.0008, + "step": 22113 + }, + { + "epoch": 8.99308662057747, + "grad_norm": 3.5093001561573183, + "learning_rate": 4.5826321236881886e-06, + "loss": 0.0471, + "step": 22114 + }, + { + "epoch": 8.993493289955266, + "grad_norm": 0.0005654588404770062, + "learning_rate": 4.5817805179916e-06, + "loss": 0.0, + "step": 22115 + }, + { + "epoch": 8.993899959333062, + "grad_norm": 0.29705911662118933, + "learning_rate": 4.5809289679157555e-06, + "loss": 0.005, + "step": 22116 + }, + { + "epoch": 8.994306628710858, + "grad_norm": 0.9016406703056232, + "learning_rate": 4.580077473469407e-06, + "loss": 0.011, + "step": 22117 + }, + { + "epoch": 8.994713298088653, + "grad_norm": 0.09726863464875724, + "learning_rate": 4.579226034661292e-06, + "loss": 0.0008, + "step": 22118 + }, + { + "epoch": 8.995119967466449, + "grad_norm": 1.1595760385315066, + "learning_rate": 4.578374651500148e-06, + "loss": 0.0144, + "step": 22119 + }, + { + "epoch": 8.995526636844245, + "grad_norm": 3.262333376875414, + "learning_rate": 4.577523323994718e-06, + "loss": 0.0272, + "step": 22120 + }, + { + "epoch": 8.995933306222042, + "grad_norm": 0.5529801633159772, + "learning_rate": 4.57667205215374e-06, + "loss": 0.0052, + "step": 22121 + }, + { + "epoch": 8.996339975599838, + "grad_norm": 0.15727185257708073, + "learning_rate": 4.575820835985949e-06, + "loss": 0.0017, + "step": 22122 + }, + { + "epoch": 8.996746644977634, + "grad_norm": 0.15191536719005458, + "learning_rate": 4.574969675500091e-06, + "loss": 0.0022, + "step": 22123 + }, + { + "epoch": 8.99715331435543, + "grad_norm": 5.299478542287229, + "learning_rate": 4.574118570704899e-06, + "loss": 0.0634, + "step": 22124 + }, + { + "epoch": 8.997559983733225, + "grad_norm": 0.01948242782262579, + "learning_rate": 4.573267521609112e-06, + "loss": 0.0003, + "step": 22125 + }, + { + "epoch": 8.997966653111021, + "grad_norm": 0.16385268091675287, + "learning_rate": 4.572416528221465e-06, + "loss": 0.0028, + "step": 22126 + }, + { + "epoch": 8.998373322488817, + "grad_norm": 16.293989735315, + "learning_rate": 4.571565590550693e-06, + "loss": 0.3824, + "step": 22127 + }, + { + "epoch": 8.998779991866613, + "grad_norm": 0.009701213194403232, + "learning_rate": 4.5707147086055325e-06, + "loss": 0.0001, + "step": 22128 + }, + { + "epoch": 8.999186661244408, + "grad_norm": 0.0037416911767156956, + "learning_rate": 4.569863882394714e-06, + "loss": 0.0, + "step": 22129 + }, + { + "epoch": 8.999593330622204, + "grad_norm": 0.7120806503987761, + "learning_rate": 4.56901311192698e-06, + "loss": 0.009, + "step": 
22130 + }, + { + "epoch": 9.0, + "grad_norm": 0.6525676798108013, + "learning_rate": 4.56816239721106e-06, + "loss": 0.009, + "step": 22131 + }, + { + "epoch": 9.000406669377796, + "grad_norm": 0.14896282267526578, + "learning_rate": 4.5673117382556865e-06, + "loss": 0.0014, + "step": 22132 + }, + { + "epoch": 9.000813338755592, + "grad_norm": 0.16666995481652505, + "learning_rate": 4.566461135069592e-06, + "loss": 0.0023, + "step": 22133 + }, + { + "epoch": 9.001220008133387, + "grad_norm": 2.655548549256506, + "learning_rate": 4.5656105876615085e-06, + "loss": 0.0351, + "step": 22134 + }, + { + "epoch": 9.001626677511183, + "grad_norm": 0.022768706013062502, + "learning_rate": 4.564760096040164e-06, + "loss": 0.0003, + "step": 22135 + }, + { + "epoch": 9.002033346888979, + "grad_norm": 0.01131335475461241, + "learning_rate": 4.563909660214296e-06, + "loss": 0.0001, + "step": 22136 + }, + { + "epoch": 9.002440016266775, + "grad_norm": 0.01981960858879438, + "learning_rate": 4.563059280192632e-06, + "loss": 0.0003, + "step": 22137 + }, + { + "epoch": 9.00284668564457, + "grad_norm": 1.3512634482422214, + "learning_rate": 4.5622089559839e-06, + "loss": 0.0219, + "step": 22138 + }, + { + "epoch": 9.003253355022366, + "grad_norm": 0.1829028870979551, + "learning_rate": 4.561358687596829e-06, + "loss": 0.0024, + "step": 22139 + }, + { + "epoch": 9.003660024400162, + "grad_norm": 0.12215591807079992, + "learning_rate": 4.56050847504015e-06, + "loss": 0.001, + "step": 22140 + }, + { + "epoch": 9.004066693777958, + "grad_norm": 0.010050248086430657, + "learning_rate": 4.5596583183225895e-06, + "loss": 0.0001, + "step": 22141 + }, + { + "epoch": 9.004473363155755, + "grad_norm": 0.07165034999736193, + "learning_rate": 4.5588082174528695e-06, + "loss": 0.0012, + "step": 22142 + }, + { + "epoch": 9.004880032533551, + "grad_norm": 0.7238769151710127, + "learning_rate": 4.557958172439726e-06, + "loss": 0.0099, + "step": 22143 + }, + { + "epoch": 9.005286701911347, + "grad_norm": 0.012070123390885314, + "learning_rate": 4.5571081832918805e-06, + "loss": 0.0001, + "step": 22144 + }, + { + "epoch": 9.005693371289142, + "grad_norm": 0.5682956175738425, + "learning_rate": 4.556258250018059e-06, + "loss": 0.0046, + "step": 22145 + }, + { + "epoch": 9.006100040666938, + "grad_norm": 3.869597527530173, + "learning_rate": 4.5554083726269856e-06, + "loss": 0.0505, + "step": 22146 + }, + { + "epoch": 9.006506710044734, + "grad_norm": 0.6773623078083303, + "learning_rate": 4.554558551127386e-06, + "loss": 0.0092, + "step": 22147 + }, + { + "epoch": 9.00691337942253, + "grad_norm": 0.1964414333108888, + "learning_rate": 4.553708785527984e-06, + "loss": 0.0023, + "step": 22148 + }, + { + "epoch": 9.007320048800326, + "grad_norm": 0.8208861988244949, + "learning_rate": 4.552859075837501e-06, + "loss": 0.0113, + "step": 22149 + }, + { + "epoch": 9.007726718178121, + "grad_norm": 0.026498959877809657, + "learning_rate": 4.552009422064663e-06, + "loss": 0.0003, + "step": 22150 + }, + { + "epoch": 9.008133387555917, + "grad_norm": 0.011024371759681496, + "learning_rate": 4.551159824218189e-06, + "loss": 0.0001, + "step": 22151 + }, + { + "epoch": 9.008540056933713, + "grad_norm": 0.24156085997798116, + "learning_rate": 4.550310282306801e-06, + "loss": 0.0027, + "step": 22152 + }, + { + "epoch": 9.008946726311509, + "grad_norm": 0.13814517876521165, + "learning_rate": 4.549460796339218e-06, + "loss": 0.0014, + "step": 22153 + }, + { + "epoch": 9.009353395689304, + "grad_norm": 3.9058020665716846, + "learning_rate": 
4.548611366324167e-06, + "loss": 0.0523, + "step": 22154 + }, + { + "epoch": 9.0097600650671, + "grad_norm": 1.561403403893333, + "learning_rate": 4.5477619922703645e-06, + "loss": 0.0231, + "step": 22155 + }, + { + "epoch": 9.010166734444896, + "grad_norm": 0.056967525980901663, + "learning_rate": 4.546912674186527e-06, + "loss": 0.0008, + "step": 22156 + }, + { + "epoch": 9.010573403822692, + "grad_norm": 0.3577692021345606, + "learning_rate": 4.546063412081377e-06, + "loss": 0.0036, + "step": 22157 + }, + { + "epoch": 9.010980073200487, + "grad_norm": 0.014942211421793155, + "learning_rate": 4.545214205963632e-06, + "loss": 0.0002, + "step": 22158 + }, + { + "epoch": 9.011386742578283, + "grad_norm": 10.679496955287977, + "learning_rate": 4.544365055842006e-06, + "loss": 0.1896, + "step": 22159 + }, + { + "epoch": 9.011793411956079, + "grad_norm": 0.02522981549375204, + "learning_rate": 4.5435159617252165e-06, + "loss": 0.0003, + "step": 22160 + }, + { + "epoch": 9.012200081333875, + "grad_norm": 0.10033704473775243, + "learning_rate": 4.542666923621986e-06, + "loss": 0.0012, + "step": 22161 + }, + { + "epoch": 9.012606750711672, + "grad_norm": 0.6371424510720886, + "learning_rate": 4.5418179415410246e-06, + "loss": 0.0066, + "step": 22162 + }, + { + "epoch": 9.013013420089468, + "grad_norm": 2.0271695950319453, + "learning_rate": 4.540969015491049e-06, + "loss": 0.0264, + "step": 22163 + }, + { + "epoch": 9.013420089467264, + "grad_norm": 0.024535202994683748, + "learning_rate": 4.5401201454807756e-06, + "loss": 0.0003, + "step": 22164 + }, + { + "epoch": 9.01382675884506, + "grad_norm": 0.005301095210236139, + "learning_rate": 4.5392713315189154e-06, + "loss": 0.0001, + "step": 22165 + }, + { + "epoch": 9.014233428222855, + "grad_norm": 0.18839422481614024, + "learning_rate": 4.5384225736141794e-06, + "loss": 0.0034, + "step": 22166 + }, + { + "epoch": 9.014640097600651, + "grad_norm": 2.3002291927432155, + "learning_rate": 4.537573871775289e-06, + "loss": 0.0179, + "step": 22167 + }, + { + "epoch": 9.015046766978447, + "grad_norm": 0.06713726330086914, + "learning_rate": 4.536725226010952e-06, + "loss": 0.001, + "step": 22168 + }, + { + "epoch": 9.015453436356243, + "grad_norm": 4.201552641810212, + "learning_rate": 4.53587663632988e-06, + "loss": 0.1965, + "step": 22169 + }, + { + "epoch": 9.015860105734038, + "grad_norm": 2.6630562505057047, + "learning_rate": 4.535028102740785e-06, + "loss": 0.0356, + "step": 22170 + }, + { + "epoch": 9.016266775111834, + "grad_norm": 0.5419643680968353, + "learning_rate": 4.534179625252376e-06, + "loss": 0.0048, + "step": 22171 + }, + { + "epoch": 9.01667344448963, + "grad_norm": 1.6357569549377902, + "learning_rate": 4.533331203873365e-06, + "loss": 0.0215, + "step": 22172 + }, + { + "epoch": 9.017080113867426, + "grad_norm": 0.03998954780623494, + "learning_rate": 4.532482838612456e-06, + "loss": 0.0004, + "step": 22173 + }, + { + "epoch": 9.017486783245221, + "grad_norm": 0.7260483055390446, + "learning_rate": 4.531634529478366e-06, + "loss": 0.0134, + "step": 22174 + }, + { + "epoch": 9.017893452623017, + "grad_norm": 0.003015198194156148, + "learning_rate": 4.5307862764798e-06, + "loss": 0.0, + "step": 22175 + }, + { + "epoch": 9.018300122000813, + "grad_norm": 0.02696118136841086, + "learning_rate": 4.5299380796254654e-06, + "loss": 0.0003, + "step": 22176 + }, + { + "epoch": 9.018706791378609, + "grad_norm": 0.12498526074027083, + "learning_rate": 4.52908993892407e-06, + "loss": 0.0011, + "step": 22177 + }, + { + "epoch": 
9.019113460756405, + "grad_norm": 0.7923962822477523, + "learning_rate": 4.528241854384319e-06, + "loss": 0.0071, + "step": 22178 + }, + { + "epoch": 9.0195201301342, + "grad_norm": 0.20248009975757195, + "learning_rate": 4.527393826014915e-06, + "loss": 0.0017, + "step": 22179 + }, + { + "epoch": 9.019926799511996, + "grad_norm": 0.20913384032931198, + "learning_rate": 4.526545853824572e-06, + "loss": 0.0028, + "step": 22180 + }, + { + "epoch": 9.020333468889792, + "grad_norm": 0.11468913332676364, + "learning_rate": 4.5256979378219914e-06, + "loss": 0.0016, + "step": 22181 + }, + { + "epoch": 9.020740138267588, + "grad_norm": 7.198361544524543, + "learning_rate": 4.524850078015875e-06, + "loss": 0.2017, + "step": 22182 + }, + { + "epoch": 9.021146807645385, + "grad_norm": 0.6735722938562164, + "learning_rate": 4.524002274414931e-06, + "loss": 0.01, + "step": 22183 + }, + { + "epoch": 9.021553477023181, + "grad_norm": 0.08726850145570761, + "learning_rate": 4.5231545270278564e-06, + "loss": 0.0007, + "step": 22184 + }, + { + "epoch": 9.021960146400977, + "grad_norm": 0.5280052863099441, + "learning_rate": 4.522306835863359e-06, + "loss": 0.0078, + "step": 22185 + }, + { + "epoch": 9.022366815778772, + "grad_norm": 0.013708322371009276, + "learning_rate": 4.521459200930134e-06, + "loss": 0.0002, + "step": 22186 + }, + { + "epoch": 9.022773485156568, + "grad_norm": 0.031068707743262116, + "learning_rate": 4.520611622236893e-06, + "loss": 0.0003, + "step": 22187 + }, + { + "epoch": 9.023180154534364, + "grad_norm": 0.03866697164211281, + "learning_rate": 4.51976409979233e-06, + "loss": 0.0004, + "step": 22188 + }, + { + "epoch": 9.02358682391216, + "grad_norm": 8.423486488184267, + "learning_rate": 4.518916633605146e-06, + "loss": 0.0253, + "step": 22189 + }, + { + "epoch": 9.023993493289955, + "grad_norm": 0.4660251680118593, + "learning_rate": 4.5180692236840425e-06, + "loss": 0.0056, + "step": 22190 + }, + { + "epoch": 9.024400162667751, + "grad_norm": 1.6409479398867295, + "learning_rate": 4.517221870037717e-06, + "loss": 0.0281, + "step": 22191 + }, + { + "epoch": 9.024806832045547, + "grad_norm": 0.16845871690843145, + "learning_rate": 4.516374572674869e-06, + "loss": 0.0022, + "step": 22192 + }, + { + "epoch": 9.025213501423343, + "grad_norm": 0.0016457806700672657, + "learning_rate": 4.5155273316041915e-06, + "loss": 0.0, + "step": 22193 + }, + { + "epoch": 9.025620170801139, + "grad_norm": 0.2581808902530699, + "learning_rate": 4.51468014683439e-06, + "loss": 0.0028, + "step": 22194 + }, + { + "epoch": 9.026026840178934, + "grad_norm": 1.8638746458786735, + "learning_rate": 4.513833018374157e-06, + "loss": 0.0289, + "step": 22195 + }, + { + "epoch": 9.02643350955673, + "grad_norm": 8.601824958038382, + "learning_rate": 4.51298594623219e-06, + "loss": 0.1504, + "step": 22196 + }, + { + "epoch": 9.026840178934526, + "grad_norm": 1.2537574130107267, + "learning_rate": 4.512138930417185e-06, + "loss": 0.0121, + "step": 22197 + }, + { + "epoch": 9.027246848312322, + "grad_norm": 1.4467080080667876, + "learning_rate": 4.511291970937833e-06, + "loss": 0.0221, + "step": 22198 + }, + { + "epoch": 9.027653517690117, + "grad_norm": 0.00487359796757412, + "learning_rate": 4.51044506780283e-06, + "loss": 0.0, + "step": 22199 + }, + { + "epoch": 9.028060187067913, + "grad_norm": 0.04261924820123803, + "learning_rate": 4.5095982210208745e-06, + "loss": 0.0006, + "step": 22200 + }, + { + "epoch": 9.028466856445709, + "grad_norm": 0.05938057789195344, + "learning_rate": 4.508751430600655e-06, + 
"loss": 0.0007, + "step": 22201 + }, + { + "epoch": 9.028873525823505, + "grad_norm": 3.1033091860556175, + "learning_rate": 4.507904696550868e-06, + "loss": 0.0509, + "step": 22202 + }, + { + "epoch": 9.029280195201302, + "grad_norm": 0.3463314377053698, + "learning_rate": 4.5070580188802015e-06, + "loss": 0.0031, + "step": 22203 + }, + { + "epoch": 9.029686864579098, + "grad_norm": 0.11435003459559911, + "learning_rate": 4.5062113975973495e-06, + "loss": 0.0011, + "step": 22204 + }, + { + "epoch": 9.030093533956894, + "grad_norm": 0.0008587457283516612, + "learning_rate": 4.505364832711003e-06, + "loss": 0.0, + "step": 22205 + }, + { + "epoch": 9.03050020333469, + "grad_norm": 0.7902001522374261, + "learning_rate": 4.504518324229848e-06, + "loss": 0.009, + "step": 22206 + }, + { + "epoch": 9.030906872712485, + "grad_norm": 0.04490367469189716, + "learning_rate": 4.503671872162581e-06, + "loss": 0.0005, + "step": 22207 + }, + { + "epoch": 9.031313542090281, + "grad_norm": 0.050909293272990735, + "learning_rate": 4.50282547651789e-06, + "loss": 0.0006, + "step": 22208 + }, + { + "epoch": 9.031720211468077, + "grad_norm": 4.905912344393199, + "learning_rate": 4.501979137304462e-06, + "loss": 0.0518, + "step": 22209 + }, + { + "epoch": 9.032126880845873, + "grad_norm": 0.17948470147275591, + "learning_rate": 4.501132854530984e-06, + "loss": 0.0015, + "step": 22210 + }, + { + "epoch": 9.032533550223668, + "grad_norm": 0.026330200671176057, + "learning_rate": 4.500286628206145e-06, + "loss": 0.0003, + "step": 22211 + }, + { + "epoch": 9.032940219601464, + "grad_norm": 0.019133560438204737, + "learning_rate": 4.499440458338629e-06, + "loss": 0.0002, + "step": 22212 + }, + { + "epoch": 9.03334688897926, + "grad_norm": 0.003913147559140202, + "learning_rate": 4.498594344937129e-06, + "loss": 0.0, + "step": 22213 + }, + { + "epoch": 9.033753558357056, + "grad_norm": 0.0010301809295925031, + "learning_rate": 4.497748288010325e-06, + "loss": 0.0, + "step": 22214 + }, + { + "epoch": 9.034160227734851, + "grad_norm": 0.015258682372956078, + "learning_rate": 4.496902287566905e-06, + "loss": 0.0002, + "step": 22215 + }, + { + "epoch": 9.034566897112647, + "grad_norm": 2.4791041002788012, + "learning_rate": 4.496056343615554e-06, + "loss": 0.0303, + "step": 22216 + }, + { + "epoch": 9.034973566490443, + "grad_norm": 0.21889180180170556, + "learning_rate": 4.4952104561649525e-06, + "loss": 0.0025, + "step": 22217 + }, + { + "epoch": 9.035380235868239, + "grad_norm": 0.11606652156828648, + "learning_rate": 4.494364625223787e-06, + "loss": 0.001, + "step": 22218 + }, + { + "epoch": 9.035786905246034, + "grad_norm": 0.007647452022557166, + "learning_rate": 4.4935188508007365e-06, + "loss": 0.0001, + "step": 22219 + }, + { + "epoch": 9.03619357462383, + "grad_norm": 0.01447646234406435, + "learning_rate": 4.4926731329044895e-06, + "loss": 0.0002, + "step": 22220 + }, + { + "epoch": 9.036600244001626, + "grad_norm": 2.400554889398575, + "learning_rate": 4.491827471543725e-06, + "loss": 0.0389, + "step": 22221 + }, + { + "epoch": 9.037006913379422, + "grad_norm": 0.5400727139591287, + "learning_rate": 4.4909818667271235e-06, + "loss": 0.0068, + "step": 22222 + }, + { + "epoch": 9.037413582757218, + "grad_norm": 0.028138984831497113, + "learning_rate": 4.490136318463365e-06, + "loss": 0.0003, + "step": 22223 + }, + { + "epoch": 9.037820252135015, + "grad_norm": 0.7678828677958569, + "learning_rate": 4.489290826761131e-06, + "loss": 0.0089, + "step": 22224 + }, + { + "epoch": 9.03822692151281, + "grad_norm": 
0.2689531228415937, + "learning_rate": 4.488445391629096e-06, + "loss": 0.002, + "step": 22225 + }, + { + "epoch": 9.038633590890607, + "grad_norm": 12.122509126734755, + "learning_rate": 4.4876000130759475e-06, + "loss": 0.1103, + "step": 22226 + }, + { + "epoch": 9.039040260268402, + "grad_norm": 0.6141628356151022, + "learning_rate": 4.486754691110359e-06, + "loss": 0.0077, + "step": 22227 + }, + { + "epoch": 9.039446929646198, + "grad_norm": 0.03279641200440722, + "learning_rate": 4.485909425741008e-06, + "loss": 0.0005, + "step": 22228 + }, + { + "epoch": 9.039853599023994, + "grad_norm": 0.10661702880047905, + "learning_rate": 4.485064216976572e-06, + "loss": 0.0013, + "step": 22229 + }, + { + "epoch": 9.04026026840179, + "grad_norm": 0.10301653278317309, + "learning_rate": 4.4842190648257266e-06, + "loss": 0.0011, + "step": 22230 + }, + { + "epoch": 9.040666937779585, + "grad_norm": 0.5567380911842321, + "learning_rate": 4.48337396929715e-06, + "loss": 0.0052, + "step": 22231 + }, + { + "epoch": 9.041073607157381, + "grad_norm": 0.045102545959708654, + "learning_rate": 4.482528930399511e-06, + "loss": 0.0005, + "step": 22232 + }, + { + "epoch": 9.041480276535177, + "grad_norm": 1.1090952442210433, + "learning_rate": 4.481683948141494e-06, + "loss": 0.0127, + "step": 22233 + }, + { + "epoch": 9.041886945912973, + "grad_norm": 1.5079603283316634, + "learning_rate": 4.480839022531769e-06, + "loss": 0.0193, + "step": 22234 + }, + { + "epoch": 9.042293615290768, + "grad_norm": 7.312705975722899, + "learning_rate": 4.479994153579008e-06, + "loss": 0.2623, + "step": 22235 + }, + { + "epoch": 9.042700284668564, + "grad_norm": 6.35271987247225, + "learning_rate": 4.479149341291885e-06, + "loss": 0.1378, + "step": 22236 + }, + { + "epoch": 9.04310695404636, + "grad_norm": 0.1241250086461014, + "learning_rate": 4.478304585679074e-06, + "loss": 0.0014, + "step": 22237 + }, + { + "epoch": 9.043513623424156, + "grad_norm": 0.01194769650387905, + "learning_rate": 4.477459886749242e-06, + "loss": 0.0002, + "step": 22238 + }, + { + "epoch": 9.043920292801952, + "grad_norm": 0.651755254923441, + "learning_rate": 4.476615244511065e-06, + "loss": 0.0067, + "step": 22239 + }, + { + "epoch": 9.044326962179747, + "grad_norm": 0.35010750397095763, + "learning_rate": 4.475770658973214e-06, + "loss": 0.0037, + "step": 22240 + }, + { + "epoch": 9.044733631557543, + "grad_norm": 0.012519976596280118, + "learning_rate": 4.474926130144358e-06, + "loss": 0.0001, + "step": 22241 + }, + { + "epoch": 9.045140300935339, + "grad_norm": 0.27486996750919485, + "learning_rate": 4.474081658033165e-06, + "loss": 0.0039, + "step": 22242 + }, + { + "epoch": 9.045546970313135, + "grad_norm": 4.651898285346508, + "learning_rate": 4.4732372426483055e-06, + "loss": 0.14, + "step": 22243 + }, + { + "epoch": 9.045953639690932, + "grad_norm": 0.1684322127039681, + "learning_rate": 4.472392883998446e-06, + "loss": 0.0012, + "step": 22244 + }, + { + "epoch": 9.046360309068728, + "grad_norm": 0.10299059338387101, + "learning_rate": 4.4715485820922535e-06, + "loss": 0.0017, + "step": 22245 + }, + { + "epoch": 9.046766978446524, + "grad_norm": 0.0005529031374175613, + "learning_rate": 4.4707043369384006e-06, + "loss": 0.0, + "step": 22246 + }, + { + "epoch": 9.04717364782432, + "grad_norm": 0.0006547194726838669, + "learning_rate": 4.469860148545554e-06, + "loss": 0.0, + "step": 22247 + }, + { + "epoch": 9.047580317202115, + "grad_norm": 1.2643663870620756, + "learning_rate": 4.469016016922372e-06, + "loss": 0.0155, + "step": 22248 + }, 
+ { + "epoch": 9.047986986579911, + "grad_norm": 4.154303759204665, + "learning_rate": 4.468171942077525e-06, + "loss": 0.0488, + "step": 22249 + }, + { + "epoch": 9.048393655957707, + "grad_norm": 0.5351678653290978, + "learning_rate": 4.467327924019672e-06, + "loss": 0.0069, + "step": 22250 + }, + { + "epoch": 9.048800325335502, + "grad_norm": 0.00706917276393546, + "learning_rate": 4.4664839627574876e-06, + "loss": 0.0001, + "step": 22251 + }, + { + "epoch": 9.049206994713298, + "grad_norm": 0.6746616667192831, + "learning_rate": 4.46564005829963e-06, + "loss": 0.0084, + "step": 22252 + }, + { + "epoch": 9.049613664091094, + "grad_norm": 0.2140349460666078, + "learning_rate": 4.464796210654763e-06, + "loss": 0.0025, + "step": 22253 + }, + { + "epoch": 9.05002033346889, + "grad_norm": 0.5974045277719693, + "learning_rate": 4.463952419831548e-06, + "loss": 0.0034, + "step": 22254 + }, + { + "epoch": 9.050427002846686, + "grad_norm": 0.010442865955142638, + "learning_rate": 4.463108685838647e-06, + "loss": 0.0001, + "step": 22255 + }, + { + "epoch": 9.050833672224481, + "grad_norm": 5.1826837555883625, + "learning_rate": 4.46226500868472e-06, + "loss": 0.0758, + "step": 22256 + }, + { + "epoch": 9.051240341602277, + "grad_norm": 2.174827938218911, + "learning_rate": 4.461421388378432e-06, + "loss": 0.0216, + "step": 22257 + }, + { + "epoch": 9.051647010980073, + "grad_norm": 0.18770808923946444, + "learning_rate": 4.4605778249284414e-06, + "loss": 0.0014, + "step": 22258 + }, + { + "epoch": 9.052053680357869, + "grad_norm": 0.5849905946860463, + "learning_rate": 4.4597343183434075e-06, + "loss": 0.0047, + "step": 22259 + }, + { + "epoch": 9.052460349735664, + "grad_norm": 0.10098886285321865, + "learning_rate": 4.45889086863199e-06, + "loss": 0.001, + "step": 22260 + }, + { + "epoch": 9.05286701911346, + "grad_norm": 0.09480237763067653, + "learning_rate": 4.458047475802845e-06, + "loss": 0.001, + "step": 22261 + }, + { + "epoch": 9.053273688491256, + "grad_norm": 0.2702397416236524, + "learning_rate": 4.457204139864633e-06, + "loss": 0.0028, + "step": 22262 + }, + { + "epoch": 9.053680357869052, + "grad_norm": 0.006417991588290258, + "learning_rate": 4.4563608608260055e-06, + "loss": 0.0001, + "step": 22263 + }, + { + "epoch": 9.054087027246847, + "grad_norm": 0.053921183412763106, + "learning_rate": 4.455517638695628e-06, + "loss": 0.0005, + "step": 22264 + }, + { + "epoch": 9.054493696624645, + "grad_norm": 0.040389131417064086, + "learning_rate": 4.454674473482152e-06, + "loss": 0.0004, + "step": 22265 + }, + { + "epoch": 9.05490036600244, + "grad_norm": 0.11086668989479534, + "learning_rate": 4.453831365194235e-06, + "loss": 0.001, + "step": 22266 + }, + { + "epoch": 9.055307035380237, + "grad_norm": 0.011212598807379437, + "learning_rate": 4.452988313840528e-06, + "loss": 0.0002, + "step": 22267 + }, + { + "epoch": 9.055713704758032, + "grad_norm": 0.02250314989211889, + "learning_rate": 4.4521453194296895e-06, + "loss": 0.0002, + "step": 22268 + }, + { + "epoch": 9.056120374135828, + "grad_norm": 5.255301060378736, + "learning_rate": 4.451302381970367e-06, + "loss": 0.0486, + "step": 22269 + }, + { + "epoch": 9.056527043513624, + "grad_norm": 0.197420581297246, + "learning_rate": 4.450459501471223e-06, + "loss": 0.0026, + "step": 22270 + }, + { + "epoch": 9.05693371289142, + "grad_norm": 0.06858615320393431, + "learning_rate": 4.449616677940904e-06, + "loss": 0.0005, + "step": 22271 + }, + { + "epoch": 9.057340382269215, + "grad_norm": 0.0036692313753040255, + "learning_rate": 
4.4487739113880635e-06, + "loss": 0.0, + "step": 22272 + }, + { + "epoch": 9.057747051647011, + "grad_norm": 4.496776297004968, + "learning_rate": 4.447931201821353e-06, + "loss": 0.057, + "step": 22273 + }, + { + "epoch": 9.058153721024807, + "grad_norm": 0.3103206047726268, + "learning_rate": 4.4470885492494224e-06, + "loss": 0.0025, + "step": 22274 + }, + { + "epoch": 9.058560390402603, + "grad_norm": 0.0012991387185220342, + "learning_rate": 4.446245953680922e-06, + "loss": 0.0, + "step": 22275 + }, + { + "epoch": 9.058967059780398, + "grad_norm": 0.09976180741063014, + "learning_rate": 4.4454034151244995e-06, + "loss": 0.0014, + "step": 22276 + }, + { + "epoch": 9.059373729158194, + "grad_norm": 0.05594515638389723, + "learning_rate": 4.444560933588809e-06, + "loss": 0.0005, + "step": 22277 + }, + { + "epoch": 9.05978039853599, + "grad_norm": 0.020376816433696446, + "learning_rate": 4.443718509082499e-06, + "loss": 0.0002, + "step": 22278 + }, + { + "epoch": 9.060187067913786, + "grad_norm": 0.08793145443554476, + "learning_rate": 4.442876141614213e-06, + "loss": 0.0009, + "step": 22279 + }, + { + "epoch": 9.060593737291581, + "grad_norm": 0.02651749944844219, + "learning_rate": 4.4420338311926005e-06, + "loss": 0.0003, + "step": 22280 + }, + { + "epoch": 9.061000406669377, + "grad_norm": 5.65137351829869, + "learning_rate": 4.441191577826308e-06, + "loss": 0.0469, + "step": 22281 + }, + { + "epoch": 9.061407076047173, + "grad_norm": 0.20709587321762388, + "learning_rate": 4.440349381523979e-06, + "loss": 0.0023, + "step": 22282 + }, + { + "epoch": 9.061813745424969, + "grad_norm": 0.022368941783310184, + "learning_rate": 4.439507242294263e-06, + "loss": 0.0004, + "step": 22283 + }, + { + "epoch": 9.062220414802765, + "grad_norm": 0.07948448430699816, + "learning_rate": 4.438665160145805e-06, + "loss": 0.0003, + "step": 22284 + }, + { + "epoch": 9.062627084180562, + "grad_norm": 0.28674066793902964, + "learning_rate": 4.437823135087249e-06, + "loss": 0.0027, + "step": 22285 + }, + { + "epoch": 9.063033753558358, + "grad_norm": 0.5357746634340697, + "learning_rate": 4.436981167127237e-06, + "loss": 0.0061, + "step": 22286 + }, + { + "epoch": 9.063440422936154, + "grad_norm": 0.20698450123336704, + "learning_rate": 4.436139256274413e-06, + "loss": 0.0016, + "step": 22287 + }, + { + "epoch": 9.06384709231395, + "grad_norm": 0.3378318038986764, + "learning_rate": 4.435297402537421e-06, + "loss": 0.0037, + "step": 22288 + }, + { + "epoch": 9.064253761691745, + "grad_norm": 0.10538727953556316, + "learning_rate": 4.434455605924897e-06, + "loss": 0.0014, + "step": 22289 + }, + { + "epoch": 9.064660431069541, + "grad_norm": 0.2723319237857102, + "learning_rate": 4.4336138664454905e-06, + "loss": 0.0022, + "step": 22290 + }, + { + "epoch": 9.065067100447337, + "grad_norm": 0.19733037155213703, + "learning_rate": 4.43277218410784e-06, + "loss": 0.0032, + "step": 22291 + }, + { + "epoch": 9.065473769825132, + "grad_norm": 8.412211431482504, + "learning_rate": 4.431930558920585e-06, + "loss": 0.0987, + "step": 22292 + }, + { + "epoch": 9.065880439202928, + "grad_norm": 0.013735933229276558, + "learning_rate": 4.431088990892365e-06, + "loss": 0.0002, + "step": 22293 + }, + { + "epoch": 9.066287108580724, + "grad_norm": 0.9461306319019286, + "learning_rate": 4.4302474800318195e-06, + "loss": 0.0117, + "step": 22294 + }, + { + "epoch": 9.06669377795852, + "grad_norm": 2.692149188422957, + "learning_rate": 4.4294060263475825e-06, + "loss": 0.0318, + "step": 22295 + }, + { + "epoch": 
9.067100447336315, + "grad_norm": 0.23142673539003392, + "learning_rate": 4.4285646298483e-06, + "loss": 0.0026, + "step": 22296 + }, + { + "epoch": 9.067507116714111, + "grad_norm": 0.8973453567808873, + "learning_rate": 4.427723290542606e-06, + "loss": 0.0093, + "step": 22297 + }, + { + "epoch": 9.067913786091907, + "grad_norm": 0.0740032523510147, + "learning_rate": 4.426882008439137e-06, + "loss": 0.0007, + "step": 22298 + }, + { + "epoch": 9.068320455469703, + "grad_norm": 9.490286083673299, + "learning_rate": 4.426040783546529e-06, + "loss": 0.087, + "step": 22299 + }, + { + "epoch": 9.068727124847499, + "grad_norm": 0.5160264287341211, + "learning_rate": 4.425199615873418e-06, + "loss": 0.0063, + "step": 22300 + }, + { + "epoch": 9.069133794225294, + "grad_norm": 0.12197233189817631, + "learning_rate": 4.424358505428438e-06, + "loss": 0.0011, + "step": 22301 + }, + { + "epoch": 9.06954046360309, + "grad_norm": 0.5503415502615864, + "learning_rate": 4.42351745222022e-06, + "loss": 0.0027, + "step": 22302 + }, + { + "epoch": 9.069947132980886, + "grad_norm": 0.9590270005750047, + "learning_rate": 4.422676456257407e-06, + "loss": 0.0105, + "step": 22303 + }, + { + "epoch": 9.070353802358682, + "grad_norm": 3.706966388116609, + "learning_rate": 4.421835517548626e-06, + "loss": 0.0848, + "step": 22304 + }, + { + "epoch": 9.070760471736477, + "grad_norm": 0.6766317675471369, + "learning_rate": 4.4209946361025114e-06, + "loss": 0.0048, + "step": 22305 + }, + { + "epoch": 9.071167141114275, + "grad_norm": 0.02551065078257282, + "learning_rate": 4.420153811927694e-06, + "loss": 0.0002, + "step": 22306 + }, + { + "epoch": 9.07157381049207, + "grad_norm": 0.46719187565908304, + "learning_rate": 4.419313045032807e-06, + "loss": 0.0049, + "step": 22307 + }, + { + "epoch": 9.071980479869866, + "grad_norm": 0.1186231661819912, + "learning_rate": 4.4184723354264805e-06, + "loss": 0.0014, + "step": 22308 + }, + { + "epoch": 9.072387149247662, + "grad_norm": 0.041382666642799694, + "learning_rate": 4.417631683117341e-06, + "loss": 0.0004, + "step": 22309 + }, + { + "epoch": 9.072793818625458, + "grad_norm": 0.5244100713183946, + "learning_rate": 4.416791088114024e-06, + "loss": 0.007, + "step": 22310 + }, + { + "epoch": 9.073200488003254, + "grad_norm": 0.27081301905750277, + "learning_rate": 4.415950550425157e-06, + "loss": 0.0025, + "step": 22311 + }, + { + "epoch": 9.07360715738105, + "grad_norm": 0.40599212749425045, + "learning_rate": 4.415110070059368e-06, + "loss": 0.0024, + "step": 22312 + }, + { + "epoch": 9.074013826758845, + "grad_norm": 0.16274451163029519, + "learning_rate": 4.4142696470252856e-06, + "loss": 0.0018, + "step": 22313 + }, + { + "epoch": 9.074420496136641, + "grad_norm": 0.05662499734476163, + "learning_rate": 4.413429281331536e-06, + "loss": 0.0005, + "step": 22314 + }, + { + "epoch": 9.074827165514437, + "grad_norm": 0.04601347973309193, + "learning_rate": 4.4125889729867435e-06, + "loss": 0.0004, + "step": 22315 + }, + { + "epoch": 9.075233834892233, + "grad_norm": 0.008622988327996987, + "learning_rate": 4.41174872199954e-06, + "loss": 0.0001, + "step": 22316 + }, + { + "epoch": 9.075640504270028, + "grad_norm": 0.008635047153999728, + "learning_rate": 4.410908528378549e-06, + "loss": 0.0002, + "step": 22317 + }, + { + "epoch": 9.076047173647824, + "grad_norm": 1.6540034795655714, + "learning_rate": 4.410068392132395e-06, + "loss": 0.0179, + "step": 22318 + }, + { + "epoch": 9.07645384302562, + "grad_norm": 0.010685615282045507, + "learning_rate": 4.409228313269701e-06, 
+ "loss": 0.0001, + "step": 22319 + }, + { + "epoch": 9.076860512403416, + "grad_norm": 0.03808083631433671, + "learning_rate": 4.408388291799094e-06, + "loss": 0.0004, + "step": 22320 + }, + { + "epoch": 9.077267181781211, + "grad_norm": 0.04662678332699502, + "learning_rate": 4.407548327729194e-06, + "loss": 0.0005, + "step": 22321 + }, + { + "epoch": 9.077673851159007, + "grad_norm": 0.0020908657239377465, + "learning_rate": 4.406708421068622e-06, + "loss": 0.0, + "step": 22322 + }, + { + "epoch": 9.078080520536803, + "grad_norm": 1.9829108598224565, + "learning_rate": 4.405868571826006e-06, + "loss": 0.015, + "step": 22323 + }, + { + "epoch": 9.078487189914599, + "grad_norm": 0.3402711679261462, + "learning_rate": 4.405028780009965e-06, + "loss": 0.0037, + "step": 22324 + }, + { + "epoch": 9.078893859292394, + "grad_norm": 11.59974433446588, + "learning_rate": 4.40418904562912e-06, + "loss": 0.1688, + "step": 22325 + }, + { + "epoch": 9.079300528670192, + "grad_norm": 0.3429699113437069, + "learning_rate": 4.403349368692091e-06, + "loss": 0.0052, + "step": 22326 + }, + { + "epoch": 9.079707198047988, + "grad_norm": 0.05195920162736275, + "learning_rate": 4.402509749207496e-06, + "loss": 0.0006, + "step": 22327 + }, + { + "epoch": 9.080113867425784, + "grad_norm": 0.4715699978802164, + "learning_rate": 4.401670187183954e-06, + "loss": 0.0034, + "step": 22328 + }, + { + "epoch": 9.08052053680358, + "grad_norm": 0.035302496927070375, + "learning_rate": 4.400830682630087e-06, + "loss": 0.0003, + "step": 22329 + }, + { + "epoch": 9.080927206181375, + "grad_norm": 0.31762927697606713, + "learning_rate": 4.399991235554512e-06, + "loss": 0.003, + "step": 22330 + }, + { + "epoch": 9.08133387555917, + "grad_norm": 0.4855712111715791, + "learning_rate": 4.399151845965845e-06, + "loss": 0.0057, + "step": 22331 + }, + { + "epoch": 9.081740544936967, + "grad_norm": 9.310380743731994, + "learning_rate": 4.398312513872704e-06, + "loss": 0.182, + "step": 22332 + }, + { + "epoch": 9.082147214314762, + "grad_norm": 0.1558102101139968, + "learning_rate": 4.3974732392837025e-06, + "loss": 0.0021, + "step": 22333 + }, + { + "epoch": 9.082553883692558, + "grad_norm": 0.018007798073620667, + "learning_rate": 4.396634022207459e-06, + "loss": 0.0003, + "step": 22334 + }, + { + "epoch": 9.082960553070354, + "grad_norm": 0.0623081632146592, + "learning_rate": 4.3957948626525835e-06, + "loss": 0.0005, + "step": 22335 + }, + { + "epoch": 9.08336722244815, + "grad_norm": 0.2111100742530555, + "learning_rate": 4.394955760627698e-06, + "loss": 0.0021, + "step": 22336 + }, + { + "epoch": 9.083773891825945, + "grad_norm": 0.08522110717788407, + "learning_rate": 4.394116716141413e-06, + "loss": 0.0012, + "step": 22337 + }, + { + "epoch": 9.084180561203741, + "grad_norm": 0.02358575287307667, + "learning_rate": 4.393277729202341e-06, + "loss": 0.0003, + "step": 22338 + }, + { + "epoch": 9.084587230581537, + "grad_norm": 0.017377343938736838, + "learning_rate": 4.392438799819094e-06, + "loss": 0.0002, + "step": 22339 + }, + { + "epoch": 9.084993899959333, + "grad_norm": 0.31762997471334636, + "learning_rate": 4.3915999280002865e-06, + "loss": 0.0031, + "step": 22340 + }, + { + "epoch": 9.085400569337128, + "grad_norm": 0.05271691827619085, + "learning_rate": 4.390761113754524e-06, + "loss": 0.0007, + "step": 22341 + }, + { + "epoch": 9.085807238714924, + "grad_norm": 0.26098545203561463, + "learning_rate": 4.389922357090425e-06, + "loss": 0.0035, + "step": 22342 + }, + { + "epoch": 9.08621390809272, + "grad_norm": 
2.108698757262873, + "learning_rate": 4.3890836580165975e-06, + "loss": 0.0122, + "step": 22343 + }, + { + "epoch": 9.086620577470516, + "grad_norm": 0.07719985840197413, + "learning_rate": 4.388245016541649e-06, + "loss": 0.0007, + "step": 22344 + }, + { + "epoch": 9.087027246848312, + "grad_norm": 0.12955469069431305, + "learning_rate": 4.38740643267419e-06, + "loss": 0.0015, + "step": 22345 + }, + { + "epoch": 9.087433916226107, + "grad_norm": 0.012548517576582352, + "learning_rate": 4.3865679064228294e-06, + "loss": 0.0001, + "step": 22346 + }, + { + "epoch": 9.087840585603905, + "grad_norm": 0.07843538631811098, + "learning_rate": 4.385729437796175e-06, + "loss": 0.0005, + "step": 22347 + }, + { + "epoch": 9.0882472549817, + "grad_norm": 1.6638515390738757, + "learning_rate": 4.384891026802832e-06, + "loss": 0.0121, + "step": 22348 + }, + { + "epoch": 9.088653924359496, + "grad_norm": 0.0041310332440819345, + "learning_rate": 4.384052673451409e-06, + "loss": 0.0, + "step": 22349 + }, + { + "epoch": 9.089060593737292, + "grad_norm": 5.420396296211366, + "learning_rate": 4.3832143777505125e-06, + "loss": 0.0851, + "step": 22350 + }, + { + "epoch": 9.089467263115088, + "grad_norm": 0.004843072007101751, + "learning_rate": 4.382376139708746e-06, + "loss": 0.0001, + "step": 22351 + }, + { + "epoch": 9.089873932492884, + "grad_norm": 0.1382843046860207, + "learning_rate": 4.3815379593347166e-06, + "loss": 0.0012, + "step": 22352 + }, + { + "epoch": 9.09028060187068, + "grad_norm": 0.011849292530581335, + "learning_rate": 4.380699836637025e-06, + "loss": 0.0002, + "step": 22353 + }, + { + "epoch": 9.090687271248475, + "grad_norm": 0.986189015417529, + "learning_rate": 4.37986177162428e-06, + "loss": 0.0167, + "step": 22354 + }, + { + "epoch": 9.091093940626271, + "grad_norm": 0.06783892806239628, + "learning_rate": 4.379023764305083e-06, + "loss": 0.0008, + "step": 22355 + }, + { + "epoch": 9.091500610004067, + "grad_norm": 1.102503223001402, + "learning_rate": 4.378185814688035e-06, + "loss": 0.018, + "step": 22356 + }, + { + "epoch": 9.091907279381862, + "grad_norm": 10.130457095529785, + "learning_rate": 4.37734792278174e-06, + "loss": 0.1947, + "step": 22357 + }, + { + "epoch": 9.092313948759658, + "grad_norm": 0.06854086485466654, + "learning_rate": 4.376510088594799e-06, + "loss": 0.0007, + "step": 22358 + }, + { + "epoch": 9.092720618137454, + "grad_norm": 2.599538261879665, + "learning_rate": 4.375672312135807e-06, + "loss": 0.0166, + "step": 22359 + }, + { + "epoch": 9.09312728751525, + "grad_norm": 1.7320619379094355, + "learning_rate": 4.374834593413374e-06, + "loss": 0.0091, + "step": 22360 + }, + { + "epoch": 9.093533956893046, + "grad_norm": 0.020033893629896944, + "learning_rate": 4.373996932436094e-06, + "loss": 0.0001, + "step": 22361 + }, + { + "epoch": 9.093940626270841, + "grad_norm": 0.06979305322714661, + "learning_rate": 4.373159329212568e-06, + "loss": 0.0008, + "step": 22362 + }, + { + "epoch": 9.094347295648637, + "grad_norm": 0.2885414559939145, + "learning_rate": 4.372321783751394e-06, + "loss": 0.0054, + "step": 22363 + }, + { + "epoch": 9.094753965026433, + "grad_norm": 0.4003197484043362, + "learning_rate": 4.371484296061168e-06, + "loss": 0.0032, + "step": 22364 + }, + { + "epoch": 9.095160634404229, + "grad_norm": 0.21721745186970312, + "learning_rate": 4.370646866150489e-06, + "loss": 0.0014, + "step": 22365 + }, + { + "epoch": 9.095567303782024, + "grad_norm": 0.9399749083555354, + "learning_rate": 4.369809494027949e-06, + "loss": 0.0116, + "step": 22366 + 
}, + { + "epoch": 9.095973973159822, + "grad_norm": 0.056356003184031195, + "learning_rate": 4.368972179702152e-06, + "loss": 0.0006, + "step": 22367 + }, + { + "epoch": 9.096380642537618, + "grad_norm": 0.093915907833398, + "learning_rate": 4.3681349231816906e-06, + "loss": 0.0009, + "step": 22368 + }, + { + "epoch": 9.096787311915413, + "grad_norm": 0.0003055750916148087, + "learning_rate": 4.367297724475158e-06, + "loss": 0.0, + "step": 22369 + }, + { + "epoch": 9.09719398129321, + "grad_norm": 2.9473421600041037, + "learning_rate": 4.366460583591149e-06, + "loss": 0.0236, + "step": 22370 + }, + { + "epoch": 9.097600650671005, + "grad_norm": 3.5826210087149226, + "learning_rate": 4.3656235005382576e-06, + "loss": 0.0341, + "step": 22371 + }, + { + "epoch": 9.0980073200488, + "grad_norm": 0.11064851968154397, + "learning_rate": 4.364786475325072e-06, + "loss": 0.0013, + "step": 22372 + }, + { + "epoch": 9.098413989426597, + "grad_norm": 0.04039445722163395, + "learning_rate": 4.363949507960195e-06, + "loss": 0.0004, + "step": 22373 + }, + { + "epoch": 9.098820658804392, + "grad_norm": 1.2582092978165582, + "learning_rate": 4.363112598452211e-06, + "loss": 0.0089, + "step": 22374 + }, + { + "epoch": 9.099227328182188, + "grad_norm": 0.0017215478483164545, + "learning_rate": 4.362275746809714e-06, + "loss": 0.0, + "step": 22375 + }, + { + "epoch": 9.099633997559984, + "grad_norm": 2.1123772704084836, + "learning_rate": 4.361438953041293e-06, + "loss": 0.027, + "step": 22376 + }, + { + "epoch": 9.10004066693778, + "grad_norm": 3.3980272840239243, + "learning_rate": 4.360602217155541e-06, + "loss": 0.0342, + "step": 22377 + }, + { + "epoch": 9.100447336315575, + "grad_norm": 0.06311363246720239, + "learning_rate": 4.359765539161044e-06, + "loss": 0.001, + "step": 22378 + }, + { + "epoch": 9.100854005693371, + "grad_norm": 0.017125249694835035, + "learning_rate": 4.358928919066388e-06, + "loss": 0.0002, + "step": 22379 + }, + { + "epoch": 9.101260675071167, + "grad_norm": 0.1628888367887531, + "learning_rate": 4.358092356880171e-06, + "loss": 0.0015, + "step": 22380 + }, + { + "epoch": 9.101667344448963, + "grad_norm": 0.03960216975475879, + "learning_rate": 4.357255852610975e-06, + "loss": 0.0004, + "step": 22381 + }, + { + "epoch": 9.102074013826758, + "grad_norm": 0.8539546462081431, + "learning_rate": 4.356419406267388e-06, + "loss": 0.0118, + "step": 22382 + }, + { + "epoch": 9.102480683204554, + "grad_norm": 0.047122543684182894, + "learning_rate": 4.355583017857995e-06, + "loss": 0.0006, + "step": 22383 + }, + { + "epoch": 9.10288735258235, + "grad_norm": 0.2068561122999715, + "learning_rate": 4.354746687391385e-06, + "loss": 0.001, + "step": 22384 + }, + { + "epoch": 9.103294021960146, + "grad_norm": 0.26849506033211484, + "learning_rate": 4.353910414876136e-06, + "loss": 0.0028, + "step": 22385 + }, + { + "epoch": 9.103700691337941, + "grad_norm": 4.9227092868410045, + "learning_rate": 4.353074200320843e-06, + "loss": 0.0401, + "step": 22386 + }, + { + "epoch": 9.104107360715737, + "grad_norm": 0.11061560371029977, + "learning_rate": 4.352238043734084e-06, + "loss": 0.0009, + "step": 22387 + }, + { + "epoch": 9.104514030093535, + "grad_norm": 0.012789874249326514, + "learning_rate": 4.3514019451244445e-06, + "loss": 0.0001, + "step": 22388 + }, + { + "epoch": 9.10492069947133, + "grad_norm": 0.0026230186980208303, + "learning_rate": 4.350565904500508e-06, + "loss": 0.0, + "step": 22389 + }, + { + "epoch": 9.105327368849126, + "grad_norm": 0.00436148571324827, + "learning_rate": 
4.3497299218708545e-06, + "loss": 0.0, + "step": 22390 + }, + { + "epoch": 9.105734038226922, + "grad_norm": 0.1098787033578199, + "learning_rate": 4.348893997244067e-06, + "loss": 0.0012, + "step": 22391 + }, + { + "epoch": 9.106140707604718, + "grad_norm": 0.806982243611398, + "learning_rate": 4.348058130628724e-06, + "loss": 0.0041, + "step": 22392 + }, + { + "epoch": 9.106547376982514, + "grad_norm": 1.9658458419466602, + "learning_rate": 4.347222322033412e-06, + "loss": 0.036, + "step": 22393 + }, + { + "epoch": 9.10695404636031, + "grad_norm": 0.0009008494023605658, + "learning_rate": 4.346386571466708e-06, + "loss": 0.0, + "step": 22394 + }, + { + "epoch": 9.107360715738105, + "grad_norm": 0.06407830431344712, + "learning_rate": 4.345550878937191e-06, + "loss": 0.0004, + "step": 22395 + }, + { + "epoch": 9.107767385115901, + "grad_norm": 0.018700370299104933, + "learning_rate": 4.34471524445344e-06, + "loss": 0.0003, + "step": 22396 + }, + { + "epoch": 9.108174054493697, + "grad_norm": 0.0482032697243146, + "learning_rate": 4.343879668024033e-06, + "loss": 0.0005, + "step": 22397 + }, + { + "epoch": 9.108580723871492, + "grad_norm": 0.0003850140624340923, + "learning_rate": 4.343044149657546e-06, + "loss": 0.0, + "step": 22398 + }, + { + "epoch": 9.108987393249288, + "grad_norm": 0.028574906465852923, + "learning_rate": 4.342208689362559e-06, + "loss": 0.0004, + "step": 22399 + }, + { + "epoch": 9.109394062627084, + "grad_norm": 2.7142413225652318, + "learning_rate": 4.341373287147649e-06, + "loss": 0.0258, + "step": 22400 + }, + { + "epoch": 9.10980073200488, + "grad_norm": 0.6989392915023799, + "learning_rate": 4.3405379430213904e-06, + "loss": 0.008, + "step": 22401 + }, + { + "epoch": 9.110207401382675, + "grad_norm": 0.006810485938090906, + "learning_rate": 4.339702656992357e-06, + "loss": 0.0001, + "step": 22402 + }, + { + "epoch": 9.110614070760471, + "grad_norm": 8.625654299423468, + "learning_rate": 4.338867429069126e-06, + "loss": 0.2036, + "step": 22403 + }, + { + "epoch": 9.111020740138267, + "grad_norm": 0.0021099881352070094, + "learning_rate": 4.338032259260269e-06, + "loss": 0.0, + "step": 22404 + }, + { + "epoch": 9.111427409516063, + "grad_norm": 0.015954239916012653, + "learning_rate": 4.337197147574356e-06, + "loss": 0.0002, + "step": 22405 + }, + { + "epoch": 9.111834078893859, + "grad_norm": 0.05014285406010108, + "learning_rate": 4.336362094019969e-06, + "loss": 0.0006, + "step": 22406 + }, + { + "epoch": 9.112240748271654, + "grad_norm": 0.3541170708570541, + "learning_rate": 4.335527098605675e-06, + "loss": 0.0043, + "step": 22407 + }, + { + "epoch": 9.112647417649452, + "grad_norm": 0.043545618425710676, + "learning_rate": 4.334692161340047e-06, + "loss": 0.0005, + "step": 22408 + }, + { + "epoch": 9.113054087027248, + "grad_norm": 0.009980919398460547, + "learning_rate": 4.333857282231655e-06, + "loss": 0.0001, + "step": 22409 + }, + { + "epoch": 9.113460756405043, + "grad_norm": 0.015841158406223264, + "learning_rate": 4.333022461289069e-06, + "loss": 0.0002, + "step": 22410 + }, + { + "epoch": 9.11386742578284, + "grad_norm": 0.039519557885593144, + "learning_rate": 4.332187698520861e-06, + "loss": 0.0005, + "step": 22411 + }, + { + "epoch": 9.114274095160635, + "grad_norm": 0.2516593985652893, + "learning_rate": 4.331352993935595e-06, + "loss": 0.0024, + "step": 22412 + }, + { + "epoch": 9.11468076453843, + "grad_norm": 0.03723190362267867, + "learning_rate": 4.330518347541845e-06, + "loss": 0.0004, + "step": 22413 + }, + { + "epoch": 9.115087433916226, 
+ "grad_norm": 1.9928263988027435, + "learning_rate": 4.32968375934818e-06, + "loss": 0.0274, + "step": 22414 + }, + { + "epoch": 9.115494103294022, + "grad_norm": 0.4861254023151782, + "learning_rate": 4.328849229363163e-06, + "loss": 0.0049, + "step": 22415 + }, + { + "epoch": 9.115900772671818, + "grad_norm": 11.263546863098318, + "learning_rate": 4.328014757595365e-06, + "loss": 0.1753, + "step": 22416 + }, + { + "epoch": 9.116307442049614, + "grad_norm": 0.1140413441611235, + "learning_rate": 4.327180344053349e-06, + "loss": 0.0016, + "step": 22417 + }, + { + "epoch": 9.11671411142741, + "grad_norm": 0.01822371202801048, + "learning_rate": 4.326345988745679e-06, + "loss": 0.0002, + "step": 22418 + }, + { + "epoch": 9.117120780805205, + "grad_norm": 0.18849314996662864, + "learning_rate": 4.325511691680924e-06, + "loss": 0.0019, + "step": 22419 + }, + { + "epoch": 9.117527450183001, + "grad_norm": 0.021780847514328964, + "learning_rate": 4.324677452867651e-06, + "loss": 0.0003, + "step": 22420 + }, + { + "epoch": 9.117934119560797, + "grad_norm": 2.4655933744272556, + "learning_rate": 4.323843272314417e-06, + "loss": 0.0343, + "step": 22421 + }, + { + "epoch": 9.118340788938593, + "grad_norm": 3.523610110558917, + "learning_rate": 4.323009150029791e-06, + "loss": 0.046, + "step": 22422 + }, + { + "epoch": 9.118747458316388, + "grad_norm": 0.014687778805140118, + "learning_rate": 4.322175086022332e-06, + "loss": 0.0002, + "step": 22423 + }, + { + "epoch": 9.119154127694184, + "grad_norm": 1.3118217459917636, + "learning_rate": 4.321341080300604e-06, + "loss": 0.0163, + "step": 22424 + }, + { + "epoch": 9.11956079707198, + "grad_norm": 0.03143550120567946, + "learning_rate": 4.320507132873164e-06, + "loss": 0.0004, + "step": 22425 + }, + { + "epoch": 9.119967466449776, + "grad_norm": 0.19658975412327512, + "learning_rate": 4.31967324374858e-06, + "loss": 0.0021, + "step": 22426 + }, + { + "epoch": 9.120374135827571, + "grad_norm": 0.1274439235508064, + "learning_rate": 4.318839412935409e-06, + "loss": 0.0014, + "step": 22427 + }, + { + "epoch": 9.120780805205367, + "grad_norm": 5.669011853317949, + "learning_rate": 4.3180056404422115e-06, + "loss": 0.0927, + "step": 22428 + }, + { + "epoch": 9.121187474583165, + "grad_norm": 0.0017834014671595554, + "learning_rate": 4.317171926277545e-06, + "loss": 0.0, + "step": 22429 + }, + { + "epoch": 9.12159414396096, + "grad_norm": 0.0022827850451483337, + "learning_rate": 4.316338270449969e-06, + "loss": 0.0, + "step": 22430 + }, + { + "epoch": 9.122000813338756, + "grad_norm": 0.016734734035238452, + "learning_rate": 4.315504672968037e-06, + "loss": 0.0002, + "step": 22431 + }, + { + "epoch": 9.122407482716552, + "grad_norm": 0.007476094666687505, + "learning_rate": 4.314671133840315e-06, + "loss": 0.0, + "step": 22432 + }, + { + "epoch": 9.122814152094348, + "grad_norm": 0.24273554716271176, + "learning_rate": 4.313837653075355e-06, + "loss": 0.0044, + "step": 22433 + }, + { + "epoch": 9.123220821472144, + "grad_norm": 3.248091370512353, + "learning_rate": 4.313004230681714e-06, + "loss": 0.049, + "step": 22434 + }, + { + "epoch": 9.12362749084994, + "grad_norm": 0.026516061399807386, + "learning_rate": 4.312170866667944e-06, + "loss": 0.0004, + "step": 22435 + }, + { + "epoch": 9.124034160227735, + "grad_norm": 0.006116497032039162, + "learning_rate": 4.311337561042606e-06, + "loss": 0.0001, + "step": 22436 + }, + { + "epoch": 9.12444082960553, + "grad_norm": 1.0768361862461302, + "learning_rate": 4.310504313814249e-06, + "loss": 0.0082, + 
"step": 22437 + }, + { + "epoch": 9.124847498983327, + "grad_norm": 0.17342883216515068, + "learning_rate": 4.309671124991424e-06, + "loss": 0.002, + "step": 22438 + }, + { + "epoch": 9.125254168361122, + "grad_norm": 0.00042588463339330706, + "learning_rate": 4.308837994582695e-06, + "loss": 0.0, + "step": 22439 + }, + { + "epoch": 9.125660837738918, + "grad_norm": 0.0557313192802204, + "learning_rate": 4.308004922596607e-06, + "loss": 0.0005, + "step": 22440 + }, + { + "epoch": 9.126067507116714, + "grad_norm": 9.064844281256496, + "learning_rate": 4.307171909041714e-06, + "loss": 0.1781, + "step": 22441 + }, + { + "epoch": 9.12647417649451, + "grad_norm": 0.14819534040301657, + "learning_rate": 4.306338953926565e-06, + "loss": 0.0019, + "step": 22442 + }, + { + "epoch": 9.126880845872305, + "grad_norm": 0.18696781816119876, + "learning_rate": 4.305506057259714e-06, + "loss": 0.0019, + "step": 22443 + }, + { + "epoch": 9.127287515250101, + "grad_norm": 2.179949373405312, + "learning_rate": 4.304673219049706e-06, + "loss": 0.0118, + "step": 22444 + }, + { + "epoch": 9.127694184627897, + "grad_norm": 1.5952497815047606, + "learning_rate": 4.3038404393050945e-06, + "loss": 0.023, + "step": 22445 + }, + { + "epoch": 9.128100854005693, + "grad_norm": 0.23109484208410358, + "learning_rate": 4.303007718034434e-06, + "loss": 0.0024, + "step": 22446 + }, + { + "epoch": 9.128507523383488, + "grad_norm": 0.0035052206987929435, + "learning_rate": 4.302175055246261e-06, + "loss": 0.0, + "step": 22447 + }, + { + "epoch": 9.128914192761284, + "grad_norm": 0.10397513161639367, + "learning_rate": 4.301342450949131e-06, + "loss": 0.0014, + "step": 22448 + }, + { + "epoch": 9.129320862139082, + "grad_norm": 0.3050413879828386, + "learning_rate": 4.300509905151583e-06, + "loss": 0.0026, + "step": 22449 + }, + { + "epoch": 9.129727531516878, + "grad_norm": 0.9249387534271026, + "learning_rate": 4.299677417862174e-06, + "loss": 0.0089, + "step": 22450 + }, + { + "epoch": 9.130134200894673, + "grad_norm": 0.02794345686079512, + "learning_rate": 4.298844989089446e-06, + "loss": 0.0003, + "step": 22451 + }, + { + "epoch": 9.130540870272469, + "grad_norm": 0.1757212929878487, + "learning_rate": 4.2980126188419414e-06, + "loss": 0.0018, + "step": 22452 + }, + { + "epoch": 9.130947539650265, + "grad_norm": 2.1104342300959447, + "learning_rate": 4.297180307128208e-06, + "loss": 0.0307, + "step": 22453 + }, + { + "epoch": 9.13135420902806, + "grad_norm": 0.14114897623103065, + "learning_rate": 4.296348053956789e-06, + "loss": 0.0015, + "step": 22454 + }, + { + "epoch": 9.131760878405856, + "grad_norm": 0.304867769085899, + "learning_rate": 4.295515859336229e-06, + "loss": 0.0043, + "step": 22455 + }, + { + "epoch": 9.132167547783652, + "grad_norm": 0.32335607184683135, + "learning_rate": 4.294683723275065e-06, + "loss": 0.0026, + "step": 22456 + }, + { + "epoch": 9.132574217161448, + "grad_norm": 1.1493257060806334, + "learning_rate": 4.2938516457818465e-06, + "loss": 0.011, + "step": 22457 + }, + { + "epoch": 9.132980886539244, + "grad_norm": 2.263762207096746, + "learning_rate": 4.293019626865115e-06, + "loss": 0.0168, + "step": 22458 + }, + { + "epoch": 9.13338755591704, + "grad_norm": 0.07908271923739134, + "learning_rate": 4.292187666533407e-06, + "loss": 0.0007, + "step": 22459 + }, + { + "epoch": 9.133794225294835, + "grad_norm": 0.4170213507397542, + "learning_rate": 4.2913557647952675e-06, + "loss": 0.0063, + "step": 22460 + }, + { + "epoch": 9.134200894672631, + "grad_norm": 0.21193066606926467, + 
"learning_rate": 4.2905239216592336e-06, + "loss": 0.0026, + "step": 22461 + }, + { + "epoch": 9.134607564050427, + "grad_norm": 0.003529676048201965, + "learning_rate": 4.289692137133839e-06, + "loss": 0.0, + "step": 22462 + }, + { + "epoch": 9.135014233428222, + "grad_norm": 11.6526847761318, + "learning_rate": 4.288860411227635e-06, + "loss": 0.1672, + "step": 22463 + }, + { + "epoch": 9.135420902806018, + "grad_norm": 0.032157817988392436, + "learning_rate": 4.288028743949153e-06, + "loss": 0.0003, + "step": 22464 + }, + { + "epoch": 9.135827572183814, + "grad_norm": 0.5947793471112524, + "learning_rate": 4.28719713530693e-06, + "loss": 0.0098, + "step": 22465 + }, + { + "epoch": 9.13623424156161, + "grad_norm": 0.06759685137208565, + "learning_rate": 4.2863655853095034e-06, + "loss": 0.0008, + "step": 22466 + }, + { + "epoch": 9.136640910939406, + "grad_norm": 0.08521125755710303, + "learning_rate": 4.28553409396541e-06, + "loss": 0.0007, + "step": 22467 + }, + { + "epoch": 9.137047580317201, + "grad_norm": 0.36444609296478897, + "learning_rate": 4.284702661283185e-06, + "loss": 0.0033, + "step": 22468 + }, + { + "epoch": 9.137454249694997, + "grad_norm": 0.013202788884901735, + "learning_rate": 4.283871287271361e-06, + "loss": 0.0002, + "step": 22469 + }, + { + "epoch": 9.137860919072795, + "grad_norm": 0.010816741193479137, + "learning_rate": 4.2830399719384775e-06, + "loss": 0.0001, + "step": 22470 + }, + { + "epoch": 9.13826758845059, + "grad_norm": 0.004270625813148824, + "learning_rate": 4.282208715293067e-06, + "loss": 0.0001, + "step": 22471 + }, + { + "epoch": 9.138674257828386, + "grad_norm": 0.05862618834686703, + "learning_rate": 4.28137751734366e-06, + "loss": 0.0007, + "step": 22472 + }, + { + "epoch": 9.139080927206182, + "grad_norm": 0.1671242387889585, + "learning_rate": 4.280546378098792e-06, + "loss": 0.0019, + "step": 22473 + }, + { + "epoch": 9.139487596583978, + "grad_norm": 3.4325199547361502, + "learning_rate": 4.279715297566994e-06, + "loss": 0.0425, + "step": 22474 + }, + { + "epoch": 9.139894265961773, + "grad_norm": 0.4609162955703026, + "learning_rate": 4.278884275756794e-06, + "loss": 0.0045, + "step": 22475 + }, + { + "epoch": 9.14030093533957, + "grad_norm": 0.03719833945382311, + "learning_rate": 4.27805331267673e-06, + "loss": 0.0004, + "step": 22476 + }, + { + "epoch": 9.140707604717365, + "grad_norm": 0.16128778522419107, + "learning_rate": 4.277222408335329e-06, + "loss": 0.0018, + "step": 22477 + }, + { + "epoch": 9.14111427409516, + "grad_norm": 0.03798989482833624, + "learning_rate": 4.276391562741119e-06, + "loss": 0.0004, + "step": 22478 + }, + { + "epoch": 9.141520943472957, + "grad_norm": 0.7091860535232696, + "learning_rate": 4.27556077590263e-06, + "loss": 0.0066, + "step": 22479 + }, + { + "epoch": 9.141927612850752, + "grad_norm": 0.09011233942645615, + "learning_rate": 4.274730047828392e-06, + "loss": 0.0006, + "step": 22480 + }, + { + "epoch": 9.142334282228548, + "grad_norm": 0.35384477967714106, + "learning_rate": 4.2738993785269315e-06, + "loss": 0.004, + "step": 22481 + }, + { + "epoch": 9.142740951606344, + "grad_norm": 1.5632433958679712, + "learning_rate": 4.27306876800677e-06, + "loss": 0.0191, + "step": 22482 + }, + { + "epoch": 9.14314762098414, + "grad_norm": 10.446930650841832, + "learning_rate": 4.272238216276446e-06, + "loss": 0.0792, + "step": 22483 + }, + { + "epoch": 9.143554290361935, + "grad_norm": 1.8996041239595878, + "learning_rate": 4.271407723344479e-06, + "loss": 0.0151, + "step": 22484 + }, + { + "epoch": 
9.143960959739731, + "grad_norm": 0.00836029488441604, + "learning_rate": 4.270577289219395e-06, + "loss": 0.0001, + "step": 22485 + }, + { + "epoch": 9.144367629117527, + "grad_norm": 0.1257746507432343, + "learning_rate": 4.269746913909717e-06, + "loss": 0.0006, + "step": 22486 + }, + { + "epoch": 9.144774298495323, + "grad_norm": 9.91529641615374, + "learning_rate": 4.268916597423972e-06, + "loss": 0.1475, + "step": 22487 + }, + { + "epoch": 9.145180967873118, + "grad_norm": 0.027237913226719205, + "learning_rate": 4.268086339770678e-06, + "loss": 0.0003, + "step": 22488 + }, + { + "epoch": 9.145587637250914, + "grad_norm": 0.8180846854903725, + "learning_rate": 4.267256140958368e-06, + "loss": 0.0047, + "step": 22489 + }, + { + "epoch": 9.145994306628712, + "grad_norm": 3.032251388644744, + "learning_rate": 4.266426000995558e-06, + "loss": 0.0329, + "step": 22490 + }, + { + "epoch": 9.146400976006507, + "grad_norm": 9.89129040575446, + "learning_rate": 4.265595919890771e-06, + "loss": 0.1534, + "step": 22491 + }, + { + "epoch": 9.146807645384303, + "grad_norm": 0.011948602771040578, + "learning_rate": 4.264765897652528e-06, + "loss": 0.0001, + "step": 22492 + }, + { + "epoch": 9.147214314762099, + "grad_norm": 0.002968498116771986, + "learning_rate": 4.263935934289348e-06, + "loss": 0.0, + "step": 22493 + }, + { + "epoch": 9.147620984139895, + "grad_norm": 0.046318631266470144, + "learning_rate": 4.263106029809754e-06, + "loss": 0.0006, + "step": 22494 + }, + { + "epoch": 9.14802765351769, + "grad_norm": 10.18551573853749, + "learning_rate": 4.2622761842222604e-06, + "loss": 0.1448, + "step": 22495 + }, + { + "epoch": 9.148434322895486, + "grad_norm": 0.004835608181992147, + "learning_rate": 4.261446397535393e-06, + "loss": 0.0001, + "step": 22496 + }, + { + "epoch": 9.148840992273282, + "grad_norm": 1.3150616469370955, + "learning_rate": 4.260616669757666e-06, + "loss": 0.0171, + "step": 22497 + }, + { + "epoch": 9.149247661651078, + "grad_norm": 0.044213342629573195, + "learning_rate": 4.259787000897597e-06, + "loss": 0.0005, + "step": 22498 + }, + { + "epoch": 9.149654331028874, + "grad_norm": 0.011247259809712985, + "learning_rate": 4.258957390963705e-06, + "loss": 0.0001, + "step": 22499 + }, + { + "epoch": 9.15006100040667, + "grad_norm": 13.506185891486046, + "learning_rate": 4.2581278399645035e-06, + "loss": 0.0988, + "step": 22500 + }, + { + "epoch": 9.150467669784465, + "grad_norm": 0.8780650238434117, + "learning_rate": 4.257298347908506e-06, + "loss": 0.0096, + "step": 22501 + }, + { + "epoch": 9.150874339162261, + "grad_norm": 0.052057375348448176, + "learning_rate": 4.256468914804236e-06, + "loss": 0.0007, + "step": 22502 + }, + { + "epoch": 9.151281008540057, + "grad_norm": 0.3172876186158256, + "learning_rate": 4.2556395406602014e-06, + "loss": 0.0038, + "step": 22503 + }, + { + "epoch": 9.151687677917852, + "grad_norm": 0.21790160522391208, + "learning_rate": 4.25481022548492e-06, + "loss": 0.0019, + "step": 22504 + }, + { + "epoch": 9.152094347295648, + "grad_norm": 0.009048839687883297, + "learning_rate": 4.253980969286901e-06, + "loss": 0.0001, + "step": 22505 + }, + { + "epoch": 9.152501016673444, + "grad_norm": 0.00459897508269944, + "learning_rate": 4.253151772074661e-06, + "loss": 0.0001, + "step": 22506 + }, + { + "epoch": 9.15290768605124, + "grad_norm": 1.9213967649465005, + "learning_rate": 4.2523226338567105e-06, + "loss": 0.025, + "step": 22507 + }, + { + "epoch": 9.153314355429035, + "grad_norm": 0.17102127795404237, + "learning_rate": 
4.251493554641556e-06, + "loss": 0.0023, + "step": 22508 + }, + { + "epoch": 9.153721024806831, + "grad_norm": 0.04585226274878527, + "learning_rate": 4.250664534437717e-06, + "loss": 0.0006, + "step": 22509 + }, + { + "epoch": 9.154127694184627, + "grad_norm": 0.08290773384275796, + "learning_rate": 4.249835573253701e-06, + "loss": 0.0008, + "step": 22510 + }, + { + "epoch": 9.154534363562425, + "grad_norm": 0.012447746181356249, + "learning_rate": 4.249006671098016e-06, + "loss": 0.0001, + "step": 22511 + }, + { + "epoch": 9.15494103294022, + "grad_norm": 0.0068438726354130884, + "learning_rate": 4.248177827979173e-06, + "loss": 0.0001, + "step": 22512 + }, + { + "epoch": 9.155347702318016, + "grad_norm": 3.1279874171641295, + "learning_rate": 4.247349043905678e-06, + "loss": 0.0183, + "step": 22513 + }, + { + "epoch": 9.155754371695812, + "grad_norm": 0.005104642073147044, + "learning_rate": 4.246520318886038e-06, + "loss": 0.0001, + "step": 22514 + }, + { + "epoch": 9.156161041073608, + "grad_norm": 0.006743322503767114, + "learning_rate": 4.245691652928766e-06, + "loss": 0.0001, + "step": 22515 + }, + { + "epoch": 9.156567710451403, + "grad_norm": 0.20008192102104302, + "learning_rate": 4.244863046042366e-06, + "loss": 0.0017, + "step": 22516 + }, + { + "epoch": 9.1569743798292, + "grad_norm": 7.704386867856014, + "learning_rate": 4.244034498235341e-06, + "loss": 0.1176, + "step": 22517 + }, + { + "epoch": 9.157381049206995, + "grad_norm": 0.006817559878851511, + "learning_rate": 4.243206009516201e-06, + "loss": 0.0001, + "step": 22518 + }, + { + "epoch": 9.15778771858479, + "grad_norm": 0.07548678004645602, + "learning_rate": 4.242377579893447e-06, + "loss": 0.0003, + "step": 22519 + }, + { + "epoch": 9.158194387962586, + "grad_norm": 0.0662603722545897, + "learning_rate": 4.241549209375586e-06, + "loss": 0.0006, + "step": 22520 + }, + { + "epoch": 9.158601057340382, + "grad_norm": 10.057023786964159, + "learning_rate": 4.240720897971117e-06, + "loss": 0.4337, + "step": 22521 + }, + { + "epoch": 9.159007726718178, + "grad_norm": 0.1415224591333645, + "learning_rate": 4.239892645688549e-06, + "loss": 0.0014, + "step": 22522 + }, + { + "epoch": 9.159414396095974, + "grad_norm": 0.05637070227693679, + "learning_rate": 4.239064452536384e-06, + "loss": 0.0007, + "step": 22523 + }, + { + "epoch": 9.15982106547377, + "grad_norm": 0.04985804034261661, + "learning_rate": 4.23823631852312e-06, + "loss": 0.0003, + "step": 22524 + }, + { + "epoch": 9.160227734851565, + "grad_norm": 1.8404376891446097, + "learning_rate": 4.237408243657261e-06, + "loss": 0.0184, + "step": 22525 + }, + { + "epoch": 9.160634404229361, + "grad_norm": 0.8512789382373236, + "learning_rate": 4.236580227947305e-06, + "loss": 0.0088, + "step": 22526 + }, + { + "epoch": 9.161041073607157, + "grad_norm": 0.0036476529634597034, + "learning_rate": 4.235752271401755e-06, + "loss": 0.0, + "step": 22527 + }, + { + "epoch": 9.161447742984953, + "grad_norm": 0.010466736644829234, + "learning_rate": 4.234924374029105e-06, + "loss": 0.0001, + "step": 22528 + }, + { + "epoch": 9.161854412362748, + "grad_norm": 0.0019255127909977125, + "learning_rate": 4.234096535837863e-06, + "loss": 0.0, + "step": 22529 + }, + { + "epoch": 9.162261081740544, + "grad_norm": 0.10029523703420598, + "learning_rate": 4.233268756836521e-06, + "loss": 0.0018, + "step": 22530 + }, + { + "epoch": 9.162667751118342, + "grad_norm": 0.6362867308055685, + "learning_rate": 4.232441037033576e-06, + "loss": 0.0067, + "step": 22531 + }, + { + "epoch": 
9.163074420496137, + "grad_norm": 0.009490574733416557, + "learning_rate": 4.231613376437528e-06, + "loss": 0.0001, + "step": 22532 + }, + { + "epoch": 9.163481089873933, + "grad_norm": 0.028849779014867204, + "learning_rate": 4.2307857750568715e-06, + "loss": 0.0004, + "step": 22533 + }, + { + "epoch": 9.163887759251729, + "grad_norm": 0.013970624385987482, + "learning_rate": 4.2299582329001e-06, + "loss": 0.0001, + "step": 22534 + }, + { + "epoch": 9.164294428629525, + "grad_norm": 0.8252970857399625, + "learning_rate": 4.229130749975713e-06, + "loss": 0.0088, + "step": 22535 + }, + { + "epoch": 9.16470109800732, + "grad_norm": 0.8921164055593264, + "learning_rate": 4.228303326292205e-06, + "loss": 0.0041, + "step": 22536 + }, + { + "epoch": 9.165107767385116, + "grad_norm": 0.7812301500274154, + "learning_rate": 4.227475961858067e-06, + "loss": 0.009, + "step": 22537 + }, + { + "epoch": 9.165514436762912, + "grad_norm": 0.000630051099856158, + "learning_rate": 4.226648656681794e-06, + "loss": 0.0, + "step": 22538 + }, + { + "epoch": 9.165921106140708, + "grad_norm": 0.45547574546381586, + "learning_rate": 4.225821410771877e-06, + "loss": 0.004, + "step": 22539 + }, + { + "epoch": 9.166327775518504, + "grad_norm": 0.003341754415340443, + "learning_rate": 4.2249942241368115e-06, + "loss": 0.0, + "step": 22540 + }, + { + "epoch": 9.1667344448963, + "grad_norm": 0.06322828320363418, + "learning_rate": 4.224167096785082e-06, + "loss": 0.0007, + "step": 22541 + }, + { + "epoch": 9.167141114274095, + "grad_norm": 0.10202835312604604, + "learning_rate": 4.223340028725188e-06, + "loss": 0.0011, + "step": 22542 + }, + { + "epoch": 9.16754778365189, + "grad_norm": 0.08591149588620818, + "learning_rate": 4.222513019965616e-06, + "loss": 0.0006, + "step": 22543 + }, + { + "epoch": 9.167954453029687, + "grad_norm": 8.738956379293008, + "learning_rate": 4.221686070514856e-06, + "loss": 0.0191, + "step": 22544 + }, + { + "epoch": 9.168361122407482, + "grad_norm": 5.519748400339949, + "learning_rate": 4.220859180381395e-06, + "loss": 0.0791, + "step": 22545 + }, + { + "epoch": 9.168767791785278, + "grad_norm": 0.0017030003680948563, + "learning_rate": 4.220032349573724e-06, + "loss": 0.0, + "step": 22546 + }, + { + "epoch": 9.169174461163074, + "grad_norm": 0.255852104227978, + "learning_rate": 4.21920557810033e-06, + "loss": 0.0028, + "step": 22547 + }, + { + "epoch": 9.16958113054087, + "grad_norm": 0.9256270202574868, + "learning_rate": 4.2183788659697e-06, + "loss": 0.0085, + "step": 22548 + }, + { + "epoch": 9.169987799918665, + "grad_norm": 8.46529146822723, + "learning_rate": 4.217552213190321e-06, + "loss": 0.1327, + "step": 22549 + }, + { + "epoch": 9.170394469296461, + "grad_norm": 2.419735521960716, + "learning_rate": 4.216725619770679e-06, + "loss": 0.0288, + "step": 22550 + }, + { + "epoch": 9.170801138674257, + "grad_norm": 0.008259463742070695, + "learning_rate": 4.215899085719259e-06, + "loss": 0.0001, + "step": 22551 + }, + { + "epoch": 9.171207808052054, + "grad_norm": 0.07486295316778757, + "learning_rate": 4.2150726110445425e-06, + "loss": 0.0008, + "step": 22552 + }, + { + "epoch": 9.17161447742985, + "grad_norm": 0.0008060510694468179, + "learning_rate": 4.214246195755021e-06, + "loss": 0.0, + "step": 22553 + }, + { + "epoch": 9.172021146807646, + "grad_norm": 0.029459448861058734, + "learning_rate": 4.213419839859173e-06, + "loss": 0.0002, + "step": 22554 + }, + { + "epoch": 9.172427816185442, + "grad_norm": 0.191766262926512, + "learning_rate": 4.212593543365484e-06, + "loss": 
0.0024, + "step": 22555 + }, + { + "epoch": 9.172834485563238, + "grad_norm": 0.005179147955203616, + "learning_rate": 4.211767306282434e-06, + "loss": 0.0001, + "step": 22556 + }, + { + "epoch": 9.173241154941033, + "grad_norm": 0.2359885884701756, + "learning_rate": 4.210941128618507e-06, + "loss": 0.0037, + "step": 22557 + }, + { + "epoch": 9.173647824318829, + "grad_norm": 0.19946594455539407, + "learning_rate": 4.210115010382182e-06, + "loss": 0.0013, + "step": 22558 + }, + { + "epoch": 9.174054493696625, + "grad_norm": 0.031052277295536012, + "learning_rate": 4.209288951581937e-06, + "loss": 0.0004, + "step": 22559 + }, + { + "epoch": 9.17446116307442, + "grad_norm": 0.06690231644976294, + "learning_rate": 4.208462952226258e-06, + "loss": 0.0005, + "step": 22560 + }, + { + "epoch": 9.174867832452216, + "grad_norm": 0.005192299165833559, + "learning_rate": 4.207637012323622e-06, + "loss": 0.0001, + "step": 22561 + }, + { + "epoch": 9.175274501830012, + "grad_norm": 13.127800985458974, + "learning_rate": 4.2068111318825075e-06, + "loss": 0.1861, + "step": 22562 + }, + { + "epoch": 9.175681171207808, + "grad_norm": 0.06587235363754852, + "learning_rate": 4.205985310911393e-06, + "loss": 0.0009, + "step": 22563 + }, + { + "epoch": 9.176087840585604, + "grad_norm": 0.014383844112351702, + "learning_rate": 4.205159549418756e-06, + "loss": 0.0002, + "step": 22564 + }, + { + "epoch": 9.1764945099634, + "grad_norm": 0.0327962678256964, + "learning_rate": 4.204333847413068e-06, + "loss": 0.0004, + "step": 22565 + }, + { + "epoch": 9.176901179341195, + "grad_norm": 0.3536145021933205, + "learning_rate": 4.203508204902813e-06, + "loss": 0.002, + "step": 22566 + }, + { + "epoch": 9.177307848718991, + "grad_norm": 0.06155752952520911, + "learning_rate": 4.202682621896464e-06, + "loss": 0.0005, + "step": 22567 + }, + { + "epoch": 9.177714518096787, + "grad_norm": 0.008379358555566936, + "learning_rate": 4.201857098402496e-06, + "loss": 0.0001, + "step": 22568 + }, + { + "epoch": 9.178121187474582, + "grad_norm": 0.8745947694027437, + "learning_rate": 4.201031634429383e-06, + "loss": 0.0102, + "step": 22569 + }, + { + "epoch": 9.178527856852378, + "grad_norm": 6.073615949049296, + "learning_rate": 4.200206229985599e-06, + "loss": 0.0847, + "step": 22570 + }, + { + "epoch": 9.178934526230174, + "grad_norm": 0.12126152762468301, + "learning_rate": 4.199380885079617e-06, + "loss": 0.0009, + "step": 22571 + }, + { + "epoch": 9.179341195607972, + "grad_norm": 0.6494496551104937, + "learning_rate": 4.1985555997199055e-06, + "loss": 0.007, + "step": 22572 + }, + { + "epoch": 9.179747864985767, + "grad_norm": 0.9711021545170468, + "learning_rate": 4.197730373914944e-06, + "loss": 0.0117, + "step": 22573 + }, + { + "epoch": 9.180154534363563, + "grad_norm": 0.0006709693355181667, + "learning_rate": 4.196905207673201e-06, + "loss": 0.0, + "step": 22574 + }, + { + "epoch": 9.180561203741359, + "grad_norm": 0.03501724281420196, + "learning_rate": 4.196080101003146e-06, + "loss": 0.0005, + "step": 22575 + }, + { + "epoch": 9.180967873119155, + "grad_norm": 0.002589490819057596, + "learning_rate": 4.195255053913251e-06, + "loss": 0.0, + "step": 22576 + }, + { + "epoch": 9.18137454249695, + "grad_norm": 0.38301953516441783, + "learning_rate": 4.194430066411983e-06, + "loss": 0.0034, + "step": 22577 + }, + { + "epoch": 9.181781211874746, + "grad_norm": 0.8705518777110849, + "learning_rate": 4.1936051385078096e-06, + "loss": 0.0118, + "step": 22578 + }, + { + "epoch": 9.182187881252542, + "grad_norm": 
7.587168226012348, + "learning_rate": 4.192780270209205e-06, + "loss": 0.202, + "step": 22579 + }, + { + "epoch": 9.182594550630338, + "grad_norm": 0.019277893537104775, + "learning_rate": 4.1919554615246335e-06, + "loss": 0.0002, + "step": 22580 + }, + { + "epoch": 9.183001220008133, + "grad_norm": 0.0005182306167918632, + "learning_rate": 4.191130712462563e-06, + "loss": 0.0, + "step": 22581 + }, + { + "epoch": 9.18340788938593, + "grad_norm": 0.00031633995125009736, + "learning_rate": 4.190306023031459e-06, + "loss": 0.0, + "step": 22582 + }, + { + "epoch": 9.183814558763725, + "grad_norm": 0.10592915084406528, + "learning_rate": 4.1894813932397874e-06, + "loss": 0.0012, + "step": 22583 + }, + { + "epoch": 9.18422122814152, + "grad_norm": 4.150217605350346, + "learning_rate": 4.188656823096014e-06, + "loss": 0.0657, + "step": 22584 + }, + { + "epoch": 9.184627897519317, + "grad_norm": 0.01760069997383594, + "learning_rate": 4.1878323126085985e-06, + "loss": 0.0002, + "step": 22585 + }, + { + "epoch": 9.185034566897112, + "grad_norm": 0.06748741306238334, + "learning_rate": 4.187007861786015e-06, + "loss": 0.0006, + "step": 22586 + }, + { + "epoch": 9.185441236274908, + "grad_norm": 0.21745354273772377, + "learning_rate": 4.186183470636719e-06, + "loss": 0.0024, + "step": 22587 + }, + { + "epoch": 9.185847905652704, + "grad_norm": 0.15521479335229865, + "learning_rate": 4.185359139169177e-06, + "loss": 0.0012, + "step": 22588 + }, + { + "epoch": 9.1862545750305, + "grad_norm": 0.1048717657571791, + "learning_rate": 4.184534867391849e-06, + "loss": 0.0008, + "step": 22589 + }, + { + "epoch": 9.186661244408295, + "grad_norm": 0.20280315913010646, + "learning_rate": 4.183710655313199e-06, + "loss": 0.0019, + "step": 22590 + }, + { + "epoch": 9.187067913786091, + "grad_norm": 0.6687933993939932, + "learning_rate": 4.182886502941682e-06, + "loss": 0.0061, + "step": 22591 + }, + { + "epoch": 9.187474583163887, + "grad_norm": 0.300675351616519, + "learning_rate": 4.182062410285767e-06, + "loss": 0.0025, + "step": 22592 + }, + { + "epoch": 9.187881252541684, + "grad_norm": 0.05823108151642727, + "learning_rate": 4.181238377353909e-06, + "loss": 0.0006, + "step": 22593 + }, + { + "epoch": 9.18828792191948, + "grad_norm": 2.5050230575362473, + "learning_rate": 4.1804144041545665e-06, + "loss": 0.029, + "step": 22594 + }, + { + "epoch": 9.188694591297276, + "grad_norm": 2.309456644325734, + "learning_rate": 4.179590490696201e-06, + "loss": 0.0091, + "step": 22595 + }, + { + "epoch": 9.189101260675072, + "grad_norm": 0.014380044946189179, + "learning_rate": 4.178766636987267e-06, + "loss": 0.0001, + "step": 22596 + }, + { + "epoch": 9.189507930052867, + "grad_norm": 0.042838692163359345, + "learning_rate": 4.177942843036223e-06, + "loss": 0.0005, + "step": 22597 + }, + { + "epoch": 9.189914599430663, + "grad_norm": 0.8401583350957536, + "learning_rate": 4.1771191088515236e-06, + "loss": 0.0055, + "step": 22598 + }, + { + "epoch": 9.190321268808459, + "grad_norm": 0.010014217565758394, + "learning_rate": 4.176295434441631e-06, + "loss": 0.0001, + "step": 22599 + }, + { + "epoch": 9.190727938186255, + "grad_norm": 0.37263392180641797, + "learning_rate": 4.175471819814996e-06, + "loss": 0.0038, + "step": 22600 + }, + { + "epoch": 9.19113460756405, + "grad_norm": 1.2441691316240615, + "learning_rate": 4.174648264980073e-06, + "loss": 0.0118, + "step": 22601 + }, + { + "epoch": 9.191541276941846, + "grad_norm": 0.015353826063351613, + "learning_rate": 4.173824769945319e-06, + "loss": 0.0001, + "step": 
22602 + }, + { + "epoch": 9.191947946319642, + "grad_norm": 0.06176712474225471, + "learning_rate": 4.173001334719186e-06, + "loss": 0.0005, + "step": 22603 + }, + { + "epoch": 9.192354615697438, + "grad_norm": 0.0032137301375162854, + "learning_rate": 4.172177959310122e-06, + "loss": 0.0, + "step": 22604 + }, + { + "epoch": 9.192761285075234, + "grad_norm": 0.3891029636342918, + "learning_rate": 4.171354643726589e-06, + "loss": 0.0043, + "step": 22605 + }, + { + "epoch": 9.19316795445303, + "grad_norm": 1.574714398329897, + "learning_rate": 4.170531387977034e-06, + "loss": 0.037, + "step": 22606 + }, + { + "epoch": 9.193574623830825, + "grad_norm": 0.19912346077296308, + "learning_rate": 4.1697081920699075e-06, + "loss": 0.0028, + "step": 22607 + }, + { + "epoch": 9.193981293208621, + "grad_norm": 0.009669464489040843, + "learning_rate": 4.168885056013662e-06, + "loss": 0.0001, + "step": 22608 + }, + { + "epoch": 9.194387962586417, + "grad_norm": 0.02438703708250418, + "learning_rate": 4.168061979816746e-06, + "loss": 0.0003, + "step": 22609 + }, + { + "epoch": 9.194794631964212, + "grad_norm": 0.08208791706653494, + "learning_rate": 4.167238963487608e-06, + "loss": 0.0007, + "step": 22610 + }, + { + "epoch": 9.195201301342008, + "grad_norm": 0.0002916805453297217, + "learning_rate": 4.166416007034695e-06, + "loss": 0.0, + "step": 22611 + }, + { + "epoch": 9.195607970719804, + "grad_norm": 1.6546466972921257, + "learning_rate": 4.16559311046646e-06, + "loss": 0.0141, + "step": 22612 + }, + { + "epoch": 9.196014640097602, + "grad_norm": 0.013912387046466756, + "learning_rate": 4.1647702737913486e-06, + "loss": 0.0002, + "step": 22613 + }, + { + "epoch": 9.196421309475397, + "grad_norm": 0.2196081870728638, + "learning_rate": 4.163947497017807e-06, + "loss": 0.0033, + "step": 22614 + }, + { + "epoch": 9.196827978853193, + "grad_norm": 0.08828702128812553, + "learning_rate": 4.1631247801542825e-06, + "loss": 0.001, + "step": 22615 + }, + { + "epoch": 9.197234648230989, + "grad_norm": 2.1619508315468865, + "learning_rate": 4.16230212320922e-06, + "loss": 0.0208, + "step": 22616 + }, + { + "epoch": 9.197641317608785, + "grad_norm": 2.4537034605544363, + "learning_rate": 4.161479526191059e-06, + "loss": 0.0263, + "step": 22617 + }, + { + "epoch": 9.19804798698658, + "grad_norm": 0.054768478650919244, + "learning_rate": 4.160656989108254e-06, + "loss": 0.0005, + "step": 22618 + }, + { + "epoch": 9.198454656364376, + "grad_norm": 0.49941737122923596, + "learning_rate": 4.159834511969243e-06, + "loss": 0.0058, + "step": 22619 + }, + { + "epoch": 9.198861325742172, + "grad_norm": 1.435810600314866, + "learning_rate": 4.159012094782471e-06, + "loss": 0.0132, + "step": 22620 + }, + { + "epoch": 9.199267995119968, + "grad_norm": 0.0003952806931228103, + "learning_rate": 4.1581897375563785e-06, + "loss": 0.0, + "step": 22621 + }, + { + "epoch": 9.199674664497763, + "grad_norm": 0.0045200457246716876, + "learning_rate": 4.157367440299409e-06, + "loss": 0.0001, + "step": 22622 + }, + { + "epoch": 9.20008133387556, + "grad_norm": 0.007395725908019571, + "learning_rate": 4.156545203020004e-06, + "loss": 0.0001, + "step": 22623 + }, + { + "epoch": 9.200488003253355, + "grad_norm": 0.006137303039187737, + "learning_rate": 4.155723025726599e-06, + "loss": 0.0001, + "step": 22624 + }, + { + "epoch": 9.20089467263115, + "grad_norm": 0.01919988439225642, + "learning_rate": 4.154900908427642e-06, + "loss": 0.0002, + "step": 22625 + }, + { + "epoch": 9.201301342008946, + "grad_norm": 0.08787135079315177, + 
"learning_rate": 4.154078851131569e-06, + "loss": 0.0014, + "step": 22626 + }, + { + "epoch": 9.201708011386742, + "grad_norm": 0.07752175289171918, + "learning_rate": 4.153256853846818e-06, + "loss": 0.0008, + "step": 22627 + }, + { + "epoch": 9.202114680764538, + "grad_norm": 0.10913292722488378, + "learning_rate": 4.15243491658183e-06, + "loss": 0.001, + "step": 22628 + }, + { + "epoch": 9.202521350142334, + "grad_norm": 0.04572767928833778, + "learning_rate": 4.1516130393450395e-06, + "loss": 0.0003, + "step": 22629 + }, + { + "epoch": 9.20292801952013, + "grad_norm": 0.09176980764729972, + "learning_rate": 4.150791222144884e-06, + "loss": 0.0006, + "step": 22630 + }, + { + "epoch": 9.203334688897925, + "grad_norm": 0.11074064439056609, + "learning_rate": 4.149969464989798e-06, + "loss": 0.0012, + "step": 22631 + }, + { + "epoch": 9.203741358275721, + "grad_norm": 0.0006025616792133606, + "learning_rate": 4.149147767888223e-06, + "loss": 0.0, + "step": 22632 + }, + { + "epoch": 9.204148027653517, + "grad_norm": 0.07456284329227879, + "learning_rate": 4.148326130848591e-06, + "loss": 0.0014, + "step": 22633 + }, + { + "epoch": 9.204554697031314, + "grad_norm": 0.11378647085045425, + "learning_rate": 4.147504553879336e-06, + "loss": 0.001, + "step": 22634 + }, + { + "epoch": 9.20496136640911, + "grad_norm": 0.1949979877531529, + "learning_rate": 4.146683036988892e-06, + "loss": 0.0018, + "step": 22635 + }, + { + "epoch": 9.205368035786906, + "grad_norm": 0.012706243405919733, + "learning_rate": 4.145861580185693e-06, + "loss": 0.0002, + "step": 22636 + }, + { + "epoch": 9.205774705164702, + "grad_norm": 9.617342034282872, + "learning_rate": 4.145040183478167e-06, + "loss": 0.1282, + "step": 22637 + }, + { + "epoch": 9.206181374542497, + "grad_norm": 0.15056094796440397, + "learning_rate": 4.144218846874755e-06, + "loss": 0.0014, + "step": 22638 + }, + { + "epoch": 9.206588043920293, + "grad_norm": 0.0023468319661148858, + "learning_rate": 4.143397570383882e-06, + "loss": 0.0, + "step": 22639 + }, + { + "epoch": 9.206994713298089, + "grad_norm": 0.011502658753492158, + "learning_rate": 4.1425763540139815e-06, + "loss": 0.0001, + "step": 22640 + }, + { + "epoch": 9.207401382675885, + "grad_norm": 0.05021430737613787, + "learning_rate": 4.141755197773482e-06, + "loss": 0.0007, + "step": 22641 + }, + { + "epoch": 9.20780805205368, + "grad_norm": 0.054949690955283355, + "learning_rate": 4.140934101670815e-06, + "loss": 0.0005, + "step": 22642 + }, + { + "epoch": 9.208214721431476, + "grad_norm": 0.0012971998757982898, + "learning_rate": 4.140113065714409e-06, + "loss": 0.0, + "step": 22643 + }, + { + "epoch": 9.208621390809272, + "grad_norm": 0.061931666215064074, + "learning_rate": 4.1392920899126855e-06, + "loss": 0.0007, + "step": 22644 + }, + { + "epoch": 9.209028060187068, + "grad_norm": 4.118796759506042, + "learning_rate": 4.138471174274087e-06, + "loss": 0.0801, + "step": 22645 + }, + { + "epoch": 9.209434729564864, + "grad_norm": 1.4488547099575524, + "learning_rate": 4.137650318807028e-06, + "loss": 0.0177, + "step": 22646 + }, + { + "epoch": 9.20984139894266, + "grad_norm": 1.1934938002666053, + "learning_rate": 4.136829523519939e-06, + "loss": 0.0079, + "step": 22647 + }, + { + "epoch": 9.210248068320455, + "grad_norm": 0.034571164253957676, + "learning_rate": 4.1360087884212456e-06, + "loss": 0.0003, + "step": 22648 + }, + { + "epoch": 9.21065473769825, + "grad_norm": 0.08862127496349281, + "learning_rate": 4.1351881135193695e-06, + "loss": 0.0007, + "step": 22649 + }, + { + 
"epoch": 9.211061407076047, + "grad_norm": 0.38501385837677893, + "learning_rate": 4.134367498822744e-06, + "loss": 0.0035, + "step": 22650 + }, + { + "epoch": 9.211468076453842, + "grad_norm": 0.10419835943438507, + "learning_rate": 4.133546944339787e-06, + "loss": 0.0007, + "step": 22651 + }, + { + "epoch": 9.211874745831638, + "grad_norm": 0.0826053897397055, + "learning_rate": 4.132726450078923e-06, + "loss": 0.001, + "step": 22652 + }, + { + "epoch": 9.212281415209434, + "grad_norm": 0.0014940717104561217, + "learning_rate": 4.131906016048576e-06, + "loss": 0.0, + "step": 22653 + }, + { + "epoch": 9.212688084587231, + "grad_norm": 0.1567999015527029, + "learning_rate": 4.131085642257166e-06, + "loss": 0.0016, + "step": 22654 + }, + { + "epoch": 9.213094753965027, + "grad_norm": 0.10107032623596673, + "learning_rate": 4.1302653287131125e-06, + "loss": 0.0011, + "step": 22655 + }, + { + "epoch": 9.213501423342823, + "grad_norm": 0.044008013015507975, + "learning_rate": 4.129445075424844e-06, + "loss": 0.0004, + "step": 22656 + }, + { + "epoch": 9.213908092720619, + "grad_norm": 0.29945588493790515, + "learning_rate": 4.1286248824007745e-06, + "loss": 0.0023, + "step": 22657 + }, + { + "epoch": 9.214314762098414, + "grad_norm": 7.7982650107391, + "learning_rate": 4.127804749649327e-06, + "loss": 0.0736, + "step": 22658 + }, + { + "epoch": 9.21472143147621, + "grad_norm": 0.014814611865654685, + "learning_rate": 4.1269846771789195e-06, + "loss": 0.0002, + "step": 22659 + }, + { + "epoch": 9.215128100854006, + "grad_norm": 0.05114338050494801, + "learning_rate": 4.126164664997969e-06, + "loss": 0.0001, + "step": 22660 + }, + { + "epoch": 9.215534770231802, + "grad_norm": 0.11324594936935987, + "learning_rate": 4.125344713114895e-06, + "loss": 0.0017, + "step": 22661 + }, + { + "epoch": 9.215941439609598, + "grad_norm": 0.3108229147812949, + "learning_rate": 4.124524821538111e-06, + "loss": 0.0037, + "step": 22662 + }, + { + "epoch": 9.216348108987393, + "grad_norm": 0.5164161474570533, + "learning_rate": 4.123704990276039e-06, + "loss": 0.0057, + "step": 22663 + }, + { + "epoch": 9.216754778365189, + "grad_norm": 0.014537573987316388, + "learning_rate": 4.1228852193370935e-06, + "loss": 0.0001, + "step": 22664 + }, + { + "epoch": 9.217161447742985, + "grad_norm": 0.06094007483976399, + "learning_rate": 4.122065508729689e-06, + "loss": 0.0006, + "step": 22665 + }, + { + "epoch": 9.21756811712078, + "grad_norm": 0.22735301327879795, + "learning_rate": 4.121245858462242e-06, + "loss": 0.0021, + "step": 22666 + }, + { + "epoch": 9.217974786498576, + "grad_norm": 0.010456974140681445, + "learning_rate": 4.120426268543163e-06, + "loss": 0.0001, + "step": 22667 + }, + { + "epoch": 9.218381455876372, + "grad_norm": 0.43007848932782317, + "learning_rate": 4.119606738980863e-06, + "loss": 0.003, + "step": 22668 + }, + { + "epoch": 9.218788125254168, + "grad_norm": 0.025339998439305105, + "learning_rate": 4.118787269783765e-06, + "loss": 0.0003, + "step": 22669 + }, + { + "epoch": 9.219194794631964, + "grad_norm": 0.1399319044268538, + "learning_rate": 4.117967860960275e-06, + "loss": 0.0016, + "step": 22670 + }, + { + "epoch": 9.21960146400976, + "grad_norm": 0.3925144536987626, + "learning_rate": 4.117148512518806e-06, + "loss": 0.0036, + "step": 22671 + }, + { + "epoch": 9.220008133387555, + "grad_norm": 0.003361560537589459, + "learning_rate": 4.116329224467768e-06, + "loss": 0.0, + "step": 22672 + }, + { + "epoch": 9.220414802765351, + "grad_norm": 0.3193098403763589, + "learning_rate": 
4.115509996815571e-06, + "loss": 0.0028, + "step": 22673 + }, + { + "epoch": 9.220821472143147, + "grad_norm": 0.002816853609156086, + "learning_rate": 4.114690829570625e-06, + "loss": 0.0, + "step": 22674 + }, + { + "epoch": 9.221228141520944, + "grad_norm": 0.038646561311899544, + "learning_rate": 4.113871722741337e-06, + "loss": 0.0003, + "step": 22675 + }, + { + "epoch": 9.22163481089874, + "grad_norm": 0.02209249054066649, + "learning_rate": 4.1130526763361225e-06, + "loss": 0.0001, + "step": 22676 + }, + { + "epoch": 9.222041480276536, + "grad_norm": 0.030583702696383788, + "learning_rate": 4.112233690363384e-06, + "loss": 0.0003, + "step": 22677 + }, + { + "epoch": 9.222448149654332, + "grad_norm": 0.04852823120547075, + "learning_rate": 4.11141476483153e-06, + "loss": 0.0004, + "step": 22678 + }, + { + "epoch": 9.222854819032127, + "grad_norm": 0.03738322965089216, + "learning_rate": 4.110595899748967e-06, + "loss": 0.0002, + "step": 22679 + }, + { + "epoch": 9.223261488409923, + "grad_norm": 0.40870177547512987, + "learning_rate": 4.109777095124101e-06, + "loss": 0.0051, + "step": 22680 + }, + { + "epoch": 9.223668157787719, + "grad_norm": 0.0021704298117386066, + "learning_rate": 4.108958350965335e-06, + "loss": 0.0, + "step": 22681 + }, + { + "epoch": 9.224074827165515, + "grad_norm": 0.012760529210264498, + "learning_rate": 4.108139667281079e-06, + "loss": 0.0001, + "step": 22682 + }, + { + "epoch": 9.22448149654331, + "grad_norm": 0.03260135783496367, + "learning_rate": 4.107321044079735e-06, + "loss": 0.0004, + "step": 22683 + }, + { + "epoch": 9.224888165921106, + "grad_norm": 0.00013989312142794163, + "learning_rate": 4.1065024813697066e-06, + "loss": 0.0, + "step": 22684 + }, + { + "epoch": 9.225294835298902, + "grad_norm": 0.0022325568669979796, + "learning_rate": 4.105683979159396e-06, + "loss": 0.0, + "step": 22685 + }, + { + "epoch": 9.225701504676698, + "grad_norm": 0.36829858763154033, + "learning_rate": 4.1048655374572065e-06, + "loss": 0.002, + "step": 22686 + }, + { + "epoch": 9.226108174054493, + "grad_norm": 0.011569057301739745, + "learning_rate": 4.104047156271539e-06, + "loss": 0.0001, + "step": 22687 + }, + { + "epoch": 9.22651484343229, + "grad_norm": 0.005798599692856345, + "learning_rate": 4.103228835610791e-06, + "loss": 0.0001, + "step": 22688 + }, + { + "epoch": 9.226921512810085, + "grad_norm": 8.953155389371362, + "learning_rate": 4.10241057548337e-06, + "loss": 0.0813, + "step": 22689 + }, + { + "epoch": 9.22732818218788, + "grad_norm": 0.11034161569144897, + "learning_rate": 4.101592375897673e-06, + "loss": 0.001, + "step": 22690 + }, + { + "epoch": 9.227734851565677, + "grad_norm": 1.4389965375994525, + "learning_rate": 4.100774236862098e-06, + "loss": 0.0115, + "step": 22691 + }, + { + "epoch": 9.228141520943472, + "grad_norm": 0.17538002462083285, + "learning_rate": 4.099956158385045e-06, + "loss": 0.0015, + "step": 22692 + }, + { + "epoch": 9.228548190321268, + "grad_norm": 0.3264931175873377, + "learning_rate": 4.099138140474912e-06, + "loss": 0.0023, + "step": 22693 + }, + { + "epoch": 9.228954859699064, + "grad_norm": 0.04050822745455946, + "learning_rate": 4.098320183140092e-06, + "loss": 0.0005, + "step": 22694 + }, + { + "epoch": 9.229361529076861, + "grad_norm": 0.14876966887969822, + "learning_rate": 4.097502286388989e-06, + "loss": 0.0011, + "step": 22695 + }, + { + "epoch": 9.229768198454657, + "grad_norm": 0.28814147251287947, + "learning_rate": 4.096684450229996e-06, + "loss": 0.0022, + "step": 22696 + }, + { + "epoch": 
9.230174867832453, + "grad_norm": 0.04484666843016658, + "learning_rate": 4.095866674671508e-06, + "loss": 0.0004, + "step": 22697 + }, + { + "epoch": 9.230581537210249, + "grad_norm": 9.446002257429877, + "learning_rate": 4.09504895972192e-06, + "loss": 0.1126, + "step": 22698 + }, + { + "epoch": 9.230988206588044, + "grad_norm": 12.496452827219427, + "learning_rate": 4.094231305389627e-06, + "loss": 0.1191, + "step": 22699 + }, + { + "epoch": 9.23139487596584, + "grad_norm": 0.0002882298593127238, + "learning_rate": 4.093413711683021e-06, + "loss": 0.0, + "step": 22700 + }, + { + "epoch": 9.231801545343636, + "grad_norm": 1.0717243225455861, + "learning_rate": 4.092596178610493e-06, + "loss": 0.0097, + "step": 22701 + }, + { + "epoch": 9.232208214721432, + "grad_norm": 0.0023043681228821756, + "learning_rate": 4.091778706180442e-06, + "loss": 0.0, + "step": 22702 + }, + { + "epoch": 9.232614884099227, + "grad_norm": 0.25918317538105706, + "learning_rate": 4.0909612944012555e-06, + "loss": 0.0023, + "step": 22703 + }, + { + "epoch": 9.233021553477023, + "grad_norm": 0.05206857122962562, + "learning_rate": 4.090143943281325e-06, + "loss": 0.0008, + "step": 22704 + }, + { + "epoch": 9.233428222854819, + "grad_norm": 0.01610427042999357, + "learning_rate": 4.0893266528290425e-06, + "loss": 0.0001, + "step": 22705 + }, + { + "epoch": 9.233834892232615, + "grad_norm": 0.0008085604605800236, + "learning_rate": 4.0885094230527954e-06, + "loss": 0.0, + "step": 22706 + }, + { + "epoch": 9.23424156161041, + "grad_norm": 0.0050248710898789005, + "learning_rate": 4.087692253960972e-06, + "loss": 0.0001, + "step": 22707 + }, + { + "epoch": 9.234648230988206, + "grad_norm": 0.0012432594445787057, + "learning_rate": 4.086875145561966e-06, + "loss": 0.0, + "step": 22708 + }, + { + "epoch": 9.235054900366002, + "grad_norm": 0.03342170684068047, + "learning_rate": 4.086058097864163e-06, + "loss": 0.0005, + "step": 22709 + }, + { + "epoch": 9.235461569743798, + "grad_norm": 0.2779900039221879, + "learning_rate": 4.0852411108759495e-06, + "loss": 0.0032, + "step": 22710 + }, + { + "epoch": 9.235868239121594, + "grad_norm": 0.004620131251382297, + "learning_rate": 4.084424184605713e-06, + "loss": 0.0001, + "step": 22711 + }, + { + "epoch": 9.23627490849939, + "grad_norm": 0.40152048545858554, + "learning_rate": 4.0836073190618395e-06, + "loss": 0.0034, + "step": 22712 + }, + { + "epoch": 9.236681577877185, + "grad_norm": 0.07928889953323834, + "learning_rate": 4.082790514252715e-06, + "loss": 0.0012, + "step": 22713 + }, + { + "epoch": 9.237088247254981, + "grad_norm": 0.013172348541675883, + "learning_rate": 4.08197377018672e-06, + "loss": 0.0002, + "step": 22714 + }, + { + "epoch": 9.237494916632777, + "grad_norm": 0.009347016410225754, + "learning_rate": 4.081157086872246e-06, + "loss": 0.0001, + "step": 22715 + }, + { + "epoch": 9.237901586010574, + "grad_norm": 4.560390366803286, + "learning_rate": 4.080340464317675e-06, + "loss": 0.0323, + "step": 22716 + }, + { + "epoch": 9.23830825538837, + "grad_norm": 0.18827635852248173, + "learning_rate": 4.079523902531387e-06, + "loss": 0.0019, + "step": 22717 + }, + { + "epoch": 9.238714924766166, + "grad_norm": 0.0324855771517634, + "learning_rate": 4.078707401521765e-06, + "loss": 0.0003, + "step": 22718 + }, + { + "epoch": 9.239121594143962, + "grad_norm": 0.14417877072440863, + "learning_rate": 4.077890961297193e-06, + "loss": 0.0016, + "step": 22719 + }, + { + "epoch": 9.239528263521757, + "grad_norm": 1.5829926468877549, + "learning_rate": 
4.0770745818660475e-06, + "loss": 0.0177, + "step": 22720 + }, + { + "epoch": 9.239934932899553, + "grad_norm": 0.10713211988935505, + "learning_rate": 4.076258263236715e-06, + "loss": 0.0012, + "step": 22721 + }, + { + "epoch": 9.240341602277349, + "grad_norm": 0.0195816725086615, + "learning_rate": 4.0754420054175735e-06, + "loss": 0.0001, + "step": 22722 + }, + { + "epoch": 9.240748271655145, + "grad_norm": 0.597605246578613, + "learning_rate": 4.074625808417001e-06, + "loss": 0.0043, + "step": 22723 + }, + { + "epoch": 9.24115494103294, + "grad_norm": 0.7884964686114074, + "learning_rate": 4.073809672243377e-06, + "loss": 0.0047, + "step": 22724 + }, + { + "epoch": 9.241561610410736, + "grad_norm": 0.8169298533816676, + "learning_rate": 4.072993596905079e-06, + "loss": 0.0098, + "step": 22725 + }, + { + "epoch": 9.241968279788532, + "grad_norm": 21.89703053239125, + "learning_rate": 4.072177582410486e-06, + "loss": 0.3791, + "step": 22726 + }, + { + "epoch": 9.242374949166328, + "grad_norm": 0.009802272931692722, + "learning_rate": 4.071361628767968e-06, + "loss": 0.0001, + "step": 22727 + }, + { + "epoch": 9.242781618544123, + "grad_norm": 0.0025796262581913956, + "learning_rate": 4.0705457359859114e-06, + "loss": 0.0, + "step": 22728 + }, + { + "epoch": 9.24318828792192, + "grad_norm": 2.935638814775727, + "learning_rate": 4.069729904072687e-06, + "loss": 0.0527, + "step": 22729 + }, + { + "epoch": 9.243594957299715, + "grad_norm": 0.6885445838071984, + "learning_rate": 4.06891413303667e-06, + "loss": 0.0099, + "step": 22730 + }, + { + "epoch": 9.24400162667751, + "grad_norm": 2.093448544988343, + "learning_rate": 4.068098422886234e-06, + "loss": 0.0185, + "step": 22731 + }, + { + "epoch": 9.244408296055306, + "grad_norm": 0.03952400586523866, + "learning_rate": 4.067282773629754e-06, + "loss": 0.0003, + "step": 22732 + }, + { + "epoch": 9.244814965433102, + "grad_norm": 0.36600943469021346, + "learning_rate": 4.066467185275598e-06, + "loss": 0.0016, + "step": 22733 + }, + { + "epoch": 9.245221634810898, + "grad_norm": 0.0032156060631296592, + "learning_rate": 4.065651657832146e-06, + "loss": 0.0, + "step": 22734 + }, + { + "epoch": 9.245628304188694, + "grad_norm": 1.8085117292523916, + "learning_rate": 4.064836191307766e-06, + "loss": 0.0179, + "step": 22735 + }, + { + "epoch": 9.246034973566491, + "grad_norm": 0.016277732003094972, + "learning_rate": 4.064020785710831e-06, + "loss": 0.0002, + "step": 22736 + }, + { + "epoch": 9.246441642944287, + "grad_norm": 0.5004008558139978, + "learning_rate": 4.063205441049709e-06, + "loss": 0.0048, + "step": 22737 + }, + { + "epoch": 9.246848312322083, + "grad_norm": 2.985333249915987, + "learning_rate": 4.0623901573327705e-06, + "loss": 0.0399, + "step": 22738 + }, + { + "epoch": 9.247254981699879, + "grad_norm": 0.016917984601394813, + "learning_rate": 4.061574934568387e-06, + "loss": 0.0002, + "step": 22739 + }, + { + "epoch": 9.247661651077674, + "grad_norm": 0.01434013257941723, + "learning_rate": 4.060759772764922e-06, + "loss": 0.0001, + "step": 22740 + }, + { + "epoch": 9.24806832045547, + "grad_norm": 0.16320316755769565, + "learning_rate": 4.059944671930749e-06, + "loss": 0.0014, + "step": 22741 + }, + { + "epoch": 9.248474989833266, + "grad_norm": 0.08801199706839094, + "learning_rate": 4.0591296320742345e-06, + "loss": 0.0009, + "step": 22742 + }, + { + "epoch": 9.248881659211062, + "grad_norm": 0.44855757725501294, + "learning_rate": 4.058314653203744e-06, + "loss": 0.0033, + "step": 22743 + }, + { + "epoch": 9.249288328588857, 
+ "grad_norm": 0.02621002247056167, + "learning_rate": 4.057499735327644e-06, + "loss": 0.0002, + "step": 22744 + }, + { + "epoch": 9.249694997966653, + "grad_norm": 0.02403296578445164, + "learning_rate": 4.056684878454301e-06, + "loss": 0.0003, + "step": 22745 + }, + { + "epoch": 9.250101667344449, + "grad_norm": 0.0022979715818534437, + "learning_rate": 4.055870082592079e-06, + "loss": 0.0, + "step": 22746 + }, + { + "epoch": 9.250508336722245, + "grad_norm": 0.033500672644036014, + "learning_rate": 4.055055347749341e-06, + "loss": 0.0006, + "step": 22747 + }, + { + "epoch": 9.25091500610004, + "grad_norm": 0.027598914677979956, + "learning_rate": 4.054240673934452e-06, + "loss": 0.0004, + "step": 22748 + }, + { + "epoch": 9.251321675477836, + "grad_norm": 0.027315128734843576, + "learning_rate": 4.053426061155775e-06, + "loss": 0.0004, + "step": 22749 + }, + { + "epoch": 9.251728344855632, + "grad_norm": 0.009327740215628847, + "learning_rate": 4.052611509421673e-06, + "loss": 0.0002, + "step": 22750 + }, + { + "epoch": 9.252135014233428, + "grad_norm": 0.298550311797846, + "learning_rate": 4.051797018740506e-06, + "loss": 0.003, + "step": 22751 + }, + { + "epoch": 9.252541683611224, + "grad_norm": 0.06861904240083341, + "learning_rate": 4.050982589120634e-06, + "loss": 0.0009, + "step": 22752 + }, + { + "epoch": 9.25294835298902, + "grad_norm": 0.12864846926498136, + "learning_rate": 4.050168220570422e-06, + "loss": 0.0011, + "step": 22753 + }, + { + "epoch": 9.253355022366815, + "grad_norm": 0.45631109889483545, + "learning_rate": 4.0493539130982284e-06, + "loss": 0.0061, + "step": 22754 + }, + { + "epoch": 9.25376169174461, + "grad_norm": 0.0007540983512579919, + "learning_rate": 4.048539666712412e-06, + "loss": 0.0, + "step": 22755 + }, + { + "epoch": 9.254168361122407, + "grad_norm": 0.0048108584228854796, + "learning_rate": 4.047725481421331e-06, + "loss": 0.0001, + "step": 22756 + }, + { + "epoch": 9.254575030500204, + "grad_norm": 0.00014598694414223146, + "learning_rate": 4.046911357233343e-06, + "loss": 0.0, + "step": 22757 + }, + { + "epoch": 9.254981699878, + "grad_norm": 0.2925054039936118, + "learning_rate": 4.046097294156803e-06, + "loss": 0.004, + "step": 22758 + }, + { + "epoch": 9.255388369255796, + "grad_norm": 0.000148907836418293, + "learning_rate": 4.045283292200075e-06, + "loss": 0.0, + "step": 22759 + }, + { + "epoch": 9.255795038633591, + "grad_norm": 0.04153720921174634, + "learning_rate": 4.044469351371511e-06, + "loss": 0.0003, + "step": 22760 + }, + { + "epoch": 9.256201708011387, + "grad_norm": 0.07543814329734044, + "learning_rate": 4.043655471679465e-06, + "loss": 0.0008, + "step": 22761 + }, + { + "epoch": 9.256608377389183, + "grad_norm": 0.0002615211672700363, + "learning_rate": 4.0428416531322946e-06, + "loss": 0.0, + "step": 22762 + }, + { + "epoch": 9.257015046766979, + "grad_norm": 0.06993643890942343, + "learning_rate": 4.042027895738353e-06, + "loss": 0.0007, + "step": 22763 + }, + { + "epoch": 9.257421716144774, + "grad_norm": 0.24073554160116015, + "learning_rate": 4.041214199505993e-06, + "loss": 0.0019, + "step": 22764 + }, + { + "epoch": 9.25782838552257, + "grad_norm": 0.32120711292422344, + "learning_rate": 4.040400564443564e-06, + "loss": 0.0052, + "step": 22765 + }, + { + "epoch": 9.258235054900366, + "grad_norm": 0.30223751202777166, + "learning_rate": 4.039586990559428e-06, + "loss": 0.003, + "step": 22766 + }, + { + "epoch": 9.258641724278162, + "grad_norm": 9.401981990045124, + "learning_rate": 4.038773477861929e-06, + "loss": 
0.1558, + "step": 22767 + }, + { + "epoch": 9.259048393655958, + "grad_norm": 0.023031677117884848, + "learning_rate": 4.037960026359422e-06, + "loss": 0.0002, + "step": 22768 + }, + { + "epoch": 9.259455063033753, + "grad_norm": 0.11208014407599838, + "learning_rate": 4.037146636060256e-06, + "loss": 0.0007, + "step": 22769 + }, + { + "epoch": 9.259861732411549, + "grad_norm": 0.0217335078829401, + "learning_rate": 4.03633330697278e-06, + "loss": 0.0003, + "step": 22770 + }, + { + "epoch": 9.260268401789345, + "grad_norm": 0.0002158658972246514, + "learning_rate": 4.035520039105341e-06, + "loss": 0.0, + "step": 22771 + }, + { + "epoch": 9.26067507116714, + "grad_norm": 0.009851440744028639, + "learning_rate": 4.034706832466295e-06, + "loss": 0.0001, + "step": 22772 + }, + { + "epoch": 9.261081740544936, + "grad_norm": 0.10933325851412667, + "learning_rate": 4.033893687063986e-06, + "loss": 0.001, + "step": 22773 + }, + { + "epoch": 9.261488409922732, + "grad_norm": 0.0005863234743833781, + "learning_rate": 4.033080602906759e-06, + "loss": 0.0, + "step": 22774 + }, + { + "epoch": 9.261895079300528, + "grad_norm": 0.10855590700097165, + "learning_rate": 4.032267580002963e-06, + "loss": 0.0012, + "step": 22775 + }, + { + "epoch": 9.262301748678324, + "grad_norm": 9.59422844989123, + "learning_rate": 4.031454618360945e-06, + "loss": 0.2853, + "step": 22776 + }, + { + "epoch": 9.262708418056121, + "grad_norm": 6.744644826716837, + "learning_rate": 4.030641717989049e-06, + "loss": 0.072, + "step": 22777 + }, + { + "epoch": 9.263115087433917, + "grad_norm": 0.08361282251083797, + "learning_rate": 4.029828878895616e-06, + "loss": 0.0009, + "step": 22778 + }, + { + "epoch": 9.263521756811713, + "grad_norm": 1.6853150487549626, + "learning_rate": 4.029016101088998e-06, + "loss": 0.0215, + "step": 22779 + }, + { + "epoch": 9.263928426189509, + "grad_norm": 0.0079019934458484, + "learning_rate": 4.028203384577534e-06, + "loss": 0.0001, + "step": 22780 + }, + { + "epoch": 9.264335095567304, + "grad_norm": 0.010474189002026454, + "learning_rate": 4.027390729369569e-06, + "loss": 0.0001, + "step": 22781 + }, + { + "epoch": 9.2647417649451, + "grad_norm": 0.03042455425585577, + "learning_rate": 4.0265781354734445e-06, + "loss": 0.0004, + "step": 22782 + }, + { + "epoch": 9.265148434322896, + "grad_norm": 0.11278273141345348, + "learning_rate": 4.025765602897501e-06, + "loss": 0.0008, + "step": 22783 + }, + { + "epoch": 9.265555103700692, + "grad_norm": 0.7517515804338017, + "learning_rate": 4.0249531316500765e-06, + "loss": 0.0087, + "step": 22784 + }, + { + "epoch": 9.265961773078487, + "grad_norm": 0.5585642713907972, + "learning_rate": 4.024140721739519e-06, + "loss": 0.0057, + "step": 22785 + }, + { + "epoch": 9.266368442456283, + "grad_norm": 0.04897030326186972, + "learning_rate": 4.023328373174165e-06, + "loss": 0.0005, + "step": 22786 + }, + { + "epoch": 9.266775111834079, + "grad_norm": 6.902083100396734e-05, + "learning_rate": 4.022516085962354e-06, + "loss": 0.0, + "step": 22787 + }, + { + "epoch": 9.267181781211875, + "grad_norm": 0.04434042086419841, + "learning_rate": 4.021703860112422e-06, + "loss": 0.0003, + "step": 22788 + }, + { + "epoch": 9.26758845058967, + "grad_norm": 0.4195011948359303, + "learning_rate": 4.020891695632709e-06, + "loss": 0.0034, + "step": 22789 + }, + { + "epoch": 9.267995119967466, + "grad_norm": 0.056682157877082494, + "learning_rate": 4.0200795925315525e-06, + "loss": 0.0005, + "step": 22790 + }, + { + "epoch": 9.268401789345262, + "grad_norm": 
0.5288895697337058, + "learning_rate": 4.019267550817285e-06, + "loss": 0.0037, + "step": 22791 + }, + { + "epoch": 9.268808458723058, + "grad_norm": 0.4018841954510072, + "learning_rate": 4.0184555704982495e-06, + "loss": 0.0033, + "step": 22792 + }, + { + "epoch": 9.269215128100853, + "grad_norm": 0.00720753952401428, + "learning_rate": 4.0176436515827785e-06, + "loss": 0.0001, + "step": 22793 + }, + { + "epoch": 9.26962179747865, + "grad_norm": 0.005799385708007291, + "learning_rate": 4.016831794079207e-06, + "loss": 0.0, + "step": 22794 + }, + { + "epoch": 9.270028466856445, + "grad_norm": 0.008679419339804437, + "learning_rate": 4.0160199979958666e-06, + "loss": 0.0001, + "step": 22795 + }, + { + "epoch": 9.27043513623424, + "grad_norm": 0.06885134581339382, + "learning_rate": 4.0152082633410935e-06, + "loss": 0.0007, + "step": 22796 + }, + { + "epoch": 9.270841805612037, + "grad_norm": 0.009440786924874357, + "learning_rate": 4.014396590123216e-06, + "loss": 0.0001, + "step": 22797 + }, + { + "epoch": 9.271248474989834, + "grad_norm": 0.004632255101625967, + "learning_rate": 4.013584978350573e-06, + "loss": 0.0001, + "step": 22798 + }, + { + "epoch": 9.27165514436763, + "grad_norm": 0.005911239373662911, + "learning_rate": 4.012773428031495e-06, + "loss": 0.0001, + "step": 22799 + }, + { + "epoch": 9.272061813745426, + "grad_norm": 0.01905517542068318, + "learning_rate": 4.011961939174309e-06, + "loss": 0.0001, + "step": 22800 + }, + { + "epoch": 9.272468483123221, + "grad_norm": 0.09414130674227307, + "learning_rate": 4.011150511787348e-06, + "loss": 0.0005, + "step": 22801 + }, + { + "epoch": 9.272875152501017, + "grad_norm": 0.9203552138232944, + "learning_rate": 4.010339145878941e-06, + "loss": 0.0105, + "step": 22802 + }, + { + "epoch": 9.273281821878813, + "grad_norm": 0.0016443717737155584, + "learning_rate": 4.009527841457418e-06, + "loss": 0.0, + "step": 22803 + }, + { + "epoch": 9.273688491256609, + "grad_norm": 1.2553721891210725, + "learning_rate": 4.008716598531103e-06, + "loss": 0.011, + "step": 22804 + }, + { + "epoch": 9.274095160634404, + "grad_norm": 4.863652879617975, + "learning_rate": 4.0079054171083294e-06, + "loss": 0.0529, + "step": 22805 + }, + { + "epoch": 9.2745018300122, + "grad_norm": 0.003109475016580762, + "learning_rate": 4.007094297197425e-06, + "loss": 0.0001, + "step": 22806 + }, + { + "epoch": 9.274908499389996, + "grad_norm": 0.0020084611718124996, + "learning_rate": 4.006283238806712e-06, + "loss": 0.0, + "step": 22807 + }, + { + "epoch": 9.275315168767792, + "grad_norm": 0.3232738202170418, + "learning_rate": 4.005472241944518e-06, + "loss": 0.0067, + "step": 22808 + }, + { + "epoch": 9.275721838145587, + "grad_norm": 0.6501047254030184, + "learning_rate": 4.00466130661917e-06, + "loss": 0.0043, + "step": 22809 + }, + { + "epoch": 9.276128507523383, + "grad_norm": 0.00018536084736450677, + "learning_rate": 4.003850432838986e-06, + "loss": 0.0, + "step": 22810 + }, + { + "epoch": 9.276535176901179, + "grad_norm": 1.0595560171634144, + "learning_rate": 4.003039620612299e-06, + "loss": 0.0123, + "step": 22811 + }, + { + "epoch": 9.276941846278975, + "grad_norm": 0.08284386089611936, + "learning_rate": 4.002228869947428e-06, + "loss": 0.001, + "step": 22812 + }, + { + "epoch": 9.27734851565677, + "grad_norm": 0.010756551979439772, + "learning_rate": 4.001418180852698e-06, + "loss": 0.0002, + "step": 22813 + }, + { + "epoch": 9.277755185034566, + "grad_norm": 0.2046297820053984, + "learning_rate": 4.000607553336427e-06, + "loss": 0.0015, + "step": 
22814 + }, + { + "epoch": 9.278161854412362, + "grad_norm": 0.01796677918700108, + "learning_rate": 3.999796987406941e-06, + "loss": 0.0001, + "step": 22815 + }, + { + "epoch": 9.278568523790158, + "grad_norm": 0.02215349989283956, + "learning_rate": 3.998986483072558e-06, + "loss": 0.0002, + "step": 22816 + }, + { + "epoch": 9.278975193167954, + "grad_norm": 0.011572926006737138, + "learning_rate": 3.998176040341596e-06, + "loss": 0.0001, + "step": 22817 + }, + { + "epoch": 9.279381862545751, + "grad_norm": 2.962861312414428, + "learning_rate": 3.99736565922238e-06, + "loss": 0.0247, + "step": 22818 + }, + { + "epoch": 9.279788531923547, + "grad_norm": 0.007629882305169132, + "learning_rate": 3.996555339723227e-06, + "loss": 0.0001, + "step": 22819 + }, + { + "epoch": 9.280195201301343, + "grad_norm": 3.4764065312960715, + "learning_rate": 3.995745081852456e-06, + "loss": 0.0608, + "step": 22820 + }, + { + "epoch": 9.280601870679138, + "grad_norm": 0.016016142926480088, + "learning_rate": 3.994934885618382e-06, + "loss": 0.0002, + "step": 22821 + }, + { + "epoch": 9.281008540056934, + "grad_norm": 0.34932812920047046, + "learning_rate": 3.994124751029324e-06, + "loss": 0.0056, + "step": 22822 + }, + { + "epoch": 9.28141520943473, + "grad_norm": 0.5657727417841765, + "learning_rate": 3.993314678093595e-06, + "loss": 0.0046, + "step": 22823 + }, + { + "epoch": 9.281821878812526, + "grad_norm": 0.009611962801647344, + "learning_rate": 3.992504666819518e-06, + "loss": 0.0001, + "step": 22824 + }, + { + "epoch": 9.282228548190322, + "grad_norm": 0.023007165323994962, + "learning_rate": 3.991694717215403e-06, + "loss": 0.0002, + "step": 22825 + }, + { + "epoch": 9.282635217568117, + "grad_norm": 0.011325069840663669, + "learning_rate": 3.990884829289566e-06, + "loss": 0.0001, + "step": 22826 + }, + { + "epoch": 9.283041886945913, + "grad_norm": 0.006767810688155579, + "learning_rate": 3.990075003050321e-06, + "loss": 0.0001, + "step": 22827 + }, + { + "epoch": 9.283448556323709, + "grad_norm": 4.812187566016663, + "learning_rate": 3.989265238505981e-06, + "loss": 0.0983, + "step": 22828 + }, + { + "epoch": 9.283855225701505, + "grad_norm": 0.006004108673041096, + "learning_rate": 3.988455535664858e-06, + "loss": 0.0001, + "step": 22829 + }, + { + "epoch": 9.2842618950793, + "grad_norm": 0.015574488142112103, + "learning_rate": 3.98764589453526e-06, + "loss": 0.0002, + "step": 22830 + }, + { + "epoch": 9.284668564457096, + "grad_norm": 0.0021468927806640183, + "learning_rate": 3.986836315125508e-06, + "loss": 0.0, + "step": 22831 + }, + { + "epoch": 9.285075233834892, + "grad_norm": 0.18891704521424896, + "learning_rate": 3.986026797443907e-06, + "loss": 0.0014, + "step": 22832 + }, + { + "epoch": 9.285481903212688, + "grad_norm": 0.005281167205321308, + "learning_rate": 3.985217341498767e-06, + "loss": 0.0, + "step": 22833 + }, + { + "epoch": 9.285888572590483, + "grad_norm": 9.063314920403482, + "learning_rate": 3.984407947298399e-06, + "loss": 0.1779, + "step": 22834 + }, + { + "epoch": 9.28629524196828, + "grad_norm": 0.18255298537363712, + "learning_rate": 3.983598614851111e-06, + "loss": 0.0022, + "step": 22835 + }, + { + "epoch": 9.286701911346075, + "grad_norm": 0.014150289503225904, + "learning_rate": 3.9827893441652075e-06, + "loss": 0.0002, + "step": 22836 + }, + { + "epoch": 9.28710858072387, + "grad_norm": 1.6512259153384137, + "learning_rate": 3.981980135249003e-06, + "loss": 0.019, + "step": 22837 + }, + { + "epoch": 9.287515250101666, + "grad_norm": 0.010492207704095114, + 
"learning_rate": 3.9811709881108015e-06, + "loss": 0.0001, + "step": 22838 + }, + { + "epoch": 9.287921919479464, + "grad_norm": 0.010728815229713565, + "learning_rate": 3.9803619027589095e-06, + "loss": 0.0001, + "step": 22839 + }, + { + "epoch": 9.28832858885726, + "grad_norm": 5.582395650326726e-05, + "learning_rate": 3.979552879201631e-06, + "loss": 0.0, + "step": 22840 + }, + { + "epoch": 9.288735258235056, + "grad_norm": 0.037862233918388076, + "learning_rate": 3.978743917447273e-06, + "loss": 0.0005, + "step": 22841 + }, + { + "epoch": 9.289141927612851, + "grad_norm": 1.1330657859775102, + "learning_rate": 3.9779350175041395e-06, + "loss": 0.01, + "step": 22842 + }, + { + "epoch": 9.289548596990647, + "grad_norm": 0.3282369556762507, + "learning_rate": 3.977126179380529e-06, + "loss": 0.0026, + "step": 22843 + }, + { + "epoch": 9.289955266368443, + "grad_norm": 0.005095352876038359, + "learning_rate": 3.976317403084755e-06, + "loss": 0.0001, + "step": 22844 + }, + { + "epoch": 9.290361935746239, + "grad_norm": 0.009653622993092484, + "learning_rate": 3.975508688625117e-06, + "loss": 0.0001, + "step": 22845 + }, + { + "epoch": 9.290768605124034, + "grad_norm": 0.007772202074732622, + "learning_rate": 3.974700036009911e-06, + "loss": 0.0001, + "step": 22846 + }, + { + "epoch": 9.29117527450183, + "grad_norm": 14.924485579408133, + "learning_rate": 3.973891445247441e-06, + "loss": 0.3812, + "step": 22847 + }, + { + "epoch": 9.291581943879626, + "grad_norm": 0.004269865393869871, + "learning_rate": 3.973082916346005e-06, + "loss": 0.0, + "step": 22848 + }, + { + "epoch": 9.291988613257422, + "grad_norm": 0.11522068701370523, + "learning_rate": 3.97227444931391e-06, + "loss": 0.0012, + "step": 22849 + }, + { + "epoch": 9.292395282635217, + "grad_norm": 0.04432584927068702, + "learning_rate": 3.971466044159451e-06, + "loss": 0.0004, + "step": 22850 + }, + { + "epoch": 9.292801952013013, + "grad_norm": 0.05977082813177945, + "learning_rate": 3.970657700890927e-06, + "loss": 0.0006, + "step": 22851 + }, + { + "epoch": 9.293208621390809, + "grad_norm": 0.0197716950198681, + "learning_rate": 3.969849419516637e-06, + "loss": 0.0002, + "step": 22852 + }, + { + "epoch": 9.293615290768605, + "grad_norm": 19.735321922266774, + "learning_rate": 3.9690412000448775e-06, + "loss": 0.1148, + "step": 22853 + }, + { + "epoch": 9.2940219601464, + "grad_norm": 0.012885265913628393, + "learning_rate": 3.968233042483945e-06, + "loss": 0.0001, + "step": 22854 + }, + { + "epoch": 9.294428629524196, + "grad_norm": 0.0017706750186000431, + "learning_rate": 3.967424946842132e-06, + "loss": 0.0, + "step": 22855 + }, + { + "epoch": 9.294835298901992, + "grad_norm": 0.013164779398996566, + "learning_rate": 3.966616913127743e-06, + "loss": 0.0001, + "step": 22856 + }, + { + "epoch": 9.295241968279788, + "grad_norm": 0.12998999770813022, + "learning_rate": 3.965808941349068e-06, + "loss": 0.0011, + "step": 22857 + }, + { + "epoch": 9.295648637657584, + "grad_norm": 0.6099128072628682, + "learning_rate": 3.965001031514399e-06, + "loss": 0.0033, + "step": 22858 + }, + { + "epoch": 9.296055307035381, + "grad_norm": 0.055213530462964976, + "learning_rate": 3.9641931836320336e-06, + "loss": 0.0005, + "step": 22859 + }, + { + "epoch": 9.296461976413177, + "grad_norm": 1.1696213006148701, + "learning_rate": 3.963385397710262e-06, + "loss": 0.0073, + "step": 22860 + }, + { + "epoch": 9.296868645790973, + "grad_norm": 1.2448276122738915, + "learning_rate": 3.962577673757374e-06, + "loss": 0.0093, + "step": 22861 + }, + { + 
"epoch": 9.297275315168768, + "grad_norm": 0.8767244772380701, + "learning_rate": 3.96177001178167e-06, + "loss": 0.0084, + "step": 22862 + }, + { + "epoch": 9.297681984546564, + "grad_norm": 0.006323206397213231, + "learning_rate": 3.960962411791434e-06, + "loss": 0.0001, + "step": 22863 + }, + { + "epoch": 9.29808865392436, + "grad_norm": 0.03296243865395558, + "learning_rate": 3.960154873794958e-06, + "loss": 0.0003, + "step": 22864 + }, + { + "epoch": 9.298495323302156, + "grad_norm": 0.0037612908396540607, + "learning_rate": 3.959347397800533e-06, + "loss": 0.0, + "step": 22865 + }, + { + "epoch": 9.298901992679951, + "grad_norm": 0.0024840806732704087, + "learning_rate": 3.958539983816446e-06, + "loss": 0.0, + "step": 22866 + }, + { + "epoch": 9.299308662057747, + "grad_norm": 0.4061765094673805, + "learning_rate": 3.957732631850988e-06, + "loss": 0.0068, + "step": 22867 + }, + { + "epoch": 9.299715331435543, + "grad_norm": 0.3564861463136281, + "learning_rate": 3.956925341912442e-06, + "loss": 0.0035, + "step": 22868 + }, + { + "epoch": 9.300122000813339, + "grad_norm": 0.025289403570586733, + "learning_rate": 3.956118114009101e-06, + "loss": 0.0001, + "step": 22869 + }, + { + "epoch": 9.300528670191134, + "grad_norm": 0.0382816068222569, + "learning_rate": 3.95531094814925e-06, + "loss": 0.0003, + "step": 22870 + }, + { + "epoch": 9.30093533956893, + "grad_norm": 0.02398645575183389, + "learning_rate": 3.9545038443411755e-06, + "loss": 0.0003, + "step": 22871 + }, + { + "epoch": 9.301342008946726, + "grad_norm": 0.04438398271913903, + "learning_rate": 3.95369680259316e-06, + "loss": 0.0006, + "step": 22872 + }, + { + "epoch": 9.301748678324522, + "grad_norm": 0.038765162685012455, + "learning_rate": 3.952889822913491e-06, + "loss": 0.0005, + "step": 22873 + }, + { + "epoch": 9.302155347702318, + "grad_norm": 0.0033045426806387348, + "learning_rate": 3.952082905310447e-06, + "loss": 0.0, + "step": 22874 + }, + { + "epoch": 9.302562017080113, + "grad_norm": 0.024117477926495302, + "learning_rate": 3.951276049792319e-06, + "loss": 0.0002, + "step": 22875 + }, + { + "epoch": 9.302968686457909, + "grad_norm": 0.20167361097242525, + "learning_rate": 3.950469256367389e-06, + "loss": 0.0022, + "step": 22876 + }, + { + "epoch": 9.303375355835705, + "grad_norm": 0.18687540736808628, + "learning_rate": 3.949662525043935e-06, + "loss": 0.0011, + "step": 22877 + }, + { + "epoch": 9.3037820252135, + "grad_norm": 0.014917799827492842, + "learning_rate": 3.948855855830241e-06, + "loss": 0.0001, + "step": 22878 + }, + { + "epoch": 9.304188694591296, + "grad_norm": 0.09070561396416402, + "learning_rate": 3.948049248734586e-06, + "loss": 0.0007, + "step": 22879 + }, + { + "epoch": 9.304595363969094, + "grad_norm": 0.01117346167313579, + "learning_rate": 3.947242703765252e-06, + "loss": 0.0001, + "step": 22880 + }, + { + "epoch": 9.30500203334689, + "grad_norm": 0.03330240700920875, + "learning_rate": 3.946436220930514e-06, + "loss": 0.0005, + "step": 22881 + }, + { + "epoch": 9.305408702724685, + "grad_norm": 0.002799936096554932, + "learning_rate": 3.945629800238659e-06, + "loss": 0.0, + "step": 22882 + }, + { + "epoch": 9.305815372102481, + "grad_norm": 0.0211501533841486, + "learning_rate": 3.94482344169796e-06, + "loss": 0.0002, + "step": 22883 + }, + { + "epoch": 9.306222041480277, + "grad_norm": 1.2480387890300826, + "learning_rate": 3.944017145316698e-06, + "loss": 0.0149, + "step": 22884 + }, + { + "epoch": 9.306628710858073, + "grad_norm": 0.0257726438701882, + "learning_rate": 
3.943210911103146e-06, + "loss": 0.0002, + "step": 22885 + }, + { + "epoch": 9.307035380235869, + "grad_norm": 0.47616069726575766, + "learning_rate": 3.9424047390655825e-06, + "loss": 0.0044, + "step": 22886 + }, + { + "epoch": 9.307442049613664, + "grad_norm": 7.240015341773562, + "learning_rate": 3.941598629212281e-06, + "loss": 0.1194, + "step": 22887 + }, + { + "epoch": 9.30784871899146, + "grad_norm": 0.19548533770821142, + "learning_rate": 3.9407925815515215e-06, + "loss": 0.0018, + "step": 22888 + }, + { + "epoch": 9.308255388369256, + "grad_norm": 0.04648489615853897, + "learning_rate": 3.939986596091575e-06, + "loss": 0.0006, + "step": 22889 + }, + { + "epoch": 9.308662057747052, + "grad_norm": 0.0020490219348795525, + "learning_rate": 3.939180672840716e-06, + "loss": 0.0, + "step": 22890 + }, + { + "epoch": 9.309068727124847, + "grad_norm": 0.04454674744832821, + "learning_rate": 3.938374811807219e-06, + "loss": 0.0005, + "step": 22891 + }, + { + "epoch": 9.309475396502643, + "grad_norm": 0.28535221998391225, + "learning_rate": 3.937569012999354e-06, + "loss": 0.0028, + "step": 22892 + }, + { + "epoch": 9.309882065880439, + "grad_norm": 0.5783729126130125, + "learning_rate": 3.936763276425395e-06, + "loss": 0.0053, + "step": 22893 + }, + { + "epoch": 9.310288735258235, + "grad_norm": 0.00038140570518689686, + "learning_rate": 3.935957602093608e-06, + "loss": 0.0, + "step": 22894 + }, + { + "epoch": 9.31069540463603, + "grad_norm": 0.18614560843390254, + "learning_rate": 3.935151990012273e-06, + "loss": 0.0018, + "step": 22895 + }, + { + "epoch": 9.311102074013826, + "grad_norm": 0.009454328542604157, + "learning_rate": 3.9343464401896546e-06, + "loss": 0.0002, + "step": 22896 + }, + { + "epoch": 9.311508743391622, + "grad_norm": 0.0002674952359464425, + "learning_rate": 3.933540952634023e-06, + "loss": 0.0, + "step": 22897 + }, + { + "epoch": 9.311915412769418, + "grad_norm": 0.04156290339410683, + "learning_rate": 3.9327355273536476e-06, + "loss": 0.0004, + "step": 22898 + }, + { + "epoch": 9.312322082147213, + "grad_norm": 0.0702455598745207, + "learning_rate": 3.931930164356794e-06, + "loss": 0.0007, + "step": 22899 + }, + { + "epoch": 9.312728751525011, + "grad_norm": 3.4099280180894223, + "learning_rate": 3.931124863651729e-06, + "loss": 0.0707, + "step": 22900 + }, + { + "epoch": 9.313135420902807, + "grad_norm": 0.002111361012683079, + "learning_rate": 3.9303196252467236e-06, + "loss": 0.0, + "step": 22901 + }, + { + "epoch": 9.313542090280603, + "grad_norm": 0.030899195576906287, + "learning_rate": 3.929514449150044e-06, + "loss": 0.0003, + "step": 22902 + }, + { + "epoch": 9.313948759658398, + "grad_norm": 0.13481015617925438, + "learning_rate": 3.9287093353699525e-06, + "loss": 0.0016, + "step": 22903 + }, + { + "epoch": 9.314355429036194, + "grad_norm": 0.04336172625584469, + "learning_rate": 3.927904283914716e-06, + "loss": 0.0004, + "step": 22904 + }, + { + "epoch": 9.31476209841399, + "grad_norm": 0.19985382558916356, + "learning_rate": 3.927099294792597e-06, + "loss": 0.0031, + "step": 22905 + }, + { + "epoch": 9.315168767791786, + "grad_norm": 0.05927996226407709, + "learning_rate": 3.92629436801186e-06, + "loss": 0.0004, + "step": 22906 + }, + { + "epoch": 9.315575437169581, + "grad_norm": 0.4849832624999651, + "learning_rate": 3.925489503580766e-06, + "loss": 0.0036, + "step": 22907 + }, + { + "epoch": 9.315982106547377, + "grad_norm": 0.1297458998697152, + "learning_rate": 3.924684701507582e-06, + "loss": 0.0008, + "step": 22908 + }, + { + "epoch": 
9.316388775925173, + "grad_norm": 0.009338343318855281, + "learning_rate": 3.9238799618005676e-06, + "loss": 0.0001, + "step": 22909 + }, + { + "epoch": 9.316795445302969, + "grad_norm": 0.015356363679906768, + "learning_rate": 3.9230752844679835e-06, + "loss": 0.0001, + "step": 22910 + }, + { + "epoch": 9.317202114680764, + "grad_norm": 5.681012006074655, + "learning_rate": 3.92227066951809e-06, + "loss": 0.0638, + "step": 22911 + }, + { + "epoch": 9.31760878405856, + "grad_norm": 0.01018884591122426, + "learning_rate": 3.921466116959147e-06, + "loss": 0.0001, + "step": 22912 + }, + { + "epoch": 9.318015453436356, + "grad_norm": 0.0030734975228816777, + "learning_rate": 3.92066162679941e-06, + "loss": 0.0, + "step": 22913 + }, + { + "epoch": 9.318422122814152, + "grad_norm": 0.0020297621356881635, + "learning_rate": 3.919857199047145e-06, + "loss": 0.0, + "step": 22914 + }, + { + "epoch": 9.318828792191947, + "grad_norm": 0.001661105473583862, + "learning_rate": 3.9190528337106045e-06, + "loss": 0.0, + "step": 22915 + }, + { + "epoch": 9.319235461569743, + "grad_norm": 0.0052641763874366546, + "learning_rate": 3.918248530798049e-06, + "loss": 0.0, + "step": 22916 + }, + { + "epoch": 9.319642130947539, + "grad_norm": 0.0006709015986567111, + "learning_rate": 3.9174442903177325e-06, + "loss": 0.0, + "step": 22917 + }, + { + "epoch": 9.320048800325335, + "grad_norm": 0.01793357570712934, + "learning_rate": 3.91664011227791e-06, + "loss": 0.0002, + "step": 22918 + }, + { + "epoch": 9.32045546970313, + "grad_norm": 0.13302258193141703, + "learning_rate": 3.915835996686841e-06, + "loss": 0.0013, + "step": 22919 + }, + { + "epoch": 9.320862139080926, + "grad_norm": 0.7912635074489929, + "learning_rate": 3.915031943552773e-06, + "loss": 0.0103, + "step": 22920 + }, + { + "epoch": 9.321268808458724, + "grad_norm": 0.04374654371043292, + "learning_rate": 3.914227952883968e-06, + "loss": 0.0006, + "step": 22921 + }, + { + "epoch": 9.32167547783652, + "grad_norm": 0.18408076688015426, + "learning_rate": 3.9134240246886755e-06, + "loss": 0.0031, + "step": 22922 + }, + { + "epoch": 9.322082147214315, + "grad_norm": 10.175825248520635, + "learning_rate": 3.91262015897515e-06, + "loss": 0.1898, + "step": 22923 + }, + { + "epoch": 9.322488816592111, + "grad_norm": 0.01527145438926035, + "learning_rate": 3.911816355751641e-06, + "loss": 0.0002, + "step": 22924 + }, + { + "epoch": 9.322895485969907, + "grad_norm": 0.22957235997334882, + "learning_rate": 3.911012615026403e-06, + "loss": 0.0022, + "step": 22925 + }, + { + "epoch": 9.323302155347703, + "grad_norm": 0.234339588285315, + "learning_rate": 3.910208936807679e-06, + "loss": 0.0026, + "step": 22926 + }, + { + "epoch": 9.323708824725498, + "grad_norm": 0.25704350276691995, + "learning_rate": 3.9094053211037305e-06, + "loss": 0.0024, + "step": 22927 + }, + { + "epoch": 9.324115494103294, + "grad_norm": 0.012409635079302083, + "learning_rate": 3.908601767922802e-06, + "loss": 0.0001, + "step": 22928 + }, + { + "epoch": 9.32452216348109, + "grad_norm": 0.1699140637926044, + "learning_rate": 3.907798277273141e-06, + "loss": 0.0018, + "step": 22929 + }, + { + "epoch": 9.324928832858886, + "grad_norm": 2.136654111401363, + "learning_rate": 3.906994849162996e-06, + "loss": 0.0214, + "step": 22930 + }, + { + "epoch": 9.325335502236682, + "grad_norm": 0.5797604273643503, + "learning_rate": 3.906191483600618e-06, + "loss": 0.0054, + "step": 22931 + }, + { + "epoch": 9.325742171614477, + "grad_norm": 0.009295661534650846, + "learning_rate": 
3.905388180594249e-06, + "loss": 0.0001, + "step": 22932 + }, + { + "epoch": 9.326148840992273, + "grad_norm": 0.07926222226048478, + "learning_rate": 3.9045849401521354e-06, + "loss": 0.001, + "step": 22933 + }, + { + "epoch": 9.326555510370069, + "grad_norm": 0.44436296862868035, + "learning_rate": 3.903781762282528e-06, + "loss": 0.0048, + "step": 22934 + }, + { + "epoch": 9.326962179747865, + "grad_norm": 0.16024062182564341, + "learning_rate": 3.902978646993669e-06, + "loss": 0.0016, + "step": 22935 + }, + { + "epoch": 9.32736884912566, + "grad_norm": 1.1676759293727919, + "learning_rate": 3.902175594293803e-06, + "loss": 0.0126, + "step": 22936 + }, + { + "epoch": 9.327775518503456, + "grad_norm": 0.0004864235203249693, + "learning_rate": 3.901372604191173e-06, + "loss": 0.0, + "step": 22937 + }, + { + "epoch": 9.328182187881252, + "grad_norm": 0.0029257584252909506, + "learning_rate": 3.900569676694024e-06, + "loss": 0.0, + "step": 22938 + }, + { + "epoch": 9.328588857259048, + "grad_norm": 0.035786080333973305, + "learning_rate": 3.899766811810592e-06, + "loss": 0.0001, + "step": 22939 + }, + { + "epoch": 9.328995526636843, + "grad_norm": 0.0038948159103302047, + "learning_rate": 3.8989640095491286e-06, + "loss": 0.0, + "step": 22940 + }, + { + "epoch": 9.329402196014641, + "grad_norm": 0.3293057702756284, + "learning_rate": 3.8981612699178705e-06, + "loss": 0.0026, + "step": 22941 + }, + { + "epoch": 9.329808865392437, + "grad_norm": 8.92530920999822, + "learning_rate": 3.897358592925056e-06, + "loss": 0.369, + "step": 22942 + }, + { + "epoch": 9.330215534770232, + "grad_norm": 0.012581741898006713, + "learning_rate": 3.896555978578929e-06, + "loss": 0.0001, + "step": 22943 + }, + { + "epoch": 9.330622204148028, + "grad_norm": 0.01183073180091896, + "learning_rate": 3.895753426887726e-06, + "loss": 0.0002, + "step": 22944 + }, + { + "epoch": 9.331028873525824, + "grad_norm": 0.010290735617187453, + "learning_rate": 3.894950937859686e-06, + "loss": 0.0001, + "step": 22945 + }, + { + "epoch": 9.33143554290362, + "grad_norm": 0.06998784972297045, + "learning_rate": 3.894148511503046e-06, + "loss": 0.0007, + "step": 22946 + }, + { + "epoch": 9.331842212281416, + "grad_norm": 2.6679896581453275, + "learning_rate": 3.893346147826047e-06, + "loss": 0.0285, + "step": 22947 + }, + { + "epoch": 9.332248881659211, + "grad_norm": 1.7446255667185002, + "learning_rate": 3.8925438468369215e-06, + "loss": 0.0115, + "step": 22948 + }, + { + "epoch": 9.332655551037007, + "grad_norm": 0.14335442011415256, + "learning_rate": 3.891741608543908e-06, + "loss": 0.0012, + "step": 22949 + }, + { + "epoch": 9.333062220414803, + "grad_norm": 0.07379930635307952, + "learning_rate": 3.890939432955241e-06, + "loss": 0.0007, + "step": 22950 + }, + { + "epoch": 9.333468889792599, + "grad_norm": 1.1536256047091307, + "learning_rate": 3.890137320079151e-06, + "loss": 0.0142, + "step": 22951 + }, + { + "epoch": 9.333875559170394, + "grad_norm": 7.38467538330019, + "learning_rate": 3.889335269923882e-06, + "loss": 0.0762, + "step": 22952 + }, + { + "epoch": 9.33428222854819, + "grad_norm": 0.022677013426360478, + "learning_rate": 3.888533282497659e-06, + "loss": 0.0003, + "step": 22953 + }, + { + "epoch": 9.334688897925986, + "grad_norm": 0.4210352694804297, + "learning_rate": 3.88773135780872e-06, + "loss": 0.0058, + "step": 22954 + }, + { + "epoch": 9.335095567303782, + "grad_norm": 0.027498767470541766, + "learning_rate": 3.886929495865294e-06, + "loss": 0.0001, + "step": 22955 + }, + { + "epoch": 
9.335502236681577, + "grad_norm": 0.6150918056000507, + "learning_rate": 3.886127696675613e-06, + "loss": 0.0072, + "step": 22956 + }, + { + "epoch": 9.335908906059373, + "grad_norm": 0.005464376350436822, + "learning_rate": 3.885325960247909e-06, + "loss": 0.0001, + "step": 22957 + }, + { + "epoch": 9.336315575437169, + "grad_norm": 0.004537244688267936, + "learning_rate": 3.884524286590407e-06, + "loss": 0.0001, + "step": 22958 + }, + { + "epoch": 9.336722244814965, + "grad_norm": 0.9022848026932979, + "learning_rate": 3.883722675711347e-06, + "loss": 0.0096, + "step": 22959 + }, + { + "epoch": 9.33712891419276, + "grad_norm": 0.16511974963366316, + "learning_rate": 3.88292112761895e-06, + "loss": 0.002, + "step": 22960 + }, + { + "epoch": 9.337535583570556, + "grad_norm": 1.824975501571302, + "learning_rate": 3.882119642321446e-06, + "loss": 0.0126, + "step": 22961 + }, + { + "epoch": 9.337942252948354, + "grad_norm": 0.24433592618967892, + "learning_rate": 3.881318219827064e-06, + "loss": 0.0026, + "step": 22962 + }, + { + "epoch": 9.33834892232615, + "grad_norm": 0.42827259508956766, + "learning_rate": 3.880516860144029e-06, + "loss": 0.0046, + "step": 22963 + }, + { + "epoch": 9.338755591703945, + "grad_norm": 0.0001493510603086721, + "learning_rate": 3.879715563280565e-06, + "loss": 0.0, + "step": 22964 + }, + { + "epoch": 9.339162261081741, + "grad_norm": 0.16727141677644372, + "learning_rate": 3.878914329244905e-06, + "loss": 0.0012, + "step": 22965 + }, + { + "epoch": 9.339568930459537, + "grad_norm": 0.01854942244601743, + "learning_rate": 3.8781131580452695e-06, + "loss": 0.0001, + "step": 22966 + }, + { + "epoch": 9.339975599837333, + "grad_norm": 0.236621276795289, + "learning_rate": 3.877312049689884e-06, + "loss": 0.0024, + "step": 22967 + }, + { + "epoch": 9.340382269215128, + "grad_norm": 0.06434601954103424, + "learning_rate": 3.876511004186971e-06, + "loss": 0.0006, + "step": 22968 + }, + { + "epoch": 9.340788938592924, + "grad_norm": 2.5588673879355257, + "learning_rate": 3.8757100215447555e-06, + "loss": 0.0256, + "step": 22969 + }, + { + "epoch": 9.34119560797072, + "grad_norm": 0.0896300607920971, + "learning_rate": 3.874909101771458e-06, + "loss": 0.0011, + "step": 22970 + }, + { + "epoch": 9.341602277348516, + "grad_norm": 0.000359482058832506, + "learning_rate": 3.874108244875299e-06, + "loss": 0.0, + "step": 22971 + }, + { + "epoch": 9.342008946726311, + "grad_norm": 0.2297213819324922, + "learning_rate": 3.873307450864504e-06, + "loss": 0.0019, + "step": 22972 + }, + { + "epoch": 9.342415616104107, + "grad_norm": 0.04609315787787823, + "learning_rate": 3.872506719747292e-06, + "loss": 0.0004, + "step": 22973 + }, + { + "epoch": 9.342822285481903, + "grad_norm": 0.6969528605389315, + "learning_rate": 3.871706051531883e-06, + "loss": 0.0074, + "step": 22974 + }, + { + "epoch": 9.343228954859699, + "grad_norm": 0.22404294742934844, + "learning_rate": 3.870905446226495e-06, + "loss": 0.0026, + "step": 22975 + }, + { + "epoch": 9.343635624237494, + "grad_norm": 0.004342432342105694, + "learning_rate": 3.870104903839348e-06, + "loss": 0.0001, + "step": 22976 + }, + { + "epoch": 9.34404229361529, + "grad_norm": 0.5745125223047504, + "learning_rate": 3.8693044243786546e-06, + "loss": 0.007, + "step": 22977 + }, + { + "epoch": 9.344448962993086, + "grad_norm": 0.003174192628531029, + "learning_rate": 3.868504007852641e-06, + "loss": 0.0, + "step": 22978 + }, + { + "epoch": 9.344855632370882, + "grad_norm": 1.041410677227306, + "learning_rate": 3.86770365426952e-06, + 
"loss": 0.0155, + "step": 22979 + }, + { + "epoch": 9.345262301748678, + "grad_norm": 0.015022174020541249, + "learning_rate": 3.866903363637507e-06, + "loss": 0.0002, + "step": 22980 + }, + { + "epoch": 9.345668971126473, + "grad_norm": 0.2337271430578153, + "learning_rate": 3.866103135964818e-06, + "loss": 0.0035, + "step": 22981 + }, + { + "epoch": 9.34607564050427, + "grad_norm": 0.011133441901086498, + "learning_rate": 3.865302971259667e-06, + "loss": 0.0001, + "step": 22982 + }, + { + "epoch": 9.346482309882067, + "grad_norm": 0.024735649395148242, + "learning_rate": 3.864502869530269e-06, + "loss": 0.0003, + "step": 22983 + }, + { + "epoch": 9.346888979259862, + "grad_norm": 0.022774154467020673, + "learning_rate": 3.863702830784833e-06, + "loss": 0.0003, + "step": 22984 + }, + { + "epoch": 9.347295648637658, + "grad_norm": 0.8788011596025914, + "learning_rate": 3.86290285503158e-06, + "loss": 0.0065, + "step": 22985 + }, + { + "epoch": 9.347702318015454, + "grad_norm": 0.1013586531330272, + "learning_rate": 3.862102942278717e-06, + "loss": 0.001, + "step": 22986 + }, + { + "epoch": 9.34810898739325, + "grad_norm": 0.00020821518020721236, + "learning_rate": 3.861303092534458e-06, + "loss": 0.0, + "step": 22987 + }, + { + "epoch": 9.348515656771045, + "grad_norm": 0.040950597864619215, + "learning_rate": 3.86050330580701e-06, + "loss": 0.0006, + "step": 22988 + }, + { + "epoch": 9.348922326148841, + "grad_norm": 0.0010184971487824253, + "learning_rate": 3.859703582104587e-06, + "loss": 0.0, + "step": 22989 + }, + { + "epoch": 9.349328995526637, + "grad_norm": 3.746689520722897, + "learning_rate": 3.858903921435393e-06, + "loss": 0.0458, + "step": 22990 + }, + { + "epoch": 9.349735664904433, + "grad_norm": 0.02922369771339604, + "learning_rate": 3.858104323807644e-06, + "loss": 0.0002, + "step": 22991 + }, + { + "epoch": 9.350142334282229, + "grad_norm": 0.3823935586981923, + "learning_rate": 3.8573047892295466e-06, + "loss": 0.0043, + "step": 22992 + }, + { + "epoch": 9.350549003660024, + "grad_norm": 0.0687307690633302, + "learning_rate": 3.856505317709307e-06, + "loss": 0.0005, + "step": 22993 + }, + { + "epoch": 9.35095567303782, + "grad_norm": 0.17290133599599397, + "learning_rate": 3.855705909255131e-06, + "loss": 0.0022, + "step": 22994 + }, + { + "epoch": 9.351362342415616, + "grad_norm": 3.8495290105280318, + "learning_rate": 3.854906563875227e-06, + "loss": 0.0316, + "step": 22995 + }, + { + "epoch": 9.351769011793412, + "grad_norm": 0.04731012546475265, + "learning_rate": 3.8541072815778e-06, + "loss": 0.0004, + "step": 22996 + }, + { + "epoch": 9.352175681171207, + "grad_norm": 0.14337441243732948, + "learning_rate": 3.853308062371052e-06, + "loss": 0.0007, + "step": 22997 + }, + { + "epoch": 9.352582350549003, + "grad_norm": 0.026255466285507675, + "learning_rate": 3.8525089062631925e-06, + "loss": 0.0003, + "step": 22998 + }, + { + "epoch": 9.352989019926799, + "grad_norm": 4.680874398101032, + "learning_rate": 3.851709813262423e-06, + "loss": 0.0905, + "step": 22999 + }, + { + "epoch": 9.353395689304595, + "grad_norm": 0.0026238296822027507, + "learning_rate": 3.850910783376948e-06, + "loss": 0.0, + "step": 23000 + }, + { + "epoch": 9.35380235868239, + "grad_norm": 4.231637549551071, + "learning_rate": 3.850111816614967e-06, + "loss": 0.0695, + "step": 23001 + }, + { + "epoch": 9.354209028060186, + "grad_norm": 0.0434545698752392, + "learning_rate": 3.849312912984685e-06, + "loss": 0.0007, + "step": 23002 + }, + { + "epoch": 9.354615697437984, + "grad_norm": 
0.1617722770742924, + "learning_rate": 3.848514072494297e-06, + "loss": 0.0011, + "step": 23003 + }, + { + "epoch": 9.35502236681578, + "grad_norm": 0.6807440096539289, + "learning_rate": 3.847715295152012e-06, + "loss": 0.0035, + "step": 23004 + }, + { + "epoch": 9.355429036193575, + "grad_norm": 0.06268306269137564, + "learning_rate": 3.846916580966025e-06, + "loss": 0.0007, + "step": 23005 + }, + { + "epoch": 9.355835705571371, + "grad_norm": 2.7354616240394805, + "learning_rate": 3.846117929944537e-06, + "loss": 0.0109, + "step": 23006 + }, + { + "epoch": 9.356242374949167, + "grad_norm": 0.008032556077932166, + "learning_rate": 3.8453193420957456e-06, + "loss": 0.0001, + "step": 23007 + }, + { + "epoch": 9.356649044326963, + "grad_norm": 0.1256758748263845, + "learning_rate": 3.844520817427848e-06, + "loss": 0.0011, + "step": 23008 + }, + { + "epoch": 9.357055713704758, + "grad_norm": 1.8089673478279267, + "learning_rate": 3.843722355949044e-06, + "loss": 0.0204, + "step": 23009 + }, + { + "epoch": 9.357462383082554, + "grad_norm": 0.0026244244130824547, + "learning_rate": 3.842923957667523e-06, + "loss": 0.0, + "step": 23010 + }, + { + "epoch": 9.35786905246035, + "grad_norm": 0.008070857881093577, + "learning_rate": 3.842125622591489e-06, + "loss": 0.0001, + "step": 23011 + }, + { + "epoch": 9.358275721838146, + "grad_norm": 0.13209725591562363, + "learning_rate": 3.841327350729137e-06, + "loss": 0.0015, + "step": 23012 + }, + { + "epoch": 9.358682391215941, + "grad_norm": 0.0033376277923630364, + "learning_rate": 3.840529142088658e-06, + "loss": 0.0, + "step": 23013 + }, + { + "epoch": 9.359089060593737, + "grad_norm": 0.0170834511653069, + "learning_rate": 3.839730996678248e-06, + "loss": 0.0002, + "step": 23014 + }, + { + "epoch": 9.359495729971533, + "grad_norm": 0.0017640423133653586, + "learning_rate": 3.838932914506099e-06, + "loss": 0.0, + "step": 23015 + }, + { + "epoch": 9.359902399349329, + "grad_norm": 1.312412798390551, + "learning_rate": 3.838134895580402e-06, + "loss": 0.017, + "step": 23016 + }, + { + "epoch": 9.360309068727124, + "grad_norm": 0.0412483759920674, + "learning_rate": 3.837336939909354e-06, + "loss": 0.0003, + "step": 23017 + }, + { + "epoch": 9.36071573810492, + "grad_norm": 0.00559347639802885, + "learning_rate": 3.836539047501146e-06, + "loss": 0.0001, + "step": 23018 + }, + { + "epoch": 9.361122407482716, + "grad_norm": 0.14859142892787267, + "learning_rate": 3.835741218363964e-06, + "loss": 0.002, + "step": 23019 + }, + { + "epoch": 9.361529076860512, + "grad_norm": 0.26009684418911244, + "learning_rate": 3.834943452506001e-06, + "loss": 0.0023, + "step": 23020 + }, + { + "epoch": 9.361935746238307, + "grad_norm": 2.626158351748474, + "learning_rate": 3.834145749935448e-06, + "loss": 0.0639, + "step": 23021 + }, + { + "epoch": 9.362342415616103, + "grad_norm": 0.023359349793608538, + "learning_rate": 3.8333481106604905e-06, + "loss": 0.0002, + "step": 23022 + }, + { + "epoch": 9.3627490849939, + "grad_norm": 0.003049419492872188, + "learning_rate": 3.832550534689317e-06, + "loss": 0.0, + "step": 23023 + }, + { + "epoch": 9.363155754371697, + "grad_norm": 0.014277564323101023, + "learning_rate": 3.831753022030117e-06, + "loss": 0.0002, + "step": 23024 + }, + { + "epoch": 9.363562423749492, + "grad_norm": 0.0007063703099352698, + "learning_rate": 3.8309555726910785e-06, + "loss": 0.0, + "step": 23025 + }, + { + "epoch": 9.363969093127288, + "grad_norm": 1.1491362890156473, + "learning_rate": 3.830158186680385e-06, + "loss": 0.0096, + "step": 23026 + 
}, + { + "epoch": 9.364375762505084, + "grad_norm": 0.15890975124930629, + "learning_rate": 3.829360864006223e-06, + "loss": 0.0013, + "step": 23027 + }, + { + "epoch": 9.36478243188288, + "grad_norm": 0.061429605904246745, + "learning_rate": 3.828563604676778e-06, + "loss": 0.0005, + "step": 23028 + }, + { + "epoch": 9.365189101260675, + "grad_norm": 0.047101668070885304, + "learning_rate": 3.8277664087002296e-06, + "loss": 0.0006, + "step": 23029 + }, + { + "epoch": 9.365595770638471, + "grad_norm": 0.750792551191283, + "learning_rate": 3.826969276084768e-06, + "loss": 0.0041, + "step": 23030 + }, + { + "epoch": 9.366002440016267, + "grad_norm": 0.021247580939195248, + "learning_rate": 3.826172206838575e-06, + "loss": 0.0002, + "step": 23031 + }, + { + "epoch": 9.366409109394063, + "grad_norm": 0.01653179455258305, + "learning_rate": 3.825375200969832e-06, + "loss": 0.0002, + "step": 23032 + }, + { + "epoch": 9.366815778771858, + "grad_norm": 0.008129156723793707, + "learning_rate": 3.824578258486719e-06, + "loss": 0.0, + "step": 23033 + }, + { + "epoch": 9.367222448149654, + "grad_norm": 6.159010945771678, + "learning_rate": 3.823781379397418e-06, + "loss": 0.083, + "step": 23034 + }, + { + "epoch": 9.36762911752745, + "grad_norm": 0.9793335857755302, + "learning_rate": 3.822984563710111e-06, + "loss": 0.0059, + "step": 23035 + }, + { + "epoch": 9.368035786905246, + "grad_norm": 5.490340042183051, + "learning_rate": 3.8221878114329714e-06, + "loss": 0.0785, + "step": 23036 + }, + { + "epoch": 9.368442456283042, + "grad_norm": 0.7265946007532723, + "learning_rate": 3.821391122574186e-06, + "loss": 0.0097, + "step": 23037 + }, + { + "epoch": 9.368849125660837, + "grad_norm": 3.5831605974073404, + "learning_rate": 3.8205944971419315e-06, + "loss": 0.054, + "step": 23038 + }, + { + "epoch": 9.369255795038633, + "grad_norm": 0.11817537745057399, + "learning_rate": 3.8197979351443845e-06, + "loss": 0.0013, + "step": 23039 + }, + { + "epoch": 9.369662464416429, + "grad_norm": 0.01293398100021064, + "learning_rate": 3.819001436589723e-06, + "loss": 0.0001, + "step": 23040 + }, + { + "epoch": 9.370069133794225, + "grad_norm": 0.0005992976978425969, + "learning_rate": 3.8182050014861215e-06, + "loss": 0.0, + "step": 23041 + }, + { + "epoch": 9.37047580317202, + "grad_norm": 0.07659536953473922, + "learning_rate": 3.817408629841753e-06, + "loss": 0.0008, + "step": 23042 + }, + { + "epoch": 9.370882472549816, + "grad_norm": 0.038752251881723, + "learning_rate": 3.816612321664801e-06, + "loss": 0.0005, + "step": 23043 + }, + { + "epoch": 9.371289141927614, + "grad_norm": 0.03698781534689189, + "learning_rate": 3.815816076963439e-06, + "loss": 0.0004, + "step": 23044 + }, + { + "epoch": 9.37169581130541, + "grad_norm": 0.1978791063331536, + "learning_rate": 3.815019895745833e-06, + "loss": 0.0015, + "step": 23045 + }, + { + "epoch": 9.372102480683205, + "grad_norm": 0.01704336614210296, + "learning_rate": 3.814223778020162e-06, + "loss": 0.0002, + "step": 23046 + }, + { + "epoch": 9.372509150061001, + "grad_norm": 0.0009933103035493355, + "learning_rate": 3.813427723794596e-06, + "loss": 0.0, + "step": 23047 + }, + { + "epoch": 9.372915819438797, + "grad_norm": 0.1882795739600891, + "learning_rate": 3.8126317330773044e-06, + "loss": 0.0018, + "step": 23048 + }, + { + "epoch": 9.373322488816592, + "grad_norm": 0.1233641266191037, + "learning_rate": 3.811835805876466e-06, + "loss": 0.0005, + "step": 23049 + }, + { + "epoch": 9.373729158194388, + "grad_norm": 0.3553597170071443, + "learning_rate": 
3.8110399422002473e-06, + "loss": 0.0037, + "step": 23050 + }, + { + "epoch": 9.374135827572184, + "grad_norm": 0.00234746004672828, + "learning_rate": 3.810244142056819e-06, + "loss": 0.0, + "step": 23051 + }, + { + "epoch": 9.37454249694998, + "grad_norm": 5.797295989067953, + "learning_rate": 3.8094484054543502e-06, + "loss": 0.1134, + "step": 23052 + }, + { + "epoch": 9.374949166327776, + "grad_norm": 0.01135003343876354, + "learning_rate": 3.808652732401008e-06, + "loss": 0.0001, + "step": 23053 + }, + { + "epoch": 9.375355835705571, + "grad_norm": 0.05235216848161884, + "learning_rate": 3.8078571229049587e-06, + "loss": 0.0005, + "step": 23054 + }, + { + "epoch": 9.375762505083367, + "grad_norm": 0.10550107490064878, + "learning_rate": 3.8070615769743758e-06, + "loss": 0.0007, + "step": 23055 + }, + { + "epoch": 9.376169174461163, + "grad_norm": 0.0028222887364984504, + "learning_rate": 3.8062660946174226e-06, + "loss": 0.0, + "step": 23056 + }, + { + "epoch": 9.376575843838959, + "grad_norm": 0.004422177839390381, + "learning_rate": 3.8054706758422653e-06, + "loss": 0.0, + "step": 23057 + }, + { + "epoch": 9.376982513216754, + "grad_norm": 0.07675437707534187, + "learning_rate": 3.8046753206570685e-06, + "loss": 0.001, + "step": 23058 + }, + { + "epoch": 9.37738918259455, + "grad_norm": 0.025867382309873337, + "learning_rate": 3.8038800290699984e-06, + "loss": 0.0003, + "step": 23059 + }, + { + "epoch": 9.377795851972346, + "grad_norm": 0.001349166035569871, + "learning_rate": 3.8030848010892187e-06, + "loss": 0.0, + "step": 23060 + }, + { + "epoch": 9.378202521350142, + "grad_norm": 0.020128433145142413, + "learning_rate": 3.8022896367228877e-06, + "loss": 0.0002, + "step": 23061 + }, + { + "epoch": 9.378609190727937, + "grad_norm": 3.1066968450851786, + "learning_rate": 3.8014945359791766e-06, + "loss": 0.0352, + "step": 23062 + }, + { + "epoch": 9.379015860105733, + "grad_norm": 0.0015994866841007411, + "learning_rate": 3.8006994988662436e-06, + "loss": 0.0, + "step": 23063 + }, + { + "epoch": 9.37942252948353, + "grad_norm": 1.3824424585285127, + "learning_rate": 3.799904525392251e-06, + "loss": 0.0113, + "step": 23064 + }, + { + "epoch": 9.379829198861326, + "grad_norm": 0.0478381387858242, + "learning_rate": 3.7991096155653585e-06, + "loss": 0.0004, + "step": 23065 + }, + { + "epoch": 9.380235868239122, + "grad_norm": 0.5861232131459857, + "learning_rate": 3.798314769393726e-06, + "loss": 0.0033, + "step": 23066 + }, + { + "epoch": 9.380642537616918, + "grad_norm": 0.0057715544713120015, + "learning_rate": 3.7975199868855106e-06, + "loss": 0.0001, + "step": 23067 + }, + { + "epoch": 9.381049206994714, + "grad_norm": 0.007523070143170606, + "learning_rate": 3.7967252680488786e-06, + "loss": 0.0001, + "step": 23068 + }, + { + "epoch": 9.38145587637251, + "grad_norm": 0.032063582589445826, + "learning_rate": 3.7959306128919825e-06, + "loss": 0.0003, + "step": 23069 + }, + { + "epoch": 9.381862545750305, + "grad_norm": 0.01079037199002875, + "learning_rate": 3.7951360214229816e-06, + "loss": 0.0002, + "step": 23070 + }, + { + "epoch": 9.382269215128101, + "grad_norm": 9.282039062917963, + "learning_rate": 3.794341493650032e-06, + "loss": 0.1058, + "step": 23071 + }, + { + "epoch": 9.382675884505897, + "grad_norm": 0.016774505809904888, + "learning_rate": 3.7935470295812916e-06, + "loss": 0.0002, + "step": 23072 + }, + { + "epoch": 9.383082553883693, + "grad_norm": 0.0013217470235168446, + "learning_rate": 3.792752629224913e-06, + "loss": 0.0, + "step": 23073 + }, + { + "epoch": 
9.383489223261488, + "grad_norm": 0.02445127747850559, + "learning_rate": 3.7919582925890498e-06, + "loss": 0.0002, + "step": 23074 + }, + { + "epoch": 9.383895892639284, + "grad_norm": 0.0005031638284721154, + "learning_rate": 3.791164019681862e-06, + "loss": 0.0, + "step": 23075 + }, + { + "epoch": 9.38430256201708, + "grad_norm": 0.17267000254706788, + "learning_rate": 3.7903698105115017e-06, + "loss": 0.0013, + "step": 23076 + }, + { + "epoch": 9.384709231394876, + "grad_norm": 0.007444046887162321, + "learning_rate": 3.7895756650861193e-06, + "loss": 0.0001, + "step": 23077 + }, + { + "epoch": 9.385115900772671, + "grad_norm": 0.0010090705779679237, + "learning_rate": 3.7887815834138686e-06, + "loss": 0.0, + "step": 23078 + }, + { + "epoch": 9.385522570150467, + "grad_norm": 0.0025764227907516144, + "learning_rate": 3.7879875655029018e-06, + "loss": 0.0, + "step": 23079 + }, + { + "epoch": 9.385929239528263, + "grad_norm": 0.1706031014047761, + "learning_rate": 3.787193611361365e-06, + "loss": 0.0008, + "step": 23080 + }, + { + "epoch": 9.386335908906059, + "grad_norm": 0.03385457972996176, + "learning_rate": 3.7863997209974157e-06, + "loss": 0.0002, + "step": 23081 + }, + { + "epoch": 9.386742578283854, + "grad_norm": 0.3567551277664979, + "learning_rate": 3.785605894419201e-06, + "loss": 0.0023, + "step": 23082 + }, + { + "epoch": 9.38714924766165, + "grad_norm": 0.00905118314707304, + "learning_rate": 3.7848121316348696e-06, + "loss": 0.0001, + "step": 23083 + }, + { + "epoch": 9.387555917039446, + "grad_norm": 0.00021205053501640857, + "learning_rate": 3.7840184326525686e-06, + "loss": 0.0, + "step": 23084 + }, + { + "epoch": 9.387962586417244, + "grad_norm": 0.021963931852911445, + "learning_rate": 3.783224797480448e-06, + "loss": 0.0003, + "step": 23085 + }, + { + "epoch": 9.38836925579504, + "grad_norm": 0.10404394199134313, + "learning_rate": 3.782431226126654e-06, + "loss": 0.0016, + "step": 23086 + }, + { + "epoch": 9.388775925172835, + "grad_norm": 0.3120297825153832, + "learning_rate": 3.781637718599328e-06, + "loss": 0.0026, + "step": 23087 + }, + { + "epoch": 9.38918259455063, + "grad_norm": 0.010330542761234152, + "learning_rate": 3.7808442749066244e-06, + "loss": 0.0001, + "step": 23088 + }, + { + "epoch": 9.389589263928427, + "grad_norm": 0.00887626112566024, + "learning_rate": 3.780050895056685e-06, + "loss": 0.0001, + "step": 23089 + }, + { + "epoch": 9.389995933306222, + "grad_norm": 0.0020880670888365586, + "learning_rate": 3.7792575790576524e-06, + "loss": 0.0, + "step": 23090 + }, + { + "epoch": 9.390402602684018, + "grad_norm": 0.0012430320790097226, + "learning_rate": 3.7784643269176724e-06, + "loss": 0.0, + "step": 23091 + }, + { + "epoch": 9.390809272061814, + "grad_norm": 0.0032620198662818077, + "learning_rate": 3.7776711386448872e-06, + "loss": 0.0, + "step": 23092 + }, + { + "epoch": 9.39121594143961, + "grad_norm": 0.024062030235857106, + "learning_rate": 3.776878014247436e-06, + "loss": 0.0002, + "step": 23093 + }, + { + "epoch": 9.391622610817405, + "grad_norm": 0.03811903040376168, + "learning_rate": 3.7760849537334675e-06, + "loss": 0.0004, + "step": 23094 + }, + { + "epoch": 9.392029280195201, + "grad_norm": 0.08643900027355747, + "learning_rate": 3.775291957111119e-06, + "loss": 0.0007, + "step": 23095 + }, + { + "epoch": 9.392435949572997, + "grad_norm": 0.010878071230557236, + "learning_rate": 3.7744990243885313e-06, + "loss": 0.0001, + "step": 23096 + }, + { + "epoch": 9.392842618950793, + "grad_norm": 0.07234800980019782, + "learning_rate": 
3.7737061555738453e-06, + "loss": 0.0011, + "step": 23097 + }, + { + "epoch": 9.393249288328589, + "grad_norm": 0.011577018528156388, + "learning_rate": 3.772913350675198e-06, + "loss": 0.0001, + "step": 23098 + }, + { + "epoch": 9.393655957706384, + "grad_norm": 0.002110455954703133, + "learning_rate": 3.77212060970073e-06, + "loss": 0.0, + "step": 23099 + }, + { + "epoch": 9.39406262708418, + "grad_norm": 0.0015588981777629961, + "learning_rate": 3.771327932658575e-06, + "loss": 0.0, + "step": 23100 + }, + { + "epoch": 9.394469296461976, + "grad_norm": 0.020211265942613837, + "learning_rate": 3.770535319556877e-06, + "loss": 0.0003, + "step": 23101 + }, + { + "epoch": 9.394875965839772, + "grad_norm": 0.014102916021116907, + "learning_rate": 3.769742770403768e-06, + "loss": 0.0001, + "step": 23102 + }, + { + "epoch": 9.395282635217567, + "grad_norm": 0.2690734960628065, + "learning_rate": 3.7689502852073865e-06, + "loss": 0.0008, + "step": 23103 + }, + { + "epoch": 9.395689304595363, + "grad_norm": 4.212177655508086e-05, + "learning_rate": 3.7681578639758666e-06, + "loss": 0.0, + "step": 23104 + }, + { + "epoch": 9.39609597397316, + "grad_norm": 1.3081303391578887, + "learning_rate": 3.767365506717341e-06, + "loss": 0.008, + "step": 23105 + }, + { + "epoch": 9.396502643350956, + "grad_norm": 0.006458057146945557, + "learning_rate": 3.7665732134399436e-06, + "loss": 0.0001, + "step": 23106 + }, + { + "epoch": 9.396909312728752, + "grad_norm": 0.003921394433236015, + "learning_rate": 3.7657809841518112e-06, + "loss": 0.0, + "step": 23107 + }, + { + "epoch": 9.397315982106548, + "grad_norm": 0.07713999320763613, + "learning_rate": 3.764988818861076e-06, + "loss": 0.0006, + "step": 23108 + }, + { + "epoch": 9.397722651484344, + "grad_norm": 0.0005378065420316892, + "learning_rate": 3.7641967175758674e-06, + "loss": 0.0, + "step": 23109 + }, + { + "epoch": 9.39812932086214, + "grad_norm": 1.3011675134429392, + "learning_rate": 3.763404680304319e-06, + "loss": 0.0124, + "step": 23110 + }, + { + "epoch": 9.398535990239935, + "grad_norm": 0.06487269584347698, + "learning_rate": 3.7626127070545593e-06, + "loss": 0.0009, + "step": 23111 + }, + { + "epoch": 9.398942659617731, + "grad_norm": 1.1126855935828097, + "learning_rate": 3.7618207978347197e-06, + "loss": 0.0081, + "step": 23112 + }, + { + "epoch": 9.399349328995527, + "grad_norm": 0.011668389783889191, + "learning_rate": 3.7610289526529265e-06, + "loss": 0.0001, + "step": 23113 + }, + { + "epoch": 9.399755998373323, + "grad_norm": 0.031158430525530505, + "learning_rate": 3.7602371715173146e-06, + "loss": 0.0003, + "step": 23114 + }, + { + "epoch": 9.400162667751118, + "grad_norm": 0.021352884357593416, + "learning_rate": 3.759445454436008e-06, + "loss": 0.0001, + "step": 23115 + }, + { + "epoch": 9.400569337128914, + "grad_norm": 0.04520691284457793, + "learning_rate": 3.7586538014171346e-06, + "loss": 0.0005, + "step": 23116 + }, + { + "epoch": 9.40097600650671, + "grad_norm": 0.055712194120202924, + "learning_rate": 3.7578622124688215e-06, + "loss": 0.0007, + "step": 23117 + }, + { + "epoch": 9.401382675884506, + "grad_norm": 0.00945797736089275, + "learning_rate": 3.7570706875991937e-06, + "loss": 0.0001, + "step": 23118 + }, + { + "epoch": 9.401789345262301, + "grad_norm": 0.32559996978659206, + "learning_rate": 3.756279226816374e-06, + "loss": 0.0046, + "step": 23119 + }, + { + "epoch": 9.402196014640097, + "grad_norm": 2.013888673056833, + "learning_rate": 3.7554878301284936e-06, + "loss": 0.0225, + "step": 23120 + }, + { + "epoch": 
9.402602684017893, + "grad_norm": 0.027548821935523892, + "learning_rate": 3.7546964975436737e-06, + "loss": 0.0002, + "step": 23121 + }, + { + "epoch": 9.403009353395689, + "grad_norm": 0.12480702235654172, + "learning_rate": 3.7539052290700386e-06, + "loss": 0.0012, + "step": 23122 + }, + { + "epoch": 9.403416022773484, + "grad_norm": 0.0005715914851709638, + "learning_rate": 3.7531140247157083e-06, + "loss": 0.0, + "step": 23123 + }, + { + "epoch": 9.40382269215128, + "grad_norm": 0.006509858204387818, + "learning_rate": 3.752322884488807e-06, + "loss": 0.0001, + "step": 23124 + }, + { + "epoch": 9.404229361529076, + "grad_norm": 0.0008334728044785834, + "learning_rate": 3.751531808397456e-06, + "loss": 0.0, + "step": 23125 + }, + { + "epoch": 9.404636030906874, + "grad_norm": 0.02604110191441596, + "learning_rate": 3.750740796449772e-06, + "loss": 0.0003, + "step": 23126 + }, + { + "epoch": 9.40504270028467, + "grad_norm": 0.09987064331410057, + "learning_rate": 3.749949848653883e-06, + "loss": 0.0011, + "step": 23127 + }, + { + "epoch": 9.405449369662465, + "grad_norm": 0.24522970112370013, + "learning_rate": 3.749158965017904e-06, + "loss": 0.002, + "step": 23128 + }, + { + "epoch": 9.40585603904026, + "grad_norm": 0.034322006473435605, + "learning_rate": 3.7483681455499543e-06, + "loss": 0.0005, + "step": 23129 + }, + { + "epoch": 9.406262708418057, + "grad_norm": 0.001484301791823438, + "learning_rate": 3.7475773902581513e-06, + "loss": 0.0, + "step": 23130 + }, + { + "epoch": 9.406669377795852, + "grad_norm": 5.465416719735961, + "learning_rate": 3.7467866991506143e-06, + "loss": 0.0879, + "step": 23131 + }, + { + "epoch": 9.407076047173648, + "grad_norm": 0.0023313814875715873, + "learning_rate": 3.7459960722354548e-06, + "loss": 0.0, + "step": 23132 + }, + { + "epoch": 9.407482716551444, + "grad_norm": 0.28459951521776655, + "learning_rate": 3.7452055095207975e-06, + "loss": 0.0039, + "step": 23133 + }, + { + "epoch": 9.40788938592924, + "grad_norm": 0.06721992718601982, + "learning_rate": 3.7444150110147524e-06, + "loss": 0.0005, + "step": 23134 + }, + { + "epoch": 9.408296055307035, + "grad_norm": 1.5750785247678967, + "learning_rate": 3.7436245767254365e-06, + "loss": 0.0162, + "step": 23135 + }, + { + "epoch": 9.408702724684831, + "grad_norm": 0.01626439235986287, + "learning_rate": 3.7428342066609624e-06, + "loss": 0.0001, + "step": 23136 + }, + { + "epoch": 9.409109394062627, + "grad_norm": 0.31162402231150804, + "learning_rate": 3.7420439008294442e-06, + "loss": 0.0038, + "step": 23137 + }, + { + "epoch": 9.409516063440423, + "grad_norm": 0.01049611275914406, + "learning_rate": 3.741253659238996e-06, + "loss": 0.0001, + "step": 23138 + }, + { + "epoch": 9.409922732818218, + "grad_norm": 0.017818071136282556, + "learning_rate": 3.740463481897725e-06, + "loss": 0.0001, + "step": 23139 + }, + { + "epoch": 9.410329402196014, + "grad_norm": 0.5590851426558558, + "learning_rate": 3.73967336881375e-06, + "loss": 0.0073, + "step": 23140 + }, + { + "epoch": 9.41073607157381, + "grad_norm": 9.376231627624925, + "learning_rate": 3.738883319995178e-06, + "loss": 0.0979, + "step": 23141 + }, + { + "epoch": 9.411142740951606, + "grad_norm": 0.002075339018654529, + "learning_rate": 3.7380933354501204e-06, + "loss": 0.0, + "step": 23142 + }, + { + "epoch": 9.411549410329402, + "grad_norm": 0.01087164623003629, + "learning_rate": 3.737303415186686e-06, + "loss": 0.0001, + "step": 23143 + }, + { + "epoch": 9.411956079707197, + "grad_norm": 0.006135129409689375, + "learning_rate": 
3.7365135592129855e-06, + "loss": 0.0001, + "step": 23144 + }, + { + "epoch": 9.412362749084993, + "grad_norm": 0.0041973579040012945, + "learning_rate": 3.735723767537124e-06, + "loss": 0.0, + "step": 23145 + }, + { + "epoch": 9.41276941846279, + "grad_norm": 1.4978985832716523, + "learning_rate": 3.734934040167211e-06, + "loss": 0.0127, + "step": 23146 + }, + { + "epoch": 9.413176087840586, + "grad_norm": 0.011914875644748598, + "learning_rate": 3.734144377111354e-06, + "loss": 0.0001, + "step": 23147 + }, + { + "epoch": 9.413582757218382, + "grad_norm": 1.8585068228332098, + "learning_rate": 3.733354778377658e-06, + "loss": 0.0259, + "step": 23148 + }, + { + "epoch": 9.413989426596178, + "grad_norm": 0.00036819350471986335, + "learning_rate": 3.7325652439742288e-06, + "loss": 0.0, + "step": 23149 + }, + { + "epoch": 9.414396095973974, + "grad_norm": 0.0032542666432189373, + "learning_rate": 3.731775773909169e-06, + "loss": 0.0, + "step": 23150 + }, + { + "epoch": 9.41480276535177, + "grad_norm": 0.09234386937628315, + "learning_rate": 3.730986368190589e-06, + "loss": 0.0007, + "step": 23151 + }, + { + "epoch": 9.415209434729565, + "grad_norm": 0.12937682923089575, + "learning_rate": 3.7301970268265884e-06, + "loss": 0.001, + "step": 23152 + }, + { + "epoch": 9.415616104107361, + "grad_norm": 0.36093419505951296, + "learning_rate": 3.7294077498252713e-06, + "loss": 0.005, + "step": 23153 + }, + { + "epoch": 9.416022773485157, + "grad_norm": 0.009549622496816436, + "learning_rate": 3.728618537194739e-06, + "loss": 0.0001, + "step": 23154 + }, + { + "epoch": 9.416429442862952, + "grad_norm": 0.07755933123848738, + "learning_rate": 3.7278293889430937e-06, + "loss": 0.0008, + "step": 23155 + }, + { + "epoch": 9.416836112240748, + "grad_norm": 6.3115769517121105, + "learning_rate": 3.7270403050784374e-06, + "loss": 0.0699, + "step": 23156 + }, + { + "epoch": 9.417242781618544, + "grad_norm": 0.012347383741661914, + "learning_rate": 3.7262512856088663e-06, + "loss": 0.0002, + "step": 23157 + }, + { + "epoch": 9.41764945099634, + "grad_norm": 0.7839178385081387, + "learning_rate": 3.7254623305424853e-06, + "loss": 0.0035, + "step": 23158 + }, + { + "epoch": 9.418056120374136, + "grad_norm": 3.427818882136864, + "learning_rate": 3.724673439887392e-06, + "loss": 0.0737, + "step": 23159 + }, + { + "epoch": 9.418462789751931, + "grad_norm": 0.1338879795545714, + "learning_rate": 3.7238846136516847e-06, + "loss": 0.001, + "step": 23160 + }, + { + "epoch": 9.418869459129727, + "grad_norm": 4.081908017618681, + "learning_rate": 3.72309585184346e-06, + "loss": 0.0671, + "step": 23161 + }, + { + "epoch": 9.419276128507523, + "grad_norm": 0.019707841192383058, + "learning_rate": 3.7223071544708155e-06, + "loss": 0.0002, + "step": 23162 + }, + { + "epoch": 9.419682797885319, + "grad_norm": 1.5091810825149408, + "learning_rate": 3.721518521541847e-06, + "loss": 0.0164, + "step": 23163 + }, + { + "epoch": 9.420089467263114, + "grad_norm": 0.011040805385704159, + "learning_rate": 3.720729953064648e-06, + "loss": 0.0001, + "step": 23164 + }, + { + "epoch": 9.42049613664091, + "grad_norm": 0.07821042127086945, + "learning_rate": 3.7199414490473196e-06, + "loss": 0.0006, + "step": 23165 + }, + { + "epoch": 9.420902806018706, + "grad_norm": 1.978574407368946, + "learning_rate": 3.7191530094979533e-06, + "loss": 0.0213, + "step": 23166 + }, + { + "epoch": 9.421309475396503, + "grad_norm": 0.006878869920874474, + "learning_rate": 3.7183646344246427e-06, + "loss": 0.0001, + "step": 23167 + }, + { + "epoch": 
9.4217161447743, + "grad_norm": 1.7427693522517755, + "learning_rate": 3.7175763238354802e-06, + "loss": 0.0092, + "step": 23168 + }, + { + "epoch": 9.422122814152095, + "grad_norm": 0.18922147877599724, + "learning_rate": 3.716788077738558e-06, + "loss": 0.0018, + "step": 23169 + }, + { + "epoch": 9.42252948352989, + "grad_norm": 0.07089510204714118, + "learning_rate": 3.7159998961419654e-06, + "loss": 0.0008, + "step": 23170 + }, + { + "epoch": 9.422936152907686, + "grad_norm": 0.017323477157303223, + "learning_rate": 3.7152117790538002e-06, + "loss": 0.0002, + "step": 23171 + }, + { + "epoch": 9.423342822285482, + "grad_norm": 0.04655783582482611, + "learning_rate": 3.714423726482149e-06, + "loss": 0.0003, + "step": 23172 + }, + { + "epoch": 9.423749491663278, + "grad_norm": 0.28351490728566864, + "learning_rate": 3.713635738435102e-06, + "loss": 0.0022, + "step": 23173 + }, + { + "epoch": 9.424156161041074, + "grad_norm": 0.041051344023735475, + "learning_rate": 3.712847814920747e-06, + "loss": 0.0003, + "step": 23174 + }, + { + "epoch": 9.42456283041887, + "grad_norm": 0.5042710644467866, + "learning_rate": 3.712059955947175e-06, + "loss": 0.0046, + "step": 23175 + }, + { + "epoch": 9.424969499796665, + "grad_norm": 0.012255799983237444, + "learning_rate": 3.711272161522471e-06, + "loss": 0.0001, + "step": 23176 + }, + { + "epoch": 9.425376169174461, + "grad_norm": 0.05520448887985817, + "learning_rate": 3.7104844316547195e-06, + "loss": 0.0004, + "step": 23177 + }, + { + "epoch": 9.425782838552257, + "grad_norm": 22.65858199384258, + "learning_rate": 3.7096967663520156e-06, + "loss": 0.259, + "step": 23178 + }, + { + "epoch": 9.426189507930053, + "grad_norm": 0.7711298006469612, + "learning_rate": 3.7089091656224395e-06, + "loss": 0.0064, + "step": 23179 + }, + { + "epoch": 9.426596177307848, + "grad_norm": 0.004187431787093005, + "learning_rate": 3.7081216294740773e-06, + "loss": 0.0001, + "step": 23180 + }, + { + "epoch": 9.427002846685644, + "grad_norm": 8.055367735907435, + "learning_rate": 3.707334157915012e-06, + "loss": 0.4549, + "step": 23181 + }, + { + "epoch": 9.42740951606344, + "grad_norm": 0.017063478280662837, + "learning_rate": 3.7065467509533305e-06, + "loss": 0.0002, + "step": 23182 + }, + { + "epoch": 9.427816185441236, + "grad_norm": 0.007951476665482175, + "learning_rate": 3.7057594085971094e-06, + "loss": 0.0001, + "step": 23183 + }, + { + "epoch": 9.428222854819031, + "grad_norm": 2.094738418261005, + "learning_rate": 3.7049721308544394e-06, + "loss": 0.0209, + "step": 23184 + }, + { + "epoch": 9.428629524196827, + "grad_norm": 0.011175493565229438, + "learning_rate": 3.7041849177334e-06, + "loss": 0.0001, + "step": 23185 + }, + { + "epoch": 9.429036193574623, + "grad_norm": 0.03693373670715491, + "learning_rate": 3.7033977692420696e-06, + "loss": 0.0003, + "step": 23186 + }, + { + "epoch": 9.42944286295242, + "grad_norm": 0.36692105940213493, + "learning_rate": 3.7026106853885302e-06, + "loss": 0.0032, + "step": 23187 + }, + { + "epoch": 9.429849532330216, + "grad_norm": 3.0428785373006972, + "learning_rate": 3.7018236661808614e-06, + "loss": 0.0312, + "step": 23188 + }, + { + "epoch": 9.430256201708012, + "grad_norm": 0.5806425980937829, + "learning_rate": 3.701036711627143e-06, + "loss": 0.0044, + "step": 23189 + }, + { + "epoch": 9.430662871085808, + "grad_norm": 0.008918417547583183, + "learning_rate": 3.70024982173545e-06, + "loss": 0.0001, + "step": 23190 + }, + { + "epoch": 9.431069540463604, + "grad_norm": 0.07457934358570478, + "learning_rate": 
3.699462996513865e-06, + "loss": 0.0007, + "step": 23191 + }, + { + "epoch": 9.4314762098414, + "grad_norm": 0.010552630597051428, + "learning_rate": 3.6986762359704644e-06, + "loss": 0.0001, + "step": 23192 + }, + { + "epoch": 9.431882879219195, + "grad_norm": 0.005781010587413217, + "learning_rate": 3.697889540113324e-06, + "loss": 0.0001, + "step": 23193 + }, + { + "epoch": 9.43228954859699, + "grad_norm": 6.19457271654831e-05, + "learning_rate": 3.6971029089505185e-06, + "loss": 0.0, + "step": 23194 + }, + { + "epoch": 9.432696217974787, + "grad_norm": 0.007864584289275108, + "learning_rate": 3.696316342490124e-06, + "loss": 0.0001, + "step": 23195 + }, + { + "epoch": 9.433102887352582, + "grad_norm": 0.05206229104242705, + "learning_rate": 3.6955298407402118e-06, + "loss": 0.0007, + "step": 23196 + }, + { + "epoch": 9.433509556730378, + "grad_norm": 0.1392766688931215, + "learning_rate": 3.6947434037088615e-06, + "loss": 0.0017, + "step": 23197 + }, + { + "epoch": 9.433916226108174, + "grad_norm": 0.0020818712962624776, + "learning_rate": 3.6939570314041438e-06, + "loss": 0.0, + "step": 23198 + }, + { + "epoch": 9.43432289548597, + "grad_norm": 0.2722948323191722, + "learning_rate": 3.693170723834132e-06, + "loss": 0.0025, + "step": 23199 + }, + { + "epoch": 9.434729564863765, + "grad_norm": 0.0005841896092468947, + "learning_rate": 3.692384481006895e-06, + "loss": 0.0, + "step": 23200 + }, + { + "epoch": 9.435136234241561, + "grad_norm": 0.163323253629252, + "learning_rate": 3.691598302930507e-06, + "loss": 0.0016, + "step": 23201 + }, + { + "epoch": 9.435542903619357, + "grad_norm": 9.883679067743847, + "learning_rate": 3.6908121896130367e-06, + "loss": 0.2301, + "step": 23202 + }, + { + "epoch": 9.435949572997153, + "grad_norm": 7.4767131301949, + "learning_rate": 3.6900261410625526e-06, + "loss": 0.1216, + "step": 23203 + }, + { + "epoch": 9.436356242374949, + "grad_norm": 0.03790971567103162, + "learning_rate": 3.6892401572871286e-06, + "loss": 0.0005, + "step": 23204 + }, + { + "epoch": 9.436762911752744, + "grad_norm": 0.4296204956329352, + "learning_rate": 3.6884542382948294e-06, + "loss": 0.0025, + "step": 23205 + }, + { + "epoch": 9.43716958113054, + "grad_norm": 0.8220714423832228, + "learning_rate": 3.6876683840937254e-06, + "loss": 0.0074, + "step": 23206 + }, + { + "epoch": 9.437576250508336, + "grad_norm": 0.27272757885728494, + "learning_rate": 3.686882594691882e-06, + "loss": 0.0026, + "step": 23207 + }, + { + "epoch": 9.437982919886133, + "grad_norm": 0.0034580257198810953, + "learning_rate": 3.686096870097365e-06, + "loss": 0.0, + "step": 23208 + }, + { + "epoch": 9.43838958926393, + "grad_norm": 2.482883758273309, + "learning_rate": 3.68531121031824e-06, + "loss": 0.0225, + "step": 23209 + }, + { + "epoch": 9.438796258641725, + "grad_norm": 0.16704791743740824, + "learning_rate": 3.6845256153625753e-06, + "loss": 0.0011, + "step": 23210 + }, + { + "epoch": 9.43920292801952, + "grad_norm": 0.0002006441042120442, + "learning_rate": 3.6837400852384343e-06, + "loss": 0.0, + "step": 23211 + }, + { + "epoch": 9.439609597397316, + "grad_norm": 0.001341432067982207, + "learning_rate": 3.68295461995388e-06, + "loss": 0.0, + "step": 23212 + }, + { + "epoch": 9.440016266775112, + "grad_norm": 0.0015287393723772981, + "learning_rate": 3.6821692195169755e-06, + "loss": 0.0, + "step": 23213 + }, + { + "epoch": 9.440422936152908, + "grad_norm": 1.236305030197253, + "learning_rate": 3.681383883935784e-06, + "loss": 0.01, + "step": 23214 + }, + { + "epoch": 9.440829605530704, + 
"grad_norm": 1.6556883524373187, + "learning_rate": 3.680598613218368e-06, + "loss": 0.0172, + "step": 23215 + }, + { + "epoch": 9.4412362749085, + "grad_norm": 0.22858306096884648, + "learning_rate": 3.679813407372783e-06, + "loss": 0.0024, + "step": 23216 + }, + { + "epoch": 9.441642944286295, + "grad_norm": 0.017454397145354128, + "learning_rate": 3.6790282664070985e-06, + "loss": 0.0002, + "step": 23217 + }, + { + "epoch": 9.442049613664091, + "grad_norm": 0.3799655780935832, + "learning_rate": 3.678243190329369e-06, + "loss": 0.0021, + "step": 23218 + }, + { + "epoch": 9.442456283041887, + "grad_norm": 0.08664104609115457, + "learning_rate": 3.677458179147656e-06, + "loss": 0.0008, + "step": 23219 + }, + { + "epoch": 9.442862952419683, + "grad_norm": 0.003854921815362294, + "learning_rate": 3.676673232870016e-06, + "loss": 0.0, + "step": 23220 + }, + { + "epoch": 9.443269621797478, + "grad_norm": 0.11229732335169654, + "learning_rate": 3.6758883515045085e-06, + "loss": 0.001, + "step": 23221 + }, + { + "epoch": 9.443676291175274, + "grad_norm": 0.793630615027051, + "learning_rate": 3.675103535059187e-06, + "loss": 0.0061, + "step": 23222 + }, + { + "epoch": 9.44408296055307, + "grad_norm": 0.025198163717378454, + "learning_rate": 3.6743187835421135e-06, + "loss": 0.0002, + "step": 23223 + }, + { + "epoch": 9.444489629930866, + "grad_norm": 0.0010189235226964336, + "learning_rate": 3.673534096961341e-06, + "loss": 0.0, + "step": 23224 + }, + { + "epoch": 9.444896299308661, + "grad_norm": 0.0627059723530529, + "learning_rate": 3.672749475324925e-06, + "loss": 0.0007, + "step": 23225 + }, + { + "epoch": 9.445302968686457, + "grad_norm": 4.099657664984634, + "learning_rate": 3.671964918640921e-06, + "loss": 0.0162, + "step": 23226 + }, + { + "epoch": 9.445709638064255, + "grad_norm": 0.0605451576806585, + "learning_rate": 3.671180426917381e-06, + "loss": 0.0007, + "step": 23227 + }, + { + "epoch": 9.44611630744205, + "grad_norm": 15.253929283720593, + "learning_rate": 3.6703960001623596e-06, + "loss": 0.2718, + "step": 23228 + }, + { + "epoch": 9.446522976819846, + "grad_norm": 5.837680602052742, + "learning_rate": 3.6696116383839055e-06, + "loss": 0.1344, + "step": 23229 + }, + { + "epoch": 9.446929646197642, + "grad_norm": 0.008189355920748926, + "learning_rate": 3.6688273415900778e-06, + "loss": 0.0001, + "step": 23230 + }, + { + "epoch": 9.447336315575438, + "grad_norm": 0.37425174304628345, + "learning_rate": 3.6680431097889234e-06, + "loss": 0.0034, + "step": 23231 + }, + { + "epoch": 9.447742984953234, + "grad_norm": 0.0005244467842495101, + "learning_rate": 3.667258942988493e-06, + "loss": 0.0, + "step": 23232 + }, + { + "epoch": 9.44814965433103, + "grad_norm": 0.0009370709675055924, + "learning_rate": 3.6664748411968374e-06, + "loss": 0.0, + "step": 23233 + }, + { + "epoch": 9.448556323708825, + "grad_norm": 0.024720187696941676, + "learning_rate": 3.665690804422004e-06, + "loss": 0.0003, + "step": 23234 + }, + { + "epoch": 9.44896299308662, + "grad_norm": 0.0032420609502651814, + "learning_rate": 3.6649068326720407e-06, + "loss": 0.0, + "step": 23235 + }, + { + "epoch": 9.449369662464417, + "grad_norm": 1.315538167462958, + "learning_rate": 3.6641229259549994e-06, + "loss": 0.0201, + "step": 23236 + }, + { + "epoch": 9.449776331842212, + "grad_norm": 0.008107671347512073, + "learning_rate": 3.663339084278925e-06, + "loss": 0.0001, + "step": 23237 + }, + { + "epoch": 9.450183001220008, + "grad_norm": 0.008282772441029875, + "learning_rate": 3.6625553076518648e-06, + "loss": 
0.0001, + "step": 23238 + }, + { + "epoch": 9.450589670597804, + "grad_norm": 0.02602207349721343, + "learning_rate": 3.6617715960818633e-06, + "loss": 0.0002, + "step": 23239 + }, + { + "epoch": 9.4509963399756, + "grad_norm": 0.015351061661564122, + "learning_rate": 3.6609879495769662e-06, + "loss": 0.0001, + "step": 23240 + }, + { + "epoch": 9.451403009353395, + "grad_norm": 0.11848737278105653, + "learning_rate": 3.6602043681452194e-06, + "loss": 0.0005, + "step": 23241 + }, + { + "epoch": 9.451809678731191, + "grad_norm": 1.0578308417463285, + "learning_rate": 3.6594208517946605e-06, + "loss": 0.0099, + "step": 23242 + }, + { + "epoch": 9.452216348108987, + "grad_norm": 0.5496343925053854, + "learning_rate": 3.6586374005333425e-06, + "loss": 0.0056, + "step": 23243 + }, + { + "epoch": 9.452623017486783, + "grad_norm": 11.58062500224509, + "learning_rate": 3.6578540143693055e-06, + "loss": 0.3444, + "step": 23244 + }, + { + "epoch": 9.453029686864578, + "grad_norm": 3.5199183072338878, + "learning_rate": 3.657070693310586e-06, + "loss": 0.0481, + "step": 23245 + }, + { + "epoch": 9.453436356242374, + "grad_norm": 0.04942810703664779, + "learning_rate": 3.656287437365228e-06, + "loss": 0.0004, + "step": 23246 + }, + { + "epoch": 9.45384302562017, + "grad_norm": 0.30113859404223403, + "learning_rate": 3.6555042465412692e-06, + "loss": 0.0029, + "step": 23247 + }, + { + "epoch": 9.454249694997966, + "grad_norm": 0.29523796261029, + "learning_rate": 3.654721120846756e-06, + "loss": 0.0033, + "step": 23248 + }, + { + "epoch": 9.454656364375763, + "grad_norm": 0.5859344300050577, + "learning_rate": 3.6539380602897232e-06, + "loss": 0.0051, + "step": 23249 + }, + { + "epoch": 9.455063033753559, + "grad_norm": 0.006867797178370972, + "learning_rate": 3.6531550648782098e-06, + "loss": 0.0001, + "step": 23250 + }, + { + "epoch": 9.455469703131355, + "grad_norm": 0.011274026578049625, + "learning_rate": 3.6523721346202554e-06, + "loss": 0.0001, + "step": 23251 + }, + { + "epoch": 9.45587637250915, + "grad_norm": 4.318046906496219, + "learning_rate": 3.6515892695238952e-06, + "loss": 0.1289, + "step": 23252 + }, + { + "epoch": 9.456283041886946, + "grad_norm": 0.11236787124393438, + "learning_rate": 3.6508064695971624e-06, + "loss": 0.0012, + "step": 23253 + }, + { + "epoch": 9.456689711264742, + "grad_norm": 0.05933508872968596, + "learning_rate": 3.6500237348481003e-06, + "loss": 0.0006, + "step": 23254 + }, + { + "epoch": 9.457096380642538, + "grad_norm": 0.008111076267063918, + "learning_rate": 3.64924106528474e-06, + "loss": 0.0001, + "step": 23255 + }, + { + "epoch": 9.457503050020334, + "grad_norm": 2.4806254785595496, + "learning_rate": 3.6484584609151174e-06, + "loss": 0.0239, + "step": 23256 + }, + { + "epoch": 9.45790971939813, + "grad_norm": 0.06281454709791434, + "learning_rate": 3.6476759217472648e-06, + "loss": 0.0005, + "step": 23257 + }, + { + "epoch": 9.458316388775925, + "grad_norm": 0.00883036583931754, + "learning_rate": 3.646893447789216e-06, + "loss": 0.0001, + "step": 23258 + }, + { + "epoch": 9.458723058153721, + "grad_norm": 0.3191757716200167, + "learning_rate": 3.646111039049003e-06, + "loss": 0.0025, + "step": 23259 + }, + { + "epoch": 9.459129727531517, + "grad_norm": 0.0727044304683254, + "learning_rate": 3.6453286955346545e-06, + "loss": 0.0008, + "step": 23260 + }, + { + "epoch": 9.459536396909312, + "grad_norm": 0.08482954179631323, + "learning_rate": 3.6445464172542087e-06, + "loss": 0.0008, + "step": 23261 + }, + { + "epoch": 9.459943066287108, + "grad_norm": 
0.019256117465936747, + "learning_rate": 3.643764204215694e-06, + "loss": 0.0002, + "step": 23262 + }, + { + "epoch": 9.460349735664904, + "grad_norm": 0.7263912391161665, + "learning_rate": 3.642982056427137e-06, + "loss": 0.0063, + "step": 23263 + }, + { + "epoch": 9.4607564050427, + "grad_norm": 0.00825524568041275, + "learning_rate": 3.64219997389657e-06, + "loss": 0.0001, + "step": 23264 + }, + { + "epoch": 9.461163074420496, + "grad_norm": 0.1577152895270645, + "learning_rate": 3.6414179566320186e-06, + "loss": 0.0014, + "step": 23265 + }, + { + "epoch": 9.461569743798291, + "grad_norm": 6.572089000442817, + "learning_rate": 3.6406360046415136e-06, + "loss": 0.0832, + "step": 23266 + }, + { + "epoch": 9.461976413176087, + "grad_norm": 0.5523362121245147, + "learning_rate": 3.639854117933076e-06, + "loss": 0.0088, + "step": 23267 + }, + { + "epoch": 9.462383082553885, + "grad_norm": 0.07743099569956913, + "learning_rate": 3.6390722965147406e-06, + "loss": 0.0006, + "step": 23268 + }, + { + "epoch": 9.46278975193168, + "grad_norm": 0.8335258852916027, + "learning_rate": 3.63829054039453e-06, + "loss": 0.009, + "step": 23269 + }, + { + "epoch": 9.463196421309476, + "grad_norm": 0.03415486878096959, + "learning_rate": 3.637508849580468e-06, + "loss": 0.0004, + "step": 23270 + }, + { + "epoch": 9.463603090687272, + "grad_norm": 0.020695314554359766, + "learning_rate": 3.6367272240805807e-06, + "loss": 0.0002, + "step": 23271 + }, + { + "epoch": 9.464009760065068, + "grad_norm": 0.5382770398940143, + "learning_rate": 3.63594566390289e-06, + "loss": 0.0078, + "step": 23272 + }, + { + "epoch": 9.464416429442863, + "grad_norm": 0.0035895096157108138, + "learning_rate": 3.6351641690554174e-06, + "loss": 0.0, + "step": 23273 + }, + { + "epoch": 9.46482309882066, + "grad_norm": 0.01292109851858444, + "learning_rate": 3.6343827395461906e-06, + "loss": 0.0002, + "step": 23274 + }, + { + "epoch": 9.465229768198455, + "grad_norm": 0.0012528822940517659, + "learning_rate": 3.63360137538323e-06, + "loss": 0.0, + "step": 23275 + }, + { + "epoch": 9.46563643757625, + "grad_norm": 0.2667421460853288, + "learning_rate": 3.632820076574555e-06, + "loss": 0.0026, + "step": 23276 + }, + { + "epoch": 9.466043106954046, + "grad_norm": 0.003745249288685306, + "learning_rate": 3.632038843128185e-06, + "loss": 0.0, + "step": 23277 + }, + { + "epoch": 9.466449776331842, + "grad_norm": 0.005649972753749946, + "learning_rate": 3.6312576750521422e-06, + "loss": 0.0001, + "step": 23278 + }, + { + "epoch": 9.466856445709638, + "grad_norm": 0.7629177592435735, + "learning_rate": 3.6304765723544455e-06, + "loss": 0.0051, + "step": 23279 + }, + { + "epoch": 9.467263115087434, + "grad_norm": 0.0014504335043519529, + "learning_rate": 3.6296955350431086e-06, + "loss": 0.0, + "step": 23280 + }, + { + "epoch": 9.46766978446523, + "grad_norm": 0.011396499680353368, + "learning_rate": 3.628914563126156e-06, + "loss": 0.0001, + "step": 23281 + }, + { + "epoch": 9.468076453843025, + "grad_norm": 0.09933944508258723, + "learning_rate": 3.628133656611602e-06, + "loss": 0.001, + "step": 23282 + }, + { + "epoch": 9.468483123220821, + "grad_norm": 0.008056068358326394, + "learning_rate": 3.627352815507463e-06, + "loss": 0.0001, + "step": 23283 + }, + { + "epoch": 9.468889792598617, + "grad_norm": 0.004880357603428353, + "learning_rate": 3.6265720398217554e-06, + "loss": 0.0001, + "step": 23284 + }, + { + "epoch": 9.469296461976413, + "grad_norm": 0.4203679718581177, + "learning_rate": 3.6257913295624924e-06, + "loss": 0.0041, + "step": 
23285 + }, + { + "epoch": 9.469703131354208, + "grad_norm": 0.10613567305446238, + "learning_rate": 3.6250106847376863e-06, + "loss": 0.0008, + "step": 23286 + }, + { + "epoch": 9.470109800732004, + "grad_norm": 0.02335462805190792, + "learning_rate": 3.6242301053553585e-06, + "loss": 0.0003, + "step": 23287 + }, + { + "epoch": 9.4705164701098, + "grad_norm": 6.79578454160239, + "learning_rate": 3.6234495914235155e-06, + "loss": 0.0719, + "step": 23288 + }, + { + "epoch": 9.470923139487596, + "grad_norm": 0.00027981003471600656, + "learning_rate": 3.6226691429501738e-06, + "loss": 0.0, + "step": 23289 + }, + { + "epoch": 9.471329808865393, + "grad_norm": 0.00720645585375087, + "learning_rate": 3.621888759943342e-06, + "loss": 0.0001, + "step": 23290 + }, + { + "epoch": 9.471736478243189, + "grad_norm": 0.9313705669461839, + "learning_rate": 3.621108442411032e-06, + "loss": 0.0128, + "step": 23291 + }, + { + "epoch": 9.472143147620985, + "grad_norm": 0.1336620918474141, + "learning_rate": 3.6203281903612543e-06, + "loss": 0.0012, + "step": 23292 + }, + { + "epoch": 9.47254981699878, + "grad_norm": 0.0003046719318070274, + "learning_rate": 3.6195480038020157e-06, + "loss": 0.0, + "step": 23293 + }, + { + "epoch": 9.472956486376576, + "grad_norm": 0.32889936021712635, + "learning_rate": 3.618767882741331e-06, + "loss": 0.0024, + "step": 23294 + }, + { + "epoch": 9.473363155754372, + "grad_norm": 0.438537633991439, + "learning_rate": 3.617987827187206e-06, + "loss": 0.0039, + "step": 23295 + }, + { + "epoch": 9.473769825132168, + "grad_norm": 1.089082601687776, + "learning_rate": 3.617207837147647e-06, + "loss": 0.0084, + "step": 23296 + }, + { + "epoch": 9.474176494509964, + "grad_norm": 0.07429831271502998, + "learning_rate": 3.6164279126306624e-06, + "loss": 0.0007, + "step": 23297 + }, + { + "epoch": 9.47458316388776, + "grad_norm": 0.002694667747253, + "learning_rate": 3.6156480536442582e-06, + "loss": 0.0, + "step": 23298 + }, + { + "epoch": 9.474989833265555, + "grad_norm": 0.03599205899102782, + "learning_rate": 3.614868260196437e-06, + "loss": 0.0004, + "step": 23299 + }, + { + "epoch": 9.47539650264335, + "grad_norm": 3.646027574573142, + "learning_rate": 3.614088532295209e-06, + "loss": 0.0607, + "step": 23300 + }, + { + "epoch": 9.475803172021147, + "grad_norm": 0.01742607163303455, + "learning_rate": 3.6133088699485775e-06, + "loss": 0.0002, + "step": 23301 + }, + { + "epoch": 9.476209841398942, + "grad_norm": 0.05123873951728816, + "learning_rate": 3.6125292731645433e-06, + "loss": 0.0006, + "step": 23302 + }, + { + "epoch": 9.476616510776738, + "grad_norm": 2.0310439992258984, + "learning_rate": 3.6117497419511117e-06, + "loss": 0.0242, + "step": 23303 + }, + { + "epoch": 9.477023180154534, + "grad_norm": 0.004153326635366464, + "learning_rate": 3.6109702763162846e-06, + "loss": 0.0001, + "step": 23304 + }, + { + "epoch": 9.47742984953233, + "grad_norm": 0.03185860555512799, + "learning_rate": 3.6101908762680628e-06, + "loss": 0.0002, + "step": 23305 + }, + { + "epoch": 9.477836518910125, + "grad_norm": 5.129775599927541, + "learning_rate": 3.6094115418144437e-06, + "loss": 0.1546, + "step": 23306 + }, + { + "epoch": 9.478243188287921, + "grad_norm": 0.4844486269750847, + "learning_rate": 3.608632272963435e-06, + "loss": 0.0027, + "step": 23307 + }, + { + "epoch": 9.478649857665717, + "grad_norm": 0.0018546831958683627, + "learning_rate": 3.607853069723033e-06, + "loss": 0.0, + "step": 23308 + }, + { + "epoch": 9.479056527043515, + "grad_norm": 0.0012114508185566523, + 
"learning_rate": 3.607073932101236e-06, + "loss": 0.0, + "step": 23309 + }, + { + "epoch": 9.47946319642131, + "grad_norm": 0.02853919871737084, + "learning_rate": 3.6062948601060423e-06, + "loss": 0.0003, + "step": 23310 + }, + { + "epoch": 9.479869865799106, + "grad_norm": 3.464959180569196, + "learning_rate": 3.6055158537454502e-06, + "loss": 0.0184, + "step": 23311 + }, + { + "epoch": 9.480276535176902, + "grad_norm": 1.4473046778224616, + "learning_rate": 3.6047369130274536e-06, + "loss": 0.0064, + "step": 23312 + }, + { + "epoch": 9.480683204554698, + "grad_norm": 0.9683649114045215, + "learning_rate": 3.6039580379600537e-06, + "loss": 0.004, + "step": 23313 + }, + { + "epoch": 9.481089873932493, + "grad_norm": 3.507845813611218, + "learning_rate": 3.6031792285512436e-06, + "loss": 0.0788, + "step": 23314 + }, + { + "epoch": 9.48149654331029, + "grad_norm": 1.8951768066527155, + "learning_rate": 3.6024004848090178e-06, + "loss": 0.0134, + "step": 23315 + }, + { + "epoch": 9.481903212688085, + "grad_norm": 0.8375978602706214, + "learning_rate": 3.601621806741371e-06, + "loss": 0.0089, + "step": 23316 + }, + { + "epoch": 9.48230988206588, + "grad_norm": 3.9135685116922003, + "learning_rate": 3.6008431943562973e-06, + "loss": 0.0401, + "step": 23317 + }, + { + "epoch": 9.482716551443676, + "grad_norm": 4.5360826984343285, + "learning_rate": 3.6000646476617884e-06, + "loss": 0.1771, + "step": 23318 + }, + { + "epoch": 9.483123220821472, + "grad_norm": 0.030083834005853797, + "learning_rate": 3.599286166665834e-06, + "loss": 0.0002, + "step": 23319 + }, + { + "epoch": 9.483529890199268, + "grad_norm": 0.2943520570581471, + "learning_rate": 3.598507751376431e-06, + "loss": 0.0026, + "step": 23320 + }, + { + "epoch": 9.483936559577064, + "grad_norm": 0.39059525038451964, + "learning_rate": 3.5977294018015686e-06, + "loss": 0.0044, + "step": 23321 + }, + { + "epoch": 9.48434322895486, + "grad_norm": 0.008518846389457704, + "learning_rate": 3.5969511179492356e-06, + "loss": 0.0001, + "step": 23322 + }, + { + "epoch": 9.484749898332655, + "grad_norm": 1.241906832380387, + "learning_rate": 3.5961728998274225e-06, + "loss": 0.0142, + "step": 23323 + }, + { + "epoch": 9.485156567710451, + "grad_norm": 0.11423140527929318, + "learning_rate": 3.5953947474441163e-06, + "loss": 0.0023, + "step": 23324 + }, + { + "epoch": 9.485563237088247, + "grad_norm": 18.628261834979345, + "learning_rate": 3.5946166608073043e-06, + "loss": 0.3026, + "step": 23325 + }, + { + "epoch": 9.485969906466043, + "grad_norm": 0.7716913625012727, + "learning_rate": 3.5938386399249793e-06, + "loss": 0.0061, + "step": 23326 + }, + { + "epoch": 9.486376575843838, + "grad_norm": 0.0019087219133602853, + "learning_rate": 3.5930606848051244e-06, + "loss": 0.0, + "step": 23327 + }, + { + "epoch": 9.486783245221634, + "grad_norm": 0.010779838231739121, + "learning_rate": 3.592282795455726e-06, + "loss": 0.0001, + "step": 23328 + }, + { + "epoch": 9.48718991459943, + "grad_norm": 0.030619291009704827, + "learning_rate": 3.5915049718847693e-06, + "loss": 0.0003, + "step": 23329 + }, + { + "epoch": 9.487596583977226, + "grad_norm": 1.1241168413474505, + "learning_rate": 3.5907272141002404e-06, + "loss": 0.0076, + "step": 23330 + }, + { + "epoch": 9.488003253355023, + "grad_norm": 2.0733326599862143, + "learning_rate": 3.5899495221101223e-06, + "loss": 0.0231, + "step": 23331 + }, + { + "epoch": 9.488409922732819, + "grad_norm": 0.0012983121340399391, + "learning_rate": 3.5891718959223944e-06, + "loss": 0.0, + "step": 23332 + }, + { + 
"epoch": 9.488816592110615, + "grad_norm": 0.00022851999024800726, + "learning_rate": 3.5883943355450468e-06, + "loss": 0.0, + "step": 23333 + }, + { + "epoch": 9.48922326148841, + "grad_norm": 0.15662011725780778, + "learning_rate": 3.587616840986059e-06, + "loss": 0.0011, + "step": 23334 + }, + { + "epoch": 9.489629930866206, + "grad_norm": 0.03941483555534828, + "learning_rate": 3.586839412253411e-06, + "loss": 0.0003, + "step": 23335 + }, + { + "epoch": 9.490036600244002, + "grad_norm": 0.006658505130645016, + "learning_rate": 3.586062049355085e-06, + "loss": 0.0001, + "step": 23336 + }, + { + "epoch": 9.490443269621798, + "grad_norm": 0.026768856293625902, + "learning_rate": 3.5852847522990585e-06, + "loss": 0.0004, + "step": 23337 + }, + { + "epoch": 9.490849938999594, + "grad_norm": 0.5039760636088545, + "learning_rate": 3.5845075210933102e-06, + "loss": 0.0046, + "step": 23338 + }, + { + "epoch": 9.49125660837739, + "grad_norm": 0.4416278052083909, + "learning_rate": 3.5837303557458246e-06, + "loss": 0.0047, + "step": 23339 + }, + { + "epoch": 9.491663277755185, + "grad_norm": 0.05264833132430368, + "learning_rate": 3.582953256264575e-06, + "loss": 0.0005, + "step": 23340 + }, + { + "epoch": 9.49206994713298, + "grad_norm": 0.038588064783396867, + "learning_rate": 3.5821762226575416e-06, + "loss": 0.0004, + "step": 23341 + }, + { + "epoch": 9.492476616510777, + "grad_norm": 0.0019274852934150109, + "learning_rate": 3.5813992549326982e-06, + "loss": 0.0, + "step": 23342 + }, + { + "epoch": 9.492883285888572, + "grad_norm": 0.0762644359307963, + "learning_rate": 3.580622353098022e-06, + "loss": 0.001, + "step": 23343 + }, + { + "epoch": 9.493289955266368, + "grad_norm": 0.002935274329129898, + "learning_rate": 3.579845517161489e-06, + "loss": 0.0, + "step": 23344 + }, + { + "epoch": 9.493696624644164, + "grad_norm": 0.0004503061406672915, + "learning_rate": 3.5790687471310716e-06, + "loss": 0.0, + "step": 23345 + }, + { + "epoch": 9.49410329402196, + "grad_norm": 0.007308650636694569, + "learning_rate": 3.578292043014745e-06, + "loss": 0.0001, + "step": 23346 + }, + { + "epoch": 9.494509963399755, + "grad_norm": 0.0008336363821249064, + "learning_rate": 3.577515404820484e-06, + "loss": 0.0, + "step": 23347 + }, + { + "epoch": 9.494916632777551, + "grad_norm": 0.051901989370310826, + "learning_rate": 3.576738832556259e-06, + "loss": 0.0004, + "step": 23348 + }, + { + "epoch": 9.495323302155347, + "grad_norm": 0.008268809633958886, + "learning_rate": 3.5759623262300424e-06, + "loss": 0.0001, + "step": 23349 + }, + { + "epoch": 9.495729971533144, + "grad_norm": 0.009088064989371218, + "learning_rate": 3.5751858858498024e-06, + "loss": 0.0001, + "step": 23350 + }, + { + "epoch": 9.49613664091094, + "grad_norm": 0.004706948121436286, + "learning_rate": 3.5744095114235155e-06, + "loss": 0.0001, + "step": 23351 + }, + { + "epoch": 9.496543310288736, + "grad_norm": 7.1122144099923235, + "learning_rate": 3.5736332029591504e-06, + "loss": 0.0727, + "step": 23352 + }, + { + "epoch": 9.496949979666532, + "grad_norm": 0.22164854078546264, + "learning_rate": 3.5728569604646737e-06, + "loss": 0.0009, + "step": 23353 + }, + { + "epoch": 9.497356649044328, + "grad_norm": 0.15683725423335468, + "learning_rate": 3.5720807839480555e-06, + "loss": 0.0012, + "step": 23354 + }, + { + "epoch": 9.497763318422123, + "grad_norm": 0.0034606696057223212, + "learning_rate": 3.5713046734172627e-06, + "loss": 0.0, + "step": 23355 + }, + { + "epoch": 9.498169987799919, + "grad_norm": 0.015957423200222167, + 
"learning_rate": 3.57052862888026e-06, + "loss": 0.0002, + "step": 23356 + }, + { + "epoch": 9.498576657177715, + "grad_norm": 0.0035137000975661763, + "learning_rate": 3.5697526503450207e-06, + "loss": 0.0, + "step": 23357 + }, + { + "epoch": 9.49898332655551, + "grad_norm": 1.1195098800792491, + "learning_rate": 3.5689767378195063e-06, + "loss": 0.0057, + "step": 23358 + }, + { + "epoch": 9.499389995933306, + "grad_norm": 0.2229447149691959, + "learning_rate": 3.568200891311682e-06, + "loss": 0.0024, + "step": 23359 + }, + { + "epoch": 9.499796665311102, + "grad_norm": 0.5950551602630011, + "learning_rate": 3.5674251108295122e-06, + "loss": 0.0071, + "step": 23360 + }, + { + "epoch": 9.500203334688898, + "grad_norm": 0.3522814887074235, + "learning_rate": 3.5666493963809623e-06, + "loss": 0.0044, + "step": 23361 + }, + { + "epoch": 9.500610004066694, + "grad_norm": 0.21194348087808468, + "learning_rate": 3.5658737479739936e-06, + "loss": 0.0017, + "step": 23362 + }, + { + "epoch": 9.50101667344449, + "grad_norm": 0.0036233936102538643, + "learning_rate": 3.565098165616565e-06, + "loss": 0.0001, + "step": 23363 + }, + { + "epoch": 9.501423342822285, + "grad_norm": 0.019121927595055505, + "learning_rate": 3.564322649316646e-06, + "loss": 0.0001, + "step": 23364 + }, + { + "epoch": 9.501830012200081, + "grad_norm": 0.15230024154250255, + "learning_rate": 3.5635471990821945e-06, + "loss": 0.0007, + "step": 23365 + }, + { + "epoch": 9.502236681577877, + "grad_norm": 0.04268081843347751, + "learning_rate": 3.562771814921171e-06, + "loss": 0.0004, + "step": 23366 + }, + { + "epoch": 9.502643350955672, + "grad_norm": 7.0631204679100925, + "learning_rate": 3.561996496841533e-06, + "loss": 0.102, + "step": 23367 + }, + { + "epoch": 9.503050020333468, + "grad_norm": 1.3558891637931778, + "learning_rate": 3.561221244851243e-06, + "loss": 0.0134, + "step": 23368 + }, + { + "epoch": 9.503456689711264, + "grad_norm": 0.01924435356460981, + "learning_rate": 3.5604460589582524e-06, + "loss": 0.0001, + "step": 23369 + }, + { + "epoch": 9.50386335908906, + "grad_norm": 0.1316997956207898, + "learning_rate": 3.5596709391705286e-06, + "loss": 0.0016, + "step": 23370 + }, + { + "epoch": 9.504270028466856, + "grad_norm": 1.9098326924852904, + "learning_rate": 3.558895885496023e-06, + "loss": 0.0128, + "step": 23371 + }, + { + "epoch": 9.504676697844653, + "grad_norm": 0.15949045119579153, + "learning_rate": 3.558120897942694e-06, + "loss": 0.0012, + "step": 23372 + }, + { + "epoch": 9.505083367222449, + "grad_norm": 0.7729192485921104, + "learning_rate": 3.557345976518495e-06, + "loss": 0.0058, + "step": 23373 + }, + { + "epoch": 9.505490036600245, + "grad_norm": 2.61226033859398, + "learning_rate": 3.5565711212313824e-06, + "loss": 0.0291, + "step": 23374 + }, + { + "epoch": 9.50589670597804, + "grad_norm": 0.23820000575942069, + "learning_rate": 3.555796332089311e-06, + "loss": 0.0024, + "step": 23375 + }, + { + "epoch": 9.506303375355836, + "grad_norm": 0.7941370996211472, + "learning_rate": 3.5550216091002288e-06, + "loss": 0.0068, + "step": 23376 + }, + { + "epoch": 9.506710044733632, + "grad_norm": 0.018052639623061827, + "learning_rate": 3.5542469522720977e-06, + "loss": 0.0003, + "step": 23377 + }, + { + "epoch": 9.507116714111428, + "grad_norm": 0.0004435758737268584, + "learning_rate": 3.553472361612865e-06, + "loss": 0.0, + "step": 23378 + }, + { + "epoch": 9.507523383489223, + "grad_norm": 0.1816370413530485, + "learning_rate": 3.5526978371304833e-06, + "loss": 0.0021, + "step": 23379 + }, + { + 
"epoch": 9.50793005286702, + "grad_norm": 0.2025396191962711, + "learning_rate": 3.5519233788329033e-06, + "loss": 0.0016, + "step": 23380 + }, + { + "epoch": 9.508336722244815, + "grad_norm": 0.04695454815509824, + "learning_rate": 3.551148986728076e-06, + "loss": 0.0006, + "step": 23381 + }, + { + "epoch": 9.50874339162261, + "grad_norm": 0.03122530535605384, + "learning_rate": 3.5503746608239487e-06, + "loss": 0.0003, + "step": 23382 + }, + { + "epoch": 9.509150061000406, + "grad_norm": 0.016044331605250144, + "learning_rate": 3.549600401128469e-06, + "loss": 0.0001, + "step": 23383 + }, + { + "epoch": 9.509556730378202, + "grad_norm": 0.04250596546350499, + "learning_rate": 3.5488262076495905e-06, + "loss": 0.0003, + "step": 23384 + }, + { + "epoch": 9.509963399755998, + "grad_norm": 0.47589034977242906, + "learning_rate": 3.548052080395258e-06, + "loss": 0.0025, + "step": 23385 + }, + { + "epoch": 9.510370069133794, + "grad_norm": 0.11319706210975132, + "learning_rate": 3.547278019373417e-06, + "loss": 0.0009, + "step": 23386 + }, + { + "epoch": 9.51077673851159, + "grad_norm": 0.34769089015683596, + "learning_rate": 3.5465040245920155e-06, + "loss": 0.0028, + "step": 23387 + }, + { + "epoch": 9.511183407889385, + "grad_norm": 0.0063583869447932356, + "learning_rate": 3.545730096058999e-06, + "loss": 0.0001, + "step": 23388 + }, + { + "epoch": 9.511590077267181, + "grad_norm": 0.11398386623104059, + "learning_rate": 3.5449562337823072e-06, + "loss": 0.0012, + "step": 23389 + }, + { + "epoch": 9.511996746644977, + "grad_norm": 10.649321976604176, + "learning_rate": 3.544182437769892e-06, + "loss": 0.123, + "step": 23390 + }, + { + "epoch": 9.512403416022774, + "grad_norm": 0.007674718723378962, + "learning_rate": 3.5434087080296932e-06, + "loss": 0.0001, + "step": 23391 + }, + { + "epoch": 9.51281008540057, + "grad_norm": 0.3364966946526612, + "learning_rate": 3.542635044569653e-06, + "loss": 0.0032, + "step": 23392 + }, + { + "epoch": 9.513216754778366, + "grad_norm": 0.031871564759477354, + "learning_rate": 3.5418614473977154e-06, + "loss": 0.0004, + "step": 23393 + }, + { + "epoch": 9.513623424156162, + "grad_norm": 0.27469426972626904, + "learning_rate": 3.5410879165218194e-06, + "loss": 0.0014, + "step": 23394 + }, + { + "epoch": 9.514030093533957, + "grad_norm": 0.04420325250460372, + "learning_rate": 3.5403144519499066e-06, + "loss": 0.0004, + "step": 23395 + }, + { + "epoch": 9.514436762911753, + "grad_norm": 0.40615880303922985, + "learning_rate": 3.5395410536899133e-06, + "loss": 0.0063, + "step": 23396 + }, + { + "epoch": 9.514843432289549, + "grad_norm": 0.05635252214101831, + "learning_rate": 3.5387677217497864e-06, + "loss": 0.0007, + "step": 23397 + }, + { + "epoch": 9.515250101667345, + "grad_norm": 0.21254129230389085, + "learning_rate": 3.5379944561374604e-06, + "loss": 0.0019, + "step": 23398 + }, + { + "epoch": 9.51565677104514, + "grad_norm": 0.41317807620762287, + "learning_rate": 3.5372212568608745e-06, + "loss": 0.0039, + "step": 23399 + }, + { + "epoch": 9.516063440422936, + "grad_norm": 0.030636618242650206, + "learning_rate": 3.5364481239279635e-06, + "loss": 0.0005, + "step": 23400 + }, + { + "epoch": 9.516470109800732, + "grad_norm": 0.00043904277912729225, + "learning_rate": 3.535675057346667e-06, + "loss": 0.0, + "step": 23401 + }, + { + "epoch": 9.516876779178528, + "grad_norm": 0.3098708584966777, + "learning_rate": 3.534902057124916e-06, + "loss": 0.0035, + "step": 23402 + }, + { + "epoch": 9.517283448556324, + "grad_norm": 0.002016313495412981, + 
"learning_rate": 3.534129123270652e-06, + "loss": 0.0, + "step": 23403 + }, + { + "epoch": 9.51769011793412, + "grad_norm": 0.5653184475990815, + "learning_rate": 3.5333562557918066e-06, + "loss": 0.0081, + "step": 23404 + }, + { + "epoch": 9.518096787311915, + "grad_norm": 0.0023240200247715286, + "learning_rate": 3.5325834546963146e-06, + "loss": 0.0, + "step": 23405 + }, + { + "epoch": 9.51850345668971, + "grad_norm": 0.022290362644775363, + "learning_rate": 3.531810719992107e-06, + "loss": 0.0003, + "step": 23406 + }, + { + "epoch": 9.518910126067507, + "grad_norm": 7.722272246045967, + "learning_rate": 3.531038051687119e-06, + "loss": 0.0794, + "step": 23407 + }, + { + "epoch": 9.519316795445302, + "grad_norm": 0.09687954753510579, + "learning_rate": 3.5302654497892808e-06, + "loss": 0.0007, + "step": 23408 + }, + { + "epoch": 9.519723464823098, + "grad_norm": 0.0044685298550825765, + "learning_rate": 3.5294929143065202e-06, + "loss": 0.0001, + "step": 23409 + }, + { + "epoch": 9.520130134200894, + "grad_norm": 0.1474110003108156, + "learning_rate": 3.528720445246776e-06, + "loss": 0.0016, + "step": 23410 + }, + { + "epoch": 9.52053680357869, + "grad_norm": 0.007817238693728079, + "learning_rate": 3.5279480426179725e-06, + "loss": 0.0001, + "step": 23411 + }, + { + "epoch": 9.520943472956485, + "grad_norm": 0.14057601618961055, + "learning_rate": 3.527175706428039e-06, + "loss": 0.0013, + "step": 23412 + }, + { + "epoch": 9.521350142334283, + "grad_norm": 0.003428351411117545, + "learning_rate": 3.5264034366849053e-06, + "loss": 0.0, + "step": 23413 + }, + { + "epoch": 9.521756811712079, + "grad_norm": 0.08292460116636062, + "learning_rate": 3.525631233396498e-06, + "loss": 0.0004, + "step": 23414 + }, + { + "epoch": 9.522163481089875, + "grad_norm": 0.006650184444830273, + "learning_rate": 3.524859096570742e-06, + "loss": 0.0001, + "step": 23415 + }, + { + "epoch": 9.52257015046767, + "grad_norm": 0.004782966517531209, + "learning_rate": 3.524087026215569e-06, + "loss": 0.0001, + "step": 23416 + }, + { + "epoch": 9.522976819845466, + "grad_norm": 0.049314200589073154, + "learning_rate": 3.5233150223389024e-06, + "loss": 0.0006, + "step": 23417 + }, + { + "epoch": 9.523383489223262, + "grad_norm": 0.07226078331964715, + "learning_rate": 3.5225430849486665e-06, + "loss": 0.0007, + "step": 23418 + }, + { + "epoch": 9.523790158601058, + "grad_norm": 0.006999651521165954, + "learning_rate": 3.521771214052785e-06, + "loss": 0.0001, + "step": 23419 + }, + { + "epoch": 9.524196827978853, + "grad_norm": 0.021995730781874855, + "learning_rate": 3.5209994096591836e-06, + "loss": 0.0003, + "step": 23420 + }, + { + "epoch": 9.52460349735665, + "grad_norm": 0.10612918180125898, + "learning_rate": 3.5202276717757834e-06, + "loss": 0.001, + "step": 23421 + }, + { + "epoch": 9.525010166734445, + "grad_norm": 0.2725211319691505, + "learning_rate": 3.5194560004105048e-06, + "loss": 0.0009, + "step": 23422 + }, + { + "epoch": 9.52541683611224, + "grad_norm": 0.19208797403001002, + "learning_rate": 3.5186843955712747e-06, + "loss": 0.0019, + "step": 23423 + }, + { + "epoch": 9.525823505490036, + "grad_norm": 0.6564256815310971, + "learning_rate": 3.5179128572660113e-06, + "loss": 0.0063, + "step": 23424 + }, + { + "epoch": 9.526230174867832, + "grad_norm": 0.014154876013322602, + "learning_rate": 3.5171413855026347e-06, + "loss": 0.0001, + "step": 23425 + }, + { + "epoch": 9.526636844245628, + "grad_norm": 0.28992049382002477, + "learning_rate": 3.516369980289065e-06, + "loss": 0.0033, + "step": 23426 + 
}, + { + "epoch": 9.527043513623424, + "grad_norm": 0.918093202783379, + "learning_rate": 3.51559864163322e-06, + "loss": 0.0045, + "step": 23427 + }, + { + "epoch": 9.52745018300122, + "grad_norm": 0.129510859425127, + "learning_rate": 3.5148273695430145e-06, + "loss": 0.0014, + "step": 23428 + }, + { + "epoch": 9.527856852379015, + "grad_norm": 0.050182362264411265, + "learning_rate": 3.5140561640263747e-06, + "loss": 0.0005, + "step": 23429 + }, + { + "epoch": 9.528263521756811, + "grad_norm": 2.609937034203586, + "learning_rate": 3.513285025091212e-06, + "loss": 0.0191, + "step": 23430 + }, + { + "epoch": 9.528670191134607, + "grad_norm": 0.01735209849025468, + "learning_rate": 3.5125139527454432e-06, + "loss": 0.0002, + "step": 23431 + }, + { + "epoch": 9.529076860512404, + "grad_norm": 0.008458396254579248, + "learning_rate": 3.5117429469969844e-06, + "loss": 0.0001, + "step": 23432 + }, + { + "epoch": 9.5294835298902, + "grad_norm": 0.09384721023671785, + "learning_rate": 3.510972007853749e-06, + "loss": 0.0009, + "step": 23433 + }, + { + "epoch": 9.529890199267996, + "grad_norm": 0.1012880036487677, + "learning_rate": 3.510201135323652e-06, + "loss": 0.0009, + "step": 23434 + }, + { + "epoch": 9.530296868645792, + "grad_norm": 0.10095202090591245, + "learning_rate": 3.5094303294146036e-06, + "loss": 0.0009, + "step": 23435 + }, + { + "epoch": 9.530703538023587, + "grad_norm": 0.0024803946560537846, + "learning_rate": 3.508659590134522e-06, + "loss": 0.0, + "step": 23436 + }, + { + "epoch": 9.531110207401383, + "grad_norm": 0.03807537731255031, + "learning_rate": 3.507888917491318e-06, + "loss": 0.0003, + "step": 23437 + }, + { + "epoch": 9.531516876779179, + "grad_norm": 0.013751209488155065, + "learning_rate": 3.5071183114929006e-06, + "loss": 0.0002, + "step": 23438 + }, + { + "epoch": 9.531923546156975, + "grad_norm": 7.582060879580912e-05, + "learning_rate": 3.5063477721471816e-06, + "loss": 0.0, + "step": 23439 + }, + { + "epoch": 9.53233021553477, + "grad_norm": 0.01358693116761156, + "learning_rate": 3.505577299462072e-06, + "loss": 0.0001, + "step": 23440 + }, + { + "epoch": 9.532736884912566, + "grad_norm": 13.147924482340942, + "learning_rate": 3.5048068934454748e-06, + "loss": 0.5486, + "step": 23441 + }, + { + "epoch": 9.533143554290362, + "grad_norm": 0.022245570743726763, + "learning_rate": 3.504036554105309e-06, + "loss": 0.0001, + "step": 23442 + }, + { + "epoch": 9.533550223668158, + "grad_norm": 0.0032657076038699155, + "learning_rate": 3.50326628144948e-06, + "loss": 0.0, + "step": 23443 + }, + { + "epoch": 9.533956893045954, + "grad_norm": 0.008991769602002578, + "learning_rate": 3.5024960754858884e-06, + "loss": 0.0001, + "step": 23444 + }, + { + "epoch": 9.53436356242375, + "grad_norm": 0.2176716268093993, + "learning_rate": 3.501725936222445e-06, + "loss": 0.0016, + "step": 23445 + }, + { + "epoch": 9.534770231801545, + "grad_norm": 0.1189697187287639, + "learning_rate": 3.5009558636670525e-06, + "loss": 0.0012, + "step": 23446 + }, + { + "epoch": 9.53517690117934, + "grad_norm": 0.495346053207816, + "learning_rate": 3.5001858578276217e-06, + "loss": 0.0048, + "step": 23447 + }, + { + "epoch": 9.535583570557137, + "grad_norm": 0.025051154207580133, + "learning_rate": 3.4994159187120547e-06, + "loss": 0.0003, + "step": 23448 + }, + { + "epoch": 9.535990239934932, + "grad_norm": 0.06763318565193428, + "learning_rate": 3.4986460463282547e-06, + "loss": 0.0006, + "step": 23449 + }, + { + "epoch": 9.536396909312728, + "grad_norm": 0.0020633310817358957, + 
"learning_rate": 3.497876240684126e-06, + "loss": 0.0, + "step": 23450 + }, + { + "epoch": 9.536803578690524, + "grad_norm": 0.011780921587614616, + "learning_rate": 3.4971065017875692e-06, + "loss": 0.0001, + "step": 23451 + }, + { + "epoch": 9.53721024806832, + "grad_norm": 0.01731236489522257, + "learning_rate": 3.4963368296464873e-06, + "loss": 0.0002, + "step": 23452 + }, + { + "epoch": 9.537616917446115, + "grad_norm": 0.031437771414740794, + "learning_rate": 3.4955672242687777e-06, + "loss": 0.0002, + "step": 23453 + }, + { + "epoch": 9.538023586823913, + "grad_norm": 2.569331214433575, + "learning_rate": 3.494797685662348e-06, + "loss": 0.0404, + "step": 23454 + }, + { + "epoch": 9.538430256201709, + "grad_norm": 0.5130887629730355, + "learning_rate": 3.4940282138350933e-06, + "loss": 0.0058, + "step": 23455 + }, + { + "epoch": 9.538836925579504, + "grad_norm": 0.06925156441515497, + "learning_rate": 3.493258808794914e-06, + "loss": 0.0008, + "step": 23456 + }, + { + "epoch": 9.5392435949573, + "grad_norm": 0.016888787770622277, + "learning_rate": 3.492489470549707e-06, + "loss": 0.0002, + "step": 23457 + }, + { + "epoch": 9.539650264335096, + "grad_norm": 4.305123402124481, + "learning_rate": 3.4917201991073712e-06, + "loss": 0.0871, + "step": 23458 + }, + { + "epoch": 9.540056933712892, + "grad_norm": 0.027904400613479863, + "learning_rate": 3.490950994475799e-06, + "loss": 0.0003, + "step": 23459 + }, + { + "epoch": 9.540463603090688, + "grad_norm": 0.004001369440232656, + "learning_rate": 3.4901818566628955e-06, + "loss": 0.0, + "step": 23460 + }, + { + "epoch": 9.540870272468483, + "grad_norm": 11.835935412617614, + "learning_rate": 3.4894127856765503e-06, + "loss": 0.4101, + "step": 23461 + }, + { + "epoch": 9.541276941846279, + "grad_norm": 12.83969400688198, + "learning_rate": 3.488643781524661e-06, + "loss": 0.6261, + "step": 23462 + }, + { + "epoch": 9.541683611224075, + "grad_norm": 0.007937423361632084, + "learning_rate": 3.4878748442151187e-06, + "loss": 0.0001, + "step": 23463 + }, + { + "epoch": 9.54209028060187, + "grad_norm": 0.11614426328798833, + "learning_rate": 3.4871059737558198e-06, + "loss": 0.0009, + "step": 23464 + }, + { + "epoch": 9.542496949979666, + "grad_norm": 0.005532626224208518, + "learning_rate": 3.4863371701546554e-06, + "loss": 0.0001, + "step": 23465 + }, + { + "epoch": 9.542903619357462, + "grad_norm": 0.004375989151007475, + "learning_rate": 3.485568433419515e-06, + "loss": 0.0, + "step": 23466 + }, + { + "epoch": 9.543310288735258, + "grad_norm": 0.6276081040027714, + "learning_rate": 3.4847997635582974e-06, + "loss": 0.0062, + "step": 23467 + }, + { + "epoch": 9.543716958113054, + "grad_norm": 0.01625829725985914, + "learning_rate": 3.4840311605788877e-06, + "loss": 0.0002, + "step": 23468 + }, + { + "epoch": 9.54412362749085, + "grad_norm": 1.1370013840915714, + "learning_rate": 3.4832626244891786e-06, + "loss": 0.0126, + "step": 23469 + }, + { + "epoch": 9.544530296868645, + "grad_norm": 0.00012594822444572426, + "learning_rate": 3.4824941552970593e-06, + "loss": 0.0, + "step": 23470 + }, + { + "epoch": 9.544936966246441, + "grad_norm": 0.26884134425836487, + "learning_rate": 3.4817257530104166e-06, + "loss": 0.0014, + "step": 23471 + }, + { + "epoch": 9.545343635624237, + "grad_norm": 0.19056571116516902, + "learning_rate": 3.4809574176371363e-06, + "loss": 0.0022, + "step": 23472 + }, + { + "epoch": 9.545750305002034, + "grad_norm": 3.622950212520711, + "learning_rate": 3.4801891491851123e-06, + "loss": 0.0368, + "step": 23473 + }, + { 
+ "epoch": 9.54615697437983, + "grad_norm": 0.06758740134292568, + "learning_rate": 3.4794209476622277e-06, + "loss": 0.0007, + "step": 23474 + }, + { + "epoch": 9.546563643757626, + "grad_norm": 3.4061176294561286, + "learning_rate": 3.47865281307637e-06, + "loss": 0.0363, + "step": 23475 + }, + { + "epoch": 9.546970313135422, + "grad_norm": 4.868460459663292, + "learning_rate": 3.4778847454354226e-06, + "loss": 0.0402, + "step": 23476 + }, + { + "epoch": 9.547376982513217, + "grad_norm": 3.723812466851282, + "learning_rate": 3.477116744747271e-06, + "loss": 0.0431, + "step": 23477 + }, + { + "epoch": 9.547783651891013, + "grad_norm": 0.014114196128148458, + "learning_rate": 3.4763488110197983e-06, + "loss": 0.0001, + "step": 23478 + }, + { + "epoch": 9.548190321268809, + "grad_norm": 0.10292142748531699, + "learning_rate": 3.475580944260886e-06, + "loss": 0.0012, + "step": 23479 + }, + { + "epoch": 9.548596990646605, + "grad_norm": 0.019812041465589597, + "learning_rate": 3.4748131444784207e-06, + "loss": 0.0003, + "step": 23480 + }, + { + "epoch": 9.5490036600244, + "grad_norm": 0.00025983869181560657, + "learning_rate": 3.4740454116802837e-06, + "loss": 0.0, + "step": 23481 + }, + { + "epoch": 9.549410329402196, + "grad_norm": 0.06770404202261274, + "learning_rate": 3.4732777458743547e-06, + "loss": 0.0005, + "step": 23482 + }, + { + "epoch": 9.549816998779992, + "grad_norm": 1.7944699510796653, + "learning_rate": 3.472510147068515e-06, + "loss": 0.0199, + "step": 23483 + }, + { + "epoch": 9.550223668157788, + "grad_norm": 0.0044424590931887034, + "learning_rate": 3.4717426152706434e-06, + "loss": 0.0001, + "step": 23484 + }, + { + "epoch": 9.550630337535583, + "grad_norm": 0.1190323327415817, + "learning_rate": 3.47097515048862e-06, + "loss": 0.0012, + "step": 23485 + }, + { + "epoch": 9.55103700691338, + "grad_norm": 0.004559684655477117, + "learning_rate": 3.470207752730319e-06, + "loss": 0.0, + "step": 23486 + }, + { + "epoch": 9.551443676291175, + "grad_norm": 0.07803035551937733, + "learning_rate": 3.469440422003625e-06, + "loss": 0.0008, + "step": 23487 + }, + { + "epoch": 9.55185034566897, + "grad_norm": 15.808697148506406, + "learning_rate": 3.4686731583164115e-06, + "loss": 0.0392, + "step": 23488 + }, + { + "epoch": 9.552257015046766, + "grad_norm": 0.07915849962601318, + "learning_rate": 3.4679059616765555e-06, + "loss": 0.0012, + "step": 23489 + }, + { + "epoch": 9.552663684424562, + "grad_norm": 0.008879063133281911, + "learning_rate": 3.467138832091932e-06, + "loss": 0.0001, + "step": 23490 + }, + { + "epoch": 9.553070353802358, + "grad_norm": 0.12155946598151944, + "learning_rate": 3.466371769570417e-06, + "loss": 0.0019, + "step": 23491 + }, + { + "epoch": 9.553477023180154, + "grad_norm": 0.03988391557350295, + "learning_rate": 3.4656047741198806e-06, + "loss": 0.0003, + "step": 23492 + }, + { + "epoch": 9.55388369255795, + "grad_norm": 4.335818129517078e-05, + "learning_rate": 3.4648378457482035e-06, + "loss": 0.0, + "step": 23493 + }, + { + "epoch": 9.554290361935745, + "grad_norm": 1.0275544409140618, + "learning_rate": 3.4640709844632535e-06, + "loss": 0.011, + "step": 23494 + }, + { + "epoch": 9.554697031313543, + "grad_norm": 0.01996692557691928, + "learning_rate": 3.463304190272906e-06, + "loss": 0.0002, + "step": 23495 + }, + { + "epoch": 9.555103700691339, + "grad_norm": 0.050393299659610404, + "learning_rate": 3.4625374631850305e-06, + "loss": 0.0006, + "step": 23496 + }, + { + "epoch": 9.555510370069134, + "grad_norm": 0.15352427259059603, + 
"learning_rate": 3.461770803207497e-06, + "loss": 0.0015, + "step": 23497 + }, + { + "epoch": 9.55591703944693, + "grad_norm": 0.005977288309618652, + "learning_rate": 3.4610042103481778e-06, + "loss": 0.0001, + "step": 23498 + }, + { + "epoch": 9.556323708824726, + "grad_norm": 14.268848063449013, + "learning_rate": 3.460237684614939e-06, + "loss": 0.4191, + "step": 23499 + }, + { + "epoch": 9.556730378202522, + "grad_norm": 0.04073275637292632, + "learning_rate": 3.4594712260156537e-06, + "loss": 0.0003, + "step": 23500 + }, + { + "epoch": 9.557137047580317, + "grad_norm": 0.0442128460526557, + "learning_rate": 3.4587048345581888e-06, + "loss": 0.0006, + "step": 23501 + }, + { + "epoch": 9.557543716958113, + "grad_norm": 12.918055659155483, + "learning_rate": 3.4579385102504093e-06, + "loss": 0.1391, + "step": 23502 + }, + { + "epoch": 9.557950386335909, + "grad_norm": 0.042161353293379264, + "learning_rate": 3.457172253100185e-06, + "loss": 0.0004, + "step": 23503 + }, + { + "epoch": 9.558357055713705, + "grad_norm": 0.09697442501408286, + "learning_rate": 3.4564060631153804e-06, + "loss": 0.0012, + "step": 23504 + }, + { + "epoch": 9.5587637250915, + "grad_norm": 0.02761389900914899, + "learning_rate": 3.4556399403038574e-06, + "loss": 0.0003, + "step": 23505 + }, + { + "epoch": 9.559170394469296, + "grad_norm": 0.10688555567531585, + "learning_rate": 3.4548738846734876e-06, + "loss": 0.0013, + "step": 23506 + }, + { + "epoch": 9.559577063847092, + "grad_norm": 0.46217169473696806, + "learning_rate": 3.454107896232132e-06, + "loss": 0.0045, + "step": 23507 + }, + { + "epoch": 9.559983733224888, + "grad_norm": 0.6023103010783284, + "learning_rate": 3.453341974987652e-06, + "loss": 0.004, + "step": 23508 + }, + { + "epoch": 9.560390402602684, + "grad_norm": 0.0016749595063268378, + "learning_rate": 3.4525761209479124e-06, + "loss": 0.0, + "step": 23509 + }, + { + "epoch": 9.56079707198048, + "grad_norm": 0.009246911129311858, + "learning_rate": 3.451810334120773e-06, + "loss": 0.0001, + "step": 23510 + }, + { + "epoch": 9.561203741358275, + "grad_norm": 0.021320157162841576, + "learning_rate": 3.4510446145140972e-06, + "loss": 0.0002, + "step": 23511 + }, + { + "epoch": 9.56161041073607, + "grad_norm": 0.3171513790874972, + "learning_rate": 3.450278962135741e-06, + "loss": 0.0031, + "step": 23512 + }, + { + "epoch": 9.562017080113867, + "grad_norm": 2.521885810014064, + "learning_rate": 3.4495133769935705e-06, + "loss": 0.0285, + "step": 23513 + }, + { + "epoch": 9.562423749491664, + "grad_norm": 0.1988365831000451, + "learning_rate": 3.4487478590954426e-06, + "loss": 0.0023, + "step": 23514 + }, + { + "epoch": 9.56283041886946, + "grad_norm": 0.09933463362615776, + "learning_rate": 3.4479824084492154e-06, + "loss": 0.0008, + "step": 23515 + }, + { + "epoch": 9.563237088247256, + "grad_norm": 0.028891137207939836, + "learning_rate": 3.4472170250627457e-06, + "loss": 0.0003, + "step": 23516 + }, + { + "epoch": 9.563643757625051, + "grad_norm": 0.6066465246048944, + "learning_rate": 3.446451708943892e-06, + "loss": 0.0034, + "step": 23517 + }, + { + "epoch": 9.564050427002847, + "grad_norm": 0.0376981737960521, + "learning_rate": 3.445686460100506e-06, + "loss": 0.0002, + "step": 23518 + }, + { + "epoch": 9.564457096380643, + "grad_norm": 0.0010285297397917691, + "learning_rate": 3.4449212785404495e-06, + "loss": 0.0, + "step": 23519 + }, + { + "epoch": 9.564863765758439, + "grad_norm": 0.02672337468589624, + "learning_rate": 3.444156164271576e-06, + "loss": 0.0004, + "step": 23520 + }, + 
{ + "epoch": 9.565270435136235, + "grad_norm": 0.5606698410620138, + "learning_rate": 3.4433911173017396e-06, + "loss": 0.0053, + "step": 23521 + }, + { + "epoch": 9.56567710451403, + "grad_norm": 0.49171826845754185, + "learning_rate": 3.4426261376387926e-06, + "loss": 0.0082, + "step": 23522 + }, + { + "epoch": 9.566083773891826, + "grad_norm": 0.0005611162254513477, + "learning_rate": 3.4418612252905894e-06, + "loss": 0.0, + "step": 23523 + }, + { + "epoch": 9.566490443269622, + "grad_norm": 0.10089914969468546, + "learning_rate": 3.44109638026498e-06, + "loss": 0.0012, + "step": 23524 + }, + { + "epoch": 9.566897112647418, + "grad_norm": 0.05176317385626713, + "learning_rate": 3.4403316025698143e-06, + "loss": 0.0005, + "step": 23525 + }, + { + "epoch": 9.567303782025213, + "grad_norm": 0.1432628317344324, + "learning_rate": 3.43956689221295e-06, + "loss": 0.0015, + "step": 23526 + }, + { + "epoch": 9.56771045140301, + "grad_norm": 0.05174604131322449, + "learning_rate": 3.4388022492022334e-06, + "loss": 0.0005, + "step": 23527 + }, + { + "epoch": 9.568117120780805, + "grad_norm": 0.03151363860216454, + "learning_rate": 3.4380376735455135e-06, + "loss": 0.0004, + "step": 23528 + }, + { + "epoch": 9.5685237901586, + "grad_norm": 0.011347690454286808, + "learning_rate": 3.4372731652506384e-06, + "loss": 0.0002, + "step": 23529 + }, + { + "epoch": 9.568930459536396, + "grad_norm": 24.323922925687565, + "learning_rate": 3.436508724325459e-06, + "loss": 0.132, + "step": 23530 + }, + { + "epoch": 9.569337128914192, + "grad_norm": 0.043959761022550455, + "learning_rate": 3.4357443507778165e-06, + "loss": 0.0004, + "step": 23531 + }, + { + "epoch": 9.569743798291988, + "grad_norm": 0.18430653537157757, + "learning_rate": 3.4349800446155657e-06, + "loss": 0.0025, + "step": 23532 + }, + { + "epoch": 9.570150467669784, + "grad_norm": 0.0001808907717569154, + "learning_rate": 3.434215805846549e-06, + "loss": 0.0, + "step": 23533 + }, + { + "epoch": 9.57055713704758, + "grad_norm": 0.07056189233801287, + "learning_rate": 3.4334516344786117e-06, + "loss": 0.0007, + "step": 23534 + }, + { + "epoch": 9.570963806425375, + "grad_norm": 0.018326163037198848, + "learning_rate": 3.432687530519597e-06, + "loss": 0.0002, + "step": 23535 + }, + { + "epoch": 9.571370475803173, + "grad_norm": 0.03396644218140325, + "learning_rate": 3.4319234939773516e-06, + "loss": 0.0004, + "step": 23536 + }, + { + "epoch": 9.571777145180969, + "grad_norm": 2.8854564435315644, + "learning_rate": 3.4311595248597164e-06, + "loss": 0.0373, + "step": 23537 + }, + { + "epoch": 9.572183814558764, + "grad_norm": 0.06043931746117997, + "learning_rate": 3.4303956231745317e-06, + "loss": 0.0006, + "step": 23538 + }, + { + "epoch": 9.57259048393656, + "grad_norm": 0.6821195421552334, + "learning_rate": 3.4296317889296448e-06, + "loss": 0.0077, + "step": 23539 + }, + { + "epoch": 9.572997153314356, + "grad_norm": 0.20451262974846976, + "learning_rate": 3.428868022132895e-06, + "loss": 0.0022, + "step": 23540 + }, + { + "epoch": 9.573403822692152, + "grad_norm": 0.33153973590646896, + "learning_rate": 3.428104322792123e-06, + "loss": 0.0037, + "step": 23541 + }, + { + "epoch": 9.573810492069947, + "grad_norm": 0.5476576357153643, + "learning_rate": 3.4273406909151664e-06, + "loss": 0.0065, + "step": 23542 + }, + { + "epoch": 9.574217161447743, + "grad_norm": 0.0157243713082582, + "learning_rate": 3.426577126509867e-06, + "loss": 0.0002, + "step": 23543 + }, + { + "epoch": 9.574623830825539, + "grad_norm": 3.6837202929118096, + 
"learning_rate": 3.4258136295840606e-06, + "loss": 0.0572, + "step": 23544 + }, + { + "epoch": 9.575030500203335, + "grad_norm": 0.130266171603997, + "learning_rate": 3.4250502001455853e-06, + "loss": 0.001, + "step": 23545 + }, + { + "epoch": 9.57543716958113, + "grad_norm": 0.07162285874663404, + "learning_rate": 3.42428683820228e-06, + "loss": 0.0007, + "step": 23546 + }, + { + "epoch": 9.575843838958926, + "grad_norm": 0.4717361445592854, + "learning_rate": 3.4235235437619797e-06, + "loss": 0.005, + "step": 23547 + }, + { + "epoch": 9.576250508336722, + "grad_norm": 0.21894341288731045, + "learning_rate": 3.4227603168325196e-06, + "loss": 0.0026, + "step": 23548 + }, + { + "epoch": 9.576657177714518, + "grad_norm": 0.029195505532952814, + "learning_rate": 3.4219971574217324e-06, + "loss": 0.0005, + "step": 23549 + }, + { + "epoch": 9.577063847092314, + "grad_norm": 0.048664460853051573, + "learning_rate": 3.421234065537458e-06, + "loss": 0.0004, + "step": 23550 + }, + { + "epoch": 9.57747051647011, + "grad_norm": 0.384904879233439, + "learning_rate": 3.4204710411875263e-06, + "loss": 0.0032, + "step": 23551 + }, + { + "epoch": 9.577877185847905, + "grad_norm": 0.0026855945364815496, + "learning_rate": 3.4197080843797716e-06, + "loss": 0.0, + "step": 23552 + }, + { + "epoch": 9.5782838552257, + "grad_norm": 0.16625863819297584, + "learning_rate": 3.4189451951220254e-06, + "loss": 0.0023, + "step": 23553 + }, + { + "epoch": 9.578690524603497, + "grad_norm": 0.12969217569910305, + "learning_rate": 3.4181823734221185e-06, + "loss": 0.0017, + "step": 23554 + }, + { + "epoch": 9.579097193981294, + "grad_norm": 0.05001208637916284, + "learning_rate": 3.4174196192878817e-06, + "loss": 0.0005, + "step": 23555 + }, + { + "epoch": 9.57950386335909, + "grad_norm": 0.02888119260958345, + "learning_rate": 3.4166569327271425e-06, + "loss": 0.0002, + "step": 23556 + }, + { + "epoch": 9.579910532736886, + "grad_norm": 3.9019294788915997, + "learning_rate": 3.415894313747737e-06, + "loss": 0.0567, + "step": 23557 + }, + { + "epoch": 9.580317202114681, + "grad_norm": 0.11850606109707232, + "learning_rate": 3.4151317623574898e-06, + "loss": 0.001, + "step": 23558 + }, + { + "epoch": 9.580723871492477, + "grad_norm": 0.010358885781405229, + "learning_rate": 3.414369278564229e-06, + "loss": 0.0001, + "step": 23559 + }, + { + "epoch": 9.581130540870273, + "grad_norm": 0.0122818316635432, + "learning_rate": 3.4136068623757823e-06, + "loss": 0.0001, + "step": 23560 + }, + { + "epoch": 9.581537210248069, + "grad_norm": 0.0610507382083044, + "learning_rate": 3.4128445137999754e-06, + "loss": 0.0006, + "step": 23561 + }, + { + "epoch": 9.581943879625864, + "grad_norm": 10.574418953126633, + "learning_rate": 3.4120822328446323e-06, + "loss": 0.262, + "step": 23562 + }, + { + "epoch": 9.58235054900366, + "grad_norm": 0.06960247529702188, + "learning_rate": 3.4113200195175834e-06, + "loss": 0.0005, + "step": 23563 + }, + { + "epoch": 9.582757218381456, + "grad_norm": 0.7098165112908602, + "learning_rate": 3.4105578738266508e-06, + "loss": 0.0046, + "step": 23564 + }, + { + "epoch": 9.583163887759252, + "grad_norm": 0.1478578573224215, + "learning_rate": 3.409795795779658e-06, + "loss": 0.002, + "step": 23565 + }, + { + "epoch": 9.583570557137048, + "grad_norm": 0.07709524966800868, + "learning_rate": 3.409033785384428e-06, + "loss": 0.0008, + "step": 23566 + }, + { + "epoch": 9.583977226514843, + "grad_norm": 0.06618051493825101, + "learning_rate": 3.408271842648784e-06, + "loss": 0.001, + "step": 23567 + }, + { + 
"epoch": 9.584383895892639, + "grad_norm": 0.03444776956366095, + "learning_rate": 3.4075099675805456e-06, + "loss": 0.0004, + "step": 23568 + }, + { + "epoch": 9.584790565270435, + "grad_norm": 0.027818534626825476, + "learning_rate": 3.406748160187533e-06, + "loss": 0.0002, + "step": 23569 + }, + { + "epoch": 9.58519723464823, + "grad_norm": 0.00544180834196115, + "learning_rate": 3.405986420477573e-06, + "loss": 0.0, + "step": 23570 + }, + { + "epoch": 9.585603904026026, + "grad_norm": 0.01781994212671726, + "learning_rate": 3.40522474845848e-06, + "loss": 0.0002, + "step": 23571 + }, + { + "epoch": 9.586010573403822, + "grad_norm": 2.130736726418217, + "learning_rate": 3.404463144138074e-06, + "loss": 0.0169, + "step": 23572 + }, + { + "epoch": 9.586417242781618, + "grad_norm": 4.076071763328643, + "learning_rate": 3.4037016075241738e-06, + "loss": 0.0783, + "step": 23573 + }, + { + "epoch": 9.586823912159414, + "grad_norm": 0.021468056148381563, + "learning_rate": 3.402940138624595e-06, + "loss": 0.0003, + "step": 23574 + }, + { + "epoch": 9.58723058153721, + "grad_norm": 0.04526425480171859, + "learning_rate": 3.4021787374471547e-06, + "loss": 0.0006, + "step": 23575 + }, + { + "epoch": 9.587637250915005, + "grad_norm": 2.997367520976089, + "learning_rate": 3.401417403999673e-06, + "loss": 0.0707, + "step": 23576 + }, + { + "epoch": 9.588043920292803, + "grad_norm": 0.11649874745869924, + "learning_rate": 3.4006561382899616e-06, + "loss": 0.0014, + "step": 23577 + }, + { + "epoch": 9.588450589670598, + "grad_norm": 1.8289100069926225, + "learning_rate": 3.3998949403258376e-06, + "loss": 0.0193, + "step": 23578 + }, + { + "epoch": 9.588857259048394, + "grad_norm": 0.004789243017747345, + "learning_rate": 3.399133810115113e-06, + "loss": 0.0001, + "step": 23579 + }, + { + "epoch": 9.58926392842619, + "grad_norm": 0.12217853996611212, + "learning_rate": 3.3983727476656026e-06, + "loss": 0.001, + "step": 23580 + }, + { + "epoch": 9.589670597803986, + "grad_norm": 0.009754490960847764, + "learning_rate": 3.3976117529851173e-06, + "loss": 0.0001, + "step": 23581 + }, + { + "epoch": 9.590077267181782, + "grad_norm": 0.03197063517422836, + "learning_rate": 3.3968508260814682e-06, + "loss": 0.0003, + "step": 23582 + }, + { + "epoch": 9.590483936559577, + "grad_norm": 0.0027796729579351296, + "learning_rate": 3.3960899669624715e-06, + "loss": 0.0, + "step": 23583 + }, + { + "epoch": 9.590890605937373, + "grad_norm": 0.03566654842060314, + "learning_rate": 3.3953291756359354e-06, + "loss": 0.0004, + "step": 23584 + }, + { + "epoch": 9.591297275315169, + "grad_norm": 0.002108231486193858, + "learning_rate": 3.394568452109668e-06, + "loss": 0.0, + "step": 23585 + }, + { + "epoch": 9.591703944692965, + "grad_norm": 0.030536311130578024, + "learning_rate": 3.3938077963914808e-06, + "loss": 0.0004, + "step": 23586 + }, + { + "epoch": 9.59211061407076, + "grad_norm": 0.07446368374118777, + "learning_rate": 3.393047208489182e-06, + "loss": 0.0006, + "step": 23587 + }, + { + "epoch": 9.592517283448556, + "grad_norm": 0.0005727998323468363, + "learning_rate": 3.3922866884105743e-06, + "loss": 0.0, + "step": 23588 + }, + { + "epoch": 9.592923952826352, + "grad_norm": 0.08087224071612313, + "learning_rate": 3.391526236163474e-06, + "loss": 0.0007, + "step": 23589 + }, + { + "epoch": 9.593330622204148, + "grad_norm": 0.005229255134131147, + "learning_rate": 3.390765851755682e-06, + "loss": 0.0, + "step": 23590 + }, + { + "epoch": 9.593737291581943, + "grad_norm": 0.10509456138804098, + "learning_rate": 
3.3900055351950044e-06, + "loss": 0.0014, + "step": 23591 + }, + { + "epoch": 9.59414396095974, + "grad_norm": 0.02218865721196394, + "learning_rate": 3.3892452864892478e-06, + "loss": 0.0003, + "step": 23592 + }, + { + "epoch": 9.594550630337535, + "grad_norm": 0.006797125143716319, + "learning_rate": 3.3884851056462153e-06, + "loss": 0.0001, + "step": 23593 + }, + { + "epoch": 9.59495729971533, + "grad_norm": 0.21813738885273856, + "learning_rate": 3.38772499267371e-06, + "loss": 0.0016, + "step": 23594 + }, + { + "epoch": 9.595363969093126, + "grad_norm": 0.017958012492525842, + "learning_rate": 3.386964947579533e-06, + "loss": 0.0001, + "step": 23595 + }, + { + "epoch": 9.595770638470924, + "grad_norm": 0.2171593815392156, + "learning_rate": 3.3862049703714916e-06, + "loss": 0.0018, + "step": 23596 + }, + { + "epoch": 9.59617730784872, + "grad_norm": 0.05596592619705149, + "learning_rate": 3.3854450610573853e-06, + "loss": 0.0005, + "step": 23597 + }, + { + "epoch": 9.596583977226516, + "grad_norm": 0.007637916508627494, + "learning_rate": 3.384685219645014e-06, + "loss": 0.0001, + "step": 23598 + }, + { + "epoch": 9.596990646604311, + "grad_norm": 0.1946831955362303, + "learning_rate": 3.38392544614218e-06, + "loss": 0.002, + "step": 23599 + }, + { + "epoch": 9.597397315982107, + "grad_norm": 0.005843727762348748, + "learning_rate": 3.3831657405566797e-06, + "loss": 0.0, + "step": 23600 + }, + { + "epoch": 9.597803985359903, + "grad_norm": 0.008420764613090928, + "learning_rate": 3.382406102896313e-06, + "loss": 0.0001, + "step": 23601 + }, + { + "epoch": 9.598210654737699, + "grad_norm": 0.6702131042891447, + "learning_rate": 3.381646533168875e-06, + "loss": 0.0052, + "step": 23602 + }, + { + "epoch": 9.598617324115494, + "grad_norm": 0.024553207083838006, + "learning_rate": 3.380887031382171e-06, + "loss": 0.0003, + "step": 23603 + }, + { + "epoch": 9.59902399349329, + "grad_norm": 0.5096549155916336, + "learning_rate": 3.3801275975439918e-06, + "loss": 0.0068, + "step": 23604 + }, + { + "epoch": 9.599430662871086, + "grad_norm": 0.5353815346231494, + "learning_rate": 3.3793682316621344e-06, + "loss": 0.0042, + "step": 23605 + }, + { + "epoch": 9.599837332248882, + "grad_norm": 0.0040864663945293705, + "learning_rate": 3.378608933744394e-06, + "loss": 0.0, + "step": 23606 + }, + { + "epoch": 9.600244001626677, + "grad_norm": 0.23124215282475816, + "learning_rate": 3.377849703798566e-06, + "loss": 0.0019, + "step": 23607 + }, + { + "epoch": 9.600650671004473, + "grad_norm": 3.3819407739563334, + "learning_rate": 3.3770905418324395e-06, + "loss": 0.0509, + "step": 23608 + }, + { + "epoch": 9.601057340382269, + "grad_norm": 0.002089525487260527, + "learning_rate": 3.376331447853817e-06, + "loss": 0.0, + "step": 23609 + }, + { + "epoch": 9.601464009760065, + "grad_norm": 7.384383309493231, + "learning_rate": 3.3755724218704845e-06, + "loss": 0.104, + "step": 23610 + }, + { + "epoch": 9.60187067913786, + "grad_norm": 0.007374437943818099, + "learning_rate": 3.3748134638902343e-06, + "loss": 0.0001, + "step": 23611 + }, + { + "epoch": 9.602277348515656, + "grad_norm": 0.004949033208659529, + "learning_rate": 3.37405457392086e-06, + "loss": 0.0, + "step": 23612 + }, + { + "epoch": 9.602684017893452, + "grad_norm": 2.1368318073765806, + "learning_rate": 3.373295751970149e-06, + "loss": 0.0253, + "step": 23613 + }, + { + "epoch": 9.603090687271248, + "grad_norm": 0.022568738542369522, + "learning_rate": 3.3725369980458923e-06, + "loss": 0.0003, + "step": 23614 + }, + { + "epoch": 
9.603497356649044, + "grad_norm": 0.043629436734071506, + "learning_rate": 3.3717783121558767e-06, + "loss": 0.0004, + "step": 23615 + }, + { + "epoch": 9.60390402602684, + "grad_norm": 0.032762457379885604, + "learning_rate": 3.3710196943078944e-06, + "loss": 0.0003, + "step": 23616 + }, + { + "epoch": 9.604310695404635, + "grad_norm": 3.038879088598475, + "learning_rate": 3.3702611445097324e-06, + "loss": 0.0586, + "step": 23617 + }, + { + "epoch": 9.604717364782433, + "grad_norm": 0.01103792997423412, + "learning_rate": 3.3695026627691764e-06, + "loss": 0.0001, + "step": 23618 + }, + { + "epoch": 9.605124034160228, + "grad_norm": 0.019466981231996656, + "learning_rate": 3.3687442490940113e-06, + "loss": 0.0002, + "step": 23619 + }, + { + "epoch": 9.605530703538024, + "grad_norm": 0.05431021833472323, + "learning_rate": 3.367985903492025e-06, + "loss": 0.0003, + "step": 23620 + }, + { + "epoch": 9.60593737291582, + "grad_norm": 0.0015612786637511904, + "learning_rate": 3.3672276259709978e-06, + "loss": 0.0, + "step": 23621 + }, + { + "epoch": 9.606344042293616, + "grad_norm": 0.0034696547021087773, + "learning_rate": 3.3664694165387213e-06, + "loss": 0.0, + "step": 23622 + }, + { + "epoch": 9.606750711671411, + "grad_norm": 0.8476477329178463, + "learning_rate": 3.3657112752029742e-06, + "loss": 0.0095, + "step": 23623 + }, + { + "epoch": 9.607157381049207, + "grad_norm": 0.1490103104847341, + "learning_rate": 3.364953201971539e-06, + "loss": 0.0017, + "step": 23624 + }, + { + "epoch": 9.607564050427003, + "grad_norm": 0.029246773442662754, + "learning_rate": 3.3641951968521993e-06, + "loss": 0.0004, + "step": 23625 + }, + { + "epoch": 9.607970719804799, + "grad_norm": 0.2798740372329034, + "learning_rate": 3.363437259852735e-06, + "loss": 0.0019, + "step": 23626 + }, + { + "epoch": 9.608377389182595, + "grad_norm": 0.000579073883661894, + "learning_rate": 3.3626793909809285e-06, + "loss": 0.0, + "step": 23627 + }, + { + "epoch": 9.60878405856039, + "grad_norm": 0.0187598885433517, + "learning_rate": 3.3619215902445546e-06, + "loss": 0.0002, + "step": 23628 + }, + { + "epoch": 9.609190727938186, + "grad_norm": 0.43056431359059183, + "learning_rate": 3.3611638576513994e-06, + "loss": 0.0024, + "step": 23629 + }, + { + "epoch": 9.609597397315982, + "grad_norm": 0.013652481608035793, + "learning_rate": 3.360406193209239e-06, + "loss": 0.0001, + "step": 23630 + }, + { + "epoch": 9.610004066693778, + "grad_norm": 0.5805743775770962, + "learning_rate": 3.3596485969258507e-06, + "loss": 0.0033, + "step": 23631 + }, + { + "epoch": 9.610410736071573, + "grad_norm": 0.023502254443927145, + "learning_rate": 3.3588910688090117e-06, + "loss": 0.0003, + "step": 23632 + }, + { + "epoch": 9.61081740544937, + "grad_norm": 0.019795151898240763, + "learning_rate": 3.3581336088664974e-06, + "loss": 0.0002, + "step": 23633 + }, + { + "epoch": 9.611224074827165, + "grad_norm": 1.891432573044906, + "learning_rate": 3.357376217106082e-06, + "loss": 0.0224, + "step": 23634 + }, + { + "epoch": 9.61163074420496, + "grad_norm": 0.5622718637060804, + "learning_rate": 3.356618893535546e-06, + "loss": 0.0043, + "step": 23635 + }, + { + "epoch": 9.612037413582756, + "grad_norm": 0.4629674211320001, + "learning_rate": 3.3558616381626594e-06, + "loss": 0.0059, + "step": 23636 + }, + { + "epoch": 9.612444082960554, + "grad_norm": 0.052443300379230805, + "learning_rate": 3.355104450995199e-06, + "loss": 0.0007, + "step": 23637 + }, + { + "epoch": 9.61285075233835, + "grad_norm": 0.0006620088628414588, + "learning_rate": 
3.354347332040935e-06, + "loss": 0.0, + "step": 23638 + }, + { + "epoch": 9.613257421716146, + "grad_norm": 1.6082941816526821, + "learning_rate": 3.353590281307639e-06, + "loss": 0.0153, + "step": 23639 + }, + { + "epoch": 9.613664091093941, + "grad_norm": 0.71712150940196, + "learning_rate": 3.3528332988030853e-06, + "loss": 0.0077, + "step": 23640 + }, + { + "epoch": 9.614070760471737, + "grad_norm": 0.013751664532885493, + "learning_rate": 3.35207638453504e-06, + "loss": 0.0001, + "step": 23641 + }, + { + "epoch": 9.614477429849533, + "grad_norm": 0.023853697593761262, + "learning_rate": 3.351319538511283e-06, + "loss": 0.0002, + "step": 23642 + }, + { + "epoch": 9.614884099227329, + "grad_norm": 0.11236659989899729, + "learning_rate": 3.350562760739573e-06, + "loss": 0.0009, + "step": 23643 + }, + { + "epoch": 9.615290768605124, + "grad_norm": 0.010335422560392696, + "learning_rate": 3.349806051227683e-06, + "loss": 0.0001, + "step": 23644 + }, + { + "epoch": 9.61569743798292, + "grad_norm": 0.0005585783101353008, + "learning_rate": 3.3490494099833816e-06, + "loss": 0.0, + "step": 23645 + }, + { + "epoch": 9.616104107360716, + "grad_norm": 0.008143784297018042, + "learning_rate": 3.3482928370144317e-06, + "loss": 0.0001, + "step": 23646 + }, + { + "epoch": 9.616510776738512, + "grad_norm": 0.05902217328048642, + "learning_rate": 3.347536332328607e-06, + "loss": 0.0009, + "step": 23647 + }, + { + "epoch": 9.616917446116307, + "grad_norm": 0.019792901925584114, + "learning_rate": 3.3467798959336695e-06, + "loss": 0.0002, + "step": 23648 + }, + { + "epoch": 9.617324115494103, + "grad_norm": 2.2968502127982906, + "learning_rate": 3.3460235278373854e-06, + "loss": 0.0101, + "step": 23649 + }, + { + "epoch": 9.617730784871899, + "grad_norm": 0.02438399604033284, + "learning_rate": 3.3452672280475174e-06, + "loss": 0.0002, + "step": 23650 + }, + { + "epoch": 9.618137454249695, + "grad_norm": 0.1980499271707476, + "learning_rate": 3.3445109965718315e-06, + "loss": 0.0026, + "step": 23651 + }, + { + "epoch": 9.61854412362749, + "grad_norm": 0.006416681000647889, + "learning_rate": 3.343754833418087e-06, + "loss": 0.0001, + "step": 23652 + }, + { + "epoch": 9.618950793005286, + "grad_norm": 0.0026148550238706356, + "learning_rate": 3.342998738594052e-06, + "loss": 0.0, + "step": 23653 + }, + { + "epoch": 9.619357462383082, + "grad_norm": 0.08820855712953843, + "learning_rate": 3.3422427121074854e-06, + "loss": 0.001, + "step": 23654 + }, + { + "epoch": 9.619764131760878, + "grad_norm": 0.27204653985223276, + "learning_rate": 3.3414867539661476e-06, + "loss": 0.0028, + "step": 23655 + }, + { + "epoch": 9.620170801138674, + "grad_norm": 0.003987931455609784, + "learning_rate": 3.3407308641778e-06, + "loss": 0.0, + "step": 23656 + }, + { + "epoch": 9.62057747051647, + "grad_norm": 0.013802039344421408, + "learning_rate": 3.339975042750202e-06, + "loss": 0.0002, + "step": 23657 + }, + { + "epoch": 9.620984139894265, + "grad_norm": 0.02777092365108725, + "learning_rate": 3.3392192896911113e-06, + "loss": 0.0003, + "step": 23658 + }, + { + "epoch": 9.621390809272063, + "grad_norm": 0.0015716243951002977, + "learning_rate": 3.3384636050082843e-06, + "loss": 0.0, + "step": 23659 + }, + { + "epoch": 9.621797478649858, + "grad_norm": 0.16191889797862943, + "learning_rate": 3.337707988709484e-06, + "loss": 0.0015, + "step": 23660 + }, + { + "epoch": 9.622204148027654, + "grad_norm": 0.07870262115249727, + "learning_rate": 3.3369524408024644e-06, + "loss": 0.0006, + "step": 23661 + }, + { + "epoch": 
9.62261081740545, + "grad_norm": 0.0021583863435783207, + "learning_rate": 3.336196961294982e-06, + "loss": 0.0, + "step": 23662 + }, + { + "epoch": 9.623017486783246, + "grad_norm": 0.4426834687337292, + "learning_rate": 3.33544155019479e-06, + "loss": 0.003, + "step": 23663 + }, + { + "epoch": 9.623424156161041, + "grad_norm": 0.029790469581118542, + "learning_rate": 3.3346862075096462e-06, + "loss": 0.0004, + "step": 23664 + }, + { + "epoch": 9.623830825538837, + "grad_norm": 0.07313937865531399, + "learning_rate": 3.333930933247299e-06, + "loss": 0.0009, + "step": 23665 + }, + { + "epoch": 9.624237494916633, + "grad_norm": 0.000705776833195053, + "learning_rate": 3.3331757274155097e-06, + "loss": 0.0, + "step": 23666 + }, + { + "epoch": 9.624644164294429, + "grad_norm": 0.07971872177547315, + "learning_rate": 3.3324205900220262e-06, + "loss": 0.001, + "step": 23667 + }, + { + "epoch": 9.625050833672224, + "grad_norm": 0.047428717238801196, + "learning_rate": 3.331665521074602e-06, + "loss": 0.0004, + "step": 23668 + }, + { + "epoch": 9.62545750305002, + "grad_norm": 0.017136461929807636, + "learning_rate": 3.3309105205809856e-06, + "loss": 0.0002, + "step": 23669 + }, + { + "epoch": 9.625864172427816, + "grad_norm": 0.09417770037043957, + "learning_rate": 3.33015558854893e-06, + "loss": 0.0013, + "step": 23670 + }, + { + "epoch": 9.626270841805612, + "grad_norm": 1.2973728303859373, + "learning_rate": 3.3294007249861847e-06, + "loss": 0.0139, + "step": 23671 + }, + { + "epoch": 9.626677511183408, + "grad_norm": 0.06285653155540288, + "learning_rate": 3.3286459299004945e-06, + "loss": 0.0007, + "step": 23672 + }, + { + "epoch": 9.627084180561203, + "grad_norm": 0.031211733612939347, + "learning_rate": 3.3278912032996145e-06, + "loss": 0.0003, + "step": 23673 + }, + { + "epoch": 9.627490849938999, + "grad_norm": 0.00016205903887468636, + "learning_rate": 3.3271365451912907e-06, + "loss": 0.0, + "step": 23674 + }, + { + "epoch": 9.627897519316795, + "grad_norm": 0.02906354196716101, + "learning_rate": 3.3263819555832676e-06, + "loss": 0.0002, + "step": 23675 + }, + { + "epoch": 9.62830418869459, + "grad_norm": 0.005672292386955191, + "learning_rate": 3.3256274344832918e-06, + "loss": 0.0001, + "step": 23676 + }, + { + "epoch": 9.628710858072386, + "grad_norm": 0.2804302405859314, + "learning_rate": 3.324872981899111e-06, + "loss": 0.0025, + "step": 23677 + }, + { + "epoch": 9.629117527450184, + "grad_norm": 0.06527656126177928, + "learning_rate": 3.3241185978384636e-06, + "loss": 0.0006, + "step": 23678 + }, + { + "epoch": 9.62952419682798, + "grad_norm": 0.03740088624262415, + "learning_rate": 3.3233642823091027e-06, + "loss": 0.0003, + "step": 23679 + }, + { + "epoch": 9.629930866205775, + "grad_norm": 0.010250679482205927, + "learning_rate": 3.3226100353187684e-06, + "loss": 0.0001, + "step": 23680 + }, + { + "epoch": 9.630337535583571, + "grad_norm": 0.0008790280995367111, + "learning_rate": 3.3218558568752026e-06, + "loss": 0.0, + "step": 23681 + }, + { + "epoch": 9.630744204961367, + "grad_norm": 0.7290231618091597, + "learning_rate": 3.3211017469861463e-06, + "loss": 0.0071, + "step": 23682 + }, + { + "epoch": 9.631150874339163, + "grad_norm": 0.03640416609106582, + "learning_rate": 3.3203477056593435e-06, + "loss": 0.0002, + "step": 23683 + }, + { + "epoch": 9.631557543716958, + "grad_norm": 0.008143610452698487, + "learning_rate": 3.319593732902532e-06, + "loss": 0.0001, + "step": 23684 + }, + { + "epoch": 9.631964213094754, + "grad_norm": 0.0005783984360923028, + 
"learning_rate": 3.3188398287234504e-06, + "loss": 0.0, + "step": 23685 + }, + { + "epoch": 9.63237088247255, + "grad_norm": 0.0011688201403970597, + "learning_rate": 3.3180859931298437e-06, + "loss": 0.0, + "step": 23686 + }, + { + "epoch": 9.632777551850346, + "grad_norm": 0.5758885008521459, + "learning_rate": 3.317332226129447e-06, + "loss": 0.0072, + "step": 23687 + }, + { + "epoch": 9.633184221228142, + "grad_norm": 0.003417289305419608, + "learning_rate": 3.3165785277299987e-06, + "loss": 0.0, + "step": 23688 + }, + { + "epoch": 9.633590890605937, + "grad_norm": 0.001586481334107393, + "learning_rate": 3.315824897939236e-06, + "loss": 0.0, + "step": 23689 + }, + { + "epoch": 9.633997559983733, + "grad_norm": 0.011143329880584196, + "learning_rate": 3.3150713367648936e-06, + "loss": 0.0001, + "step": 23690 + }, + { + "epoch": 9.634404229361529, + "grad_norm": 0.08081864403634811, + "learning_rate": 3.314317844214706e-06, + "loss": 0.0004, + "step": 23691 + }, + { + "epoch": 9.634810898739325, + "grad_norm": 0.15145733965235603, + "learning_rate": 3.3135644202964133e-06, + "loss": 0.002, + "step": 23692 + }, + { + "epoch": 9.63521756811712, + "grad_norm": 0.08982458689782731, + "learning_rate": 3.312811065017747e-06, + "loss": 0.0011, + "step": 23693 + }, + { + "epoch": 9.635624237494916, + "grad_norm": 0.15788059877496052, + "learning_rate": 3.3120577783864405e-06, + "loss": 0.0016, + "step": 23694 + }, + { + "epoch": 9.636030906872712, + "grad_norm": 0.1369640885828109, + "learning_rate": 3.3113045604102278e-06, + "loss": 0.0017, + "step": 23695 + }, + { + "epoch": 9.636437576250508, + "grad_norm": 0.03830598295504454, + "learning_rate": 3.3105514110968397e-06, + "loss": 0.0003, + "step": 23696 + }, + { + "epoch": 9.636844245628303, + "grad_norm": 11.806921631692843, + "learning_rate": 3.309798330454008e-06, + "loss": 0.1022, + "step": 23697 + }, + { + "epoch": 9.6372509150061, + "grad_norm": 4.801692163299091, + "learning_rate": 3.30904531848946e-06, + "loss": 0.1284, + "step": 23698 + }, + { + "epoch": 9.637657584383895, + "grad_norm": 0.15028021199114058, + "learning_rate": 3.308292375210933e-06, + "loss": 0.0013, + "step": 23699 + }, + { + "epoch": 9.638064253761693, + "grad_norm": 0.03531839638077055, + "learning_rate": 3.3075395006261524e-06, + "loss": 0.0006, + "step": 23700 + }, + { + "epoch": 9.638470923139488, + "grad_norm": 0.11772225388274968, + "learning_rate": 3.306786694742847e-06, + "loss": 0.0007, + "step": 23701 + }, + { + "epoch": 9.638877592517284, + "grad_norm": 11.520141761308228, + "learning_rate": 3.306033957568745e-06, + "loss": 0.0505, + "step": 23702 + }, + { + "epoch": 9.63928426189508, + "grad_norm": 0.033899743886646405, + "learning_rate": 3.305281289111574e-06, + "loss": 0.0004, + "step": 23703 + }, + { + "epoch": 9.639690931272876, + "grad_norm": 0.06717440505987562, + "learning_rate": 3.304528689379056e-06, + "loss": 0.0005, + "step": 23704 + }, + { + "epoch": 9.640097600650671, + "grad_norm": 0.22656058333003412, + "learning_rate": 3.303776158378924e-06, + "loss": 0.0013, + "step": 23705 + }, + { + "epoch": 9.640504270028467, + "grad_norm": 2.1528007826629407, + "learning_rate": 3.3030236961189e-06, + "loss": 0.0334, + "step": 23706 + }, + { + "epoch": 9.640910939406263, + "grad_norm": 0.044787360041546924, + "learning_rate": 3.302271302606709e-06, + "loss": 0.0004, + "step": 23707 + }, + { + "epoch": 9.641317608784059, + "grad_norm": 0.9746175818919562, + "learning_rate": 3.3015189778500723e-06, + "loss": 0.0091, + "step": 23708 + }, + { + "epoch": 
9.641724278161854, + "grad_norm": 0.009722005325660608, + "learning_rate": 3.3007667218567163e-06, + "loss": 0.0001, + "step": 23709 + }, + { + "epoch": 9.64213094753965, + "grad_norm": 0.06624717683887611, + "learning_rate": 3.3000145346343605e-06, + "loss": 0.0007, + "step": 23710 + }, + { + "epoch": 9.642537616917446, + "grad_norm": 0.004742136801495524, + "learning_rate": 3.299262416190725e-06, + "loss": 0.0001, + "step": 23711 + }, + { + "epoch": 9.642944286295242, + "grad_norm": 0.060512780351775826, + "learning_rate": 3.298510366533535e-06, + "loss": 0.0006, + "step": 23712 + }, + { + "epoch": 9.643350955673037, + "grad_norm": 0.7326552748131158, + "learning_rate": 3.297758385670511e-06, + "loss": 0.0069, + "step": 23713 + }, + { + "epoch": 9.643757625050833, + "grad_norm": 0.000540421244445508, + "learning_rate": 3.2970064736093687e-06, + "loss": 0.0, + "step": 23714 + }, + { + "epoch": 9.644164294428629, + "grad_norm": 10.191855532552788, + "learning_rate": 3.296254630357829e-06, + "loss": 0.0933, + "step": 23715 + }, + { + "epoch": 9.644570963806425, + "grad_norm": 0.045141593219027595, + "learning_rate": 3.295502855923609e-06, + "loss": 0.0005, + "step": 23716 + }, + { + "epoch": 9.64497763318422, + "grad_norm": 0.0027199890082999258, + "learning_rate": 3.294751150314427e-06, + "loss": 0.0, + "step": 23717 + }, + { + "epoch": 9.645384302562016, + "grad_norm": 0.06822650455611393, + "learning_rate": 3.2939995135379963e-06, + "loss": 0.0006, + "step": 23718 + }, + { + "epoch": 9.645790971939814, + "grad_norm": 0.019546002195181364, + "learning_rate": 3.2932479456020382e-06, + "loss": 0.0001, + "step": 23719 + }, + { + "epoch": 9.64619764131761, + "grad_norm": 2.750322155625744e-05, + "learning_rate": 3.292496446514266e-06, + "loss": 0.0, + "step": 23720 + }, + { + "epoch": 9.646604310695405, + "grad_norm": 0.08935018895687472, + "learning_rate": 3.2917450162823927e-06, + "loss": 0.0011, + "step": 23721 + }, + { + "epoch": 9.647010980073201, + "grad_norm": 0.03975879925833925, + "learning_rate": 3.290993654914134e-06, + "loss": 0.0004, + "step": 23722 + }, + { + "epoch": 9.647417649450997, + "grad_norm": 0.012219731745624084, + "learning_rate": 3.290242362417201e-06, + "loss": 0.0001, + "step": 23723 + }, + { + "epoch": 9.647824318828793, + "grad_norm": 0.005197497494018632, + "learning_rate": 3.2894911387993046e-06, + "loss": 0.0001, + "step": 23724 + }, + { + "epoch": 9.648230988206588, + "grad_norm": 1.1114645830016074, + "learning_rate": 3.288739984068162e-06, + "loss": 0.0149, + "step": 23725 + }, + { + "epoch": 9.648637657584384, + "grad_norm": 5.115594746007414, + "learning_rate": 3.2879888982314813e-06, + "loss": 0.1378, + "step": 23726 + }, + { + "epoch": 9.64904432696218, + "grad_norm": 0.004066056791001877, + "learning_rate": 3.287237881296972e-06, + "loss": 0.0, + "step": 23727 + }, + { + "epoch": 9.649450996339976, + "grad_norm": 0.0019633806995197223, + "learning_rate": 3.286486933272345e-06, + "loss": 0.0, + "step": 23728 + }, + { + "epoch": 9.649857665717771, + "grad_norm": 0.1010655202727706, + "learning_rate": 3.285736054165308e-06, + "loss": 0.0008, + "step": 23729 + }, + { + "epoch": 9.650264335095567, + "grad_norm": 2.6962268263291174, + "learning_rate": 3.2849852439835707e-06, + "loss": 0.0168, + "step": 23730 + }, + { + "epoch": 9.650671004473363, + "grad_norm": 0.7052038917193705, + "learning_rate": 3.2842345027348356e-06, + "loss": 0.0054, + "step": 23731 + }, + { + "epoch": 9.651077673851159, + "grad_norm": 0.001832878221968678, + "learning_rate": 
3.2834838304268157e-06, + "loss": 0.0, + "step": 23732 + }, + { + "epoch": 9.651484343228955, + "grad_norm": 0.0076860334638249135, + "learning_rate": 3.2827332270672153e-06, + "loss": 0.0001, + "step": 23733 + }, + { + "epoch": 9.65189101260675, + "grad_norm": 0.09792979548080089, + "learning_rate": 3.281982692663739e-06, + "loss": 0.0009, + "step": 23734 + }, + { + "epoch": 9.652297681984546, + "grad_norm": 1.5732212597297428, + "learning_rate": 3.2812322272240916e-06, + "loss": 0.019, + "step": 23735 + }, + { + "epoch": 9.652704351362342, + "grad_norm": 0.011005621199505383, + "learning_rate": 3.280481830755977e-06, + "loss": 0.0001, + "step": 23736 + }, + { + "epoch": 9.653111020740138, + "grad_norm": 0.12270315398428117, + "learning_rate": 3.279731503267095e-06, + "loss": 0.0013, + "step": 23737 + }, + { + "epoch": 9.653517690117933, + "grad_norm": 0.033361316377048145, + "learning_rate": 3.278981244765155e-06, + "loss": 0.0005, + "step": 23738 + }, + { + "epoch": 9.65392435949573, + "grad_norm": 1.9743505369808956, + "learning_rate": 3.2782310552578545e-06, + "loss": 0.0183, + "step": 23739 + }, + { + "epoch": 9.654331028873525, + "grad_norm": 0.001526769077541791, + "learning_rate": 3.2774809347528946e-06, + "loss": 0.0, + "step": 23740 + }, + { + "epoch": 9.654737698251322, + "grad_norm": 0.0016382753370697365, + "learning_rate": 3.276730883257977e-06, + "loss": 0.0, + "step": 23741 + }, + { + "epoch": 9.655144367629118, + "grad_norm": 0.01431885211087713, + "learning_rate": 3.2759809007808007e-06, + "loss": 0.0002, + "step": 23742 + }, + { + "epoch": 9.655551037006914, + "grad_norm": 0.004530202076050019, + "learning_rate": 3.2752309873290645e-06, + "loss": 0.0, + "step": 23743 + }, + { + "epoch": 9.65595770638471, + "grad_norm": 2.109321968970548, + "learning_rate": 3.2744811429104674e-06, + "loss": 0.0363, + "step": 23744 + }, + { + "epoch": 9.656364375762506, + "grad_norm": 0.13799997652328327, + "learning_rate": 3.2737313675327054e-06, + "loss": 0.0009, + "step": 23745 + }, + { + "epoch": 9.656771045140301, + "grad_norm": 0.12240755733868192, + "learning_rate": 3.272981661203476e-06, + "loss": 0.0011, + "step": 23746 + }, + { + "epoch": 9.657177714518097, + "grad_norm": 0.01363975175851106, + "learning_rate": 3.2722320239304763e-06, + "loss": 0.0001, + "step": 23747 + }, + { + "epoch": 9.657584383895893, + "grad_norm": 0.008900787859970692, + "learning_rate": 3.2714824557213997e-06, + "loss": 0.0001, + "step": 23748 + }, + { + "epoch": 9.657991053273689, + "grad_norm": 6.557283968565255, + "learning_rate": 3.27073295658394e-06, + "loss": 0.1544, + "step": 23749 + }, + { + "epoch": 9.658397722651484, + "grad_norm": 0.012947762456063204, + "learning_rate": 3.2699835265257963e-06, + "loss": 0.0002, + "step": 23750 + }, + { + "epoch": 9.65880439202928, + "grad_norm": 0.0021872056537332626, + "learning_rate": 3.2692341655546576e-06, + "loss": 0.0, + "step": 23751 + }, + { + "epoch": 9.659211061407076, + "grad_norm": 0.007818148089129834, + "learning_rate": 3.2684848736782195e-06, + "loss": 0.0001, + "step": 23752 + }, + { + "epoch": 9.659617730784872, + "grad_norm": 12.68509759261519, + "learning_rate": 3.2677356509041713e-06, + "loss": 0.1318, + "step": 23753 + }, + { + "epoch": 9.660024400162667, + "grad_norm": 0.004165467362959723, + "learning_rate": 3.2669864972402044e-06, + "loss": 0.0, + "step": 23754 + }, + { + "epoch": 9.660431069540463, + "grad_norm": 0.002126198770873418, + "learning_rate": 3.2662374126940066e-06, + "loss": 0.0, + "step": 23755 + }, + { + "epoch": 
9.660837738918259, + "grad_norm": 0.961991436519254, + "learning_rate": 3.2654883972732753e-06, + "loss": 0.0086, + "step": 23756 + }, + { + "epoch": 9.661244408296055, + "grad_norm": 0.0005759372997232999, + "learning_rate": 3.2647394509856944e-06, + "loss": 0.0, + "step": 23757 + }, + { + "epoch": 9.66165107767385, + "grad_norm": 0.003969017181773725, + "learning_rate": 3.2639905738389523e-06, + "loss": 0.0, + "step": 23758 + }, + { + "epoch": 9.662057747051646, + "grad_norm": 8.178751539442711, + "learning_rate": 3.263241765840738e-06, + "loss": 0.0452, + "step": 23759 + }, + { + "epoch": 9.662464416429444, + "grad_norm": 0.05447624336473448, + "learning_rate": 3.2624930269987366e-06, + "loss": 0.0004, + "step": 23760 + }, + { + "epoch": 9.66287108580724, + "grad_norm": 0.006828832219001071, + "learning_rate": 3.2617443573206353e-06, + "loss": 0.0001, + "step": 23761 + }, + { + "epoch": 9.663277755185035, + "grad_norm": 0.001598788876226365, + "learning_rate": 3.260995756814116e-06, + "loss": 0.0, + "step": 23762 + }, + { + "epoch": 9.663684424562831, + "grad_norm": 0.04021470815761314, + "learning_rate": 3.260247225486871e-06, + "loss": 0.0002, + "step": 23763 + }, + { + "epoch": 9.664091093940627, + "grad_norm": 0.19940771338174423, + "learning_rate": 3.25949876334658e-06, + "loss": 0.0025, + "step": 23764 + }, + { + "epoch": 9.664497763318423, + "grad_norm": 0.22859248540557423, + "learning_rate": 3.258750370400926e-06, + "loss": 0.0018, + "step": 23765 + }, + { + "epoch": 9.664904432696218, + "grad_norm": 0.004863238277561637, + "learning_rate": 3.2580020466575936e-06, + "loss": 0.0, + "step": 23766 + }, + { + "epoch": 9.665311102074014, + "grad_norm": 0.21331053893984328, + "learning_rate": 3.257253792124262e-06, + "loss": 0.002, + "step": 23767 + }, + { + "epoch": 9.66571777145181, + "grad_norm": 0.16536780191127368, + "learning_rate": 3.256505606808611e-06, + "loss": 0.0018, + "step": 23768 + }, + { + "epoch": 9.666124440829606, + "grad_norm": 0.03125975820769418, + "learning_rate": 3.2557574907183275e-06, + "loss": 0.0003, + "step": 23769 + }, + { + "epoch": 9.666531110207401, + "grad_norm": 0.03702768447718123, + "learning_rate": 3.2550094438610867e-06, + "loss": 0.0004, + "step": 23770 + }, + { + "epoch": 9.666937779585197, + "grad_norm": 0.1235324027880496, + "learning_rate": 3.2542614662445694e-06, + "loss": 0.0009, + "step": 23771 + }, + { + "epoch": 9.667344448962993, + "grad_norm": 0.05626958101403764, + "learning_rate": 3.253513557876453e-06, + "loss": 0.0005, + "step": 23772 + }, + { + "epoch": 9.667751118340789, + "grad_norm": 0.28887450752297045, + "learning_rate": 3.252765718764416e-06, + "loss": 0.0046, + "step": 23773 + }, + { + "epoch": 9.668157787718584, + "grad_norm": 0.03222733373169551, + "learning_rate": 3.252017948916134e-06, + "loss": 0.0004, + "step": 23774 + }, + { + "epoch": 9.66856445709638, + "grad_norm": 0.0455121947806624, + "learning_rate": 3.251270248339281e-06, + "loss": 0.0005, + "step": 23775 + }, + { + "epoch": 9.668971126474176, + "grad_norm": 0.0013941762416428511, + "learning_rate": 3.2505226170415383e-06, + "loss": 0.0, + "step": 23776 + }, + { + "epoch": 9.669377795851972, + "grad_norm": 0.02557535929117124, + "learning_rate": 3.2497750550305783e-06, + "loss": 0.0002, + "step": 23777 + }, + { + "epoch": 9.669784465229768, + "grad_norm": 3.1226092369686747, + "learning_rate": 3.2490275623140743e-06, + "loss": 0.0541, + "step": 23778 + }, + { + "epoch": 9.670191134607563, + "grad_norm": 0.7756954453950987, + "learning_rate": 
3.248280138899701e-06, + "loss": 0.006, + "step": 23779 + }, + { + "epoch": 9.670597803985359, + "grad_norm": 0.021113277415522335, + "learning_rate": 3.2475327847951286e-06, + "loss": 0.0002, + "step": 23780 + }, + { + "epoch": 9.671004473363155, + "grad_norm": 0.00021399309654649524, + "learning_rate": 3.2467855000080285e-06, + "loss": 0.0, + "step": 23781 + }, + { + "epoch": 9.671411142740952, + "grad_norm": 0.05162615204852659, + "learning_rate": 3.246038284546077e-06, + "loss": 0.0005, + "step": 23782 + }, + { + "epoch": 9.671817812118748, + "grad_norm": 0.0001515256682373726, + "learning_rate": 3.2452911384169417e-06, + "loss": 0.0, + "step": 23783 + }, + { + "epoch": 9.672224481496544, + "grad_norm": 0.05059583602538733, + "learning_rate": 3.244544061628293e-06, + "loss": 0.0005, + "step": 23784 + }, + { + "epoch": 9.67263115087434, + "grad_norm": 0.034052290512943846, + "learning_rate": 3.2437970541877985e-06, + "loss": 0.0004, + "step": 23785 + }, + { + "epoch": 9.673037820252135, + "grad_norm": 1.5993404358117695, + "learning_rate": 3.243050116103128e-06, + "loss": 0.0144, + "step": 23786 + }, + { + "epoch": 9.673444489629931, + "grad_norm": 0.07926391920100774, + "learning_rate": 3.2423032473819495e-06, + "loss": 0.0005, + "step": 23787 + }, + { + "epoch": 9.673851159007727, + "grad_norm": 0.8909472848164501, + "learning_rate": 3.241556448031925e-06, + "loss": 0.0114, + "step": 23788 + }, + { + "epoch": 9.674257828385523, + "grad_norm": 0.01671911406814145, + "learning_rate": 3.240809718060728e-06, + "loss": 0.0002, + "step": 23789 + }, + { + "epoch": 9.674664497763318, + "grad_norm": 0.0051076522872691215, + "learning_rate": 3.2400630574760218e-06, + "loss": 0.0001, + "step": 23790 + }, + { + "epoch": 9.675071167141114, + "grad_norm": 0.006980516467756385, + "learning_rate": 3.2393164662854703e-06, + "loss": 0.0001, + "step": 23791 + }, + { + "epoch": 9.67547783651891, + "grad_norm": 1.151992805733899, + "learning_rate": 3.238569944496738e-06, + "loss": 0.0066, + "step": 23792 + }, + { + "epoch": 9.675884505896706, + "grad_norm": 0.27756407043100495, + "learning_rate": 3.237823492117488e-06, + "loss": 0.0015, + "step": 23793 + }, + { + "epoch": 9.676291175274502, + "grad_norm": 0.7362215662202751, + "learning_rate": 3.23707710915538e-06, + "loss": 0.0053, + "step": 23794 + }, + { + "epoch": 9.676697844652297, + "grad_norm": 0.0014958104656764412, + "learning_rate": 3.2363307956180823e-06, + "loss": 0.0, + "step": 23795 + }, + { + "epoch": 9.677104514030093, + "grad_norm": 0.005463131343281697, + "learning_rate": 3.2355845515132532e-06, + "loss": 0.0001, + "step": 23796 + }, + { + "epoch": 9.677511183407889, + "grad_norm": 1.4734771589747357, + "learning_rate": 3.2348383768485536e-06, + "loss": 0.0163, + "step": 23797 + }, + { + "epoch": 9.677917852785685, + "grad_norm": 0.04314209751797592, + "learning_rate": 3.2340922716316415e-06, + "loss": 0.0005, + "step": 23798 + }, + { + "epoch": 9.67832452216348, + "grad_norm": 1.6907169551375365, + "learning_rate": 3.2333462358701783e-06, + "loss": 0.0187, + "step": 23799 + }, + { + "epoch": 9.678731191541276, + "grad_norm": 0.0013223588425504977, + "learning_rate": 3.2326002695718216e-06, + "loss": 0.0, + "step": 23800 + }, + { + "epoch": 9.679137860919074, + "grad_norm": 0.0496510187906084, + "learning_rate": 3.2318543727442264e-06, + "loss": 0.0007, + "step": 23801 + }, + { + "epoch": 9.67954453029687, + "grad_norm": 7.450140736585953e-05, + "learning_rate": 3.231108545395055e-06, + "loss": 0.0, + "step": 23802 + }, + { + "epoch": 
9.679951199674665, + "grad_norm": 0.006517886483195693, + "learning_rate": 3.230362787531961e-06, + "loss": 0.0001, + "step": 23803 + }, + { + "epoch": 9.680357869052461, + "grad_norm": 0.6135560702399833, + "learning_rate": 3.229617099162601e-06, + "loss": 0.0065, + "step": 23804 + }, + { + "epoch": 9.680764538430257, + "grad_norm": 4.246209848061689, + "learning_rate": 3.2288714802946273e-06, + "loss": 0.0347, + "step": 23805 + }, + { + "epoch": 9.681171207808053, + "grad_norm": 0.06521443655939271, + "learning_rate": 3.2281259309356973e-06, + "loss": 0.0009, + "step": 23806 + }, + { + "epoch": 9.681577877185848, + "grad_norm": 0.7719740463451824, + "learning_rate": 3.2273804510934592e-06, + "loss": 0.0053, + "step": 23807 + }, + { + "epoch": 9.681984546563644, + "grad_norm": 0.00119039264626618, + "learning_rate": 3.226635040775572e-06, + "loss": 0.0, + "step": 23808 + }, + { + "epoch": 9.68239121594144, + "grad_norm": 0.05831217821128694, + "learning_rate": 3.225889699989685e-06, + "loss": 0.0007, + "step": 23809 + }, + { + "epoch": 9.682797885319236, + "grad_norm": 0.42266389513824626, + "learning_rate": 3.2251444287434498e-06, + "loss": 0.0044, + "step": 23810 + }, + { + "epoch": 9.683204554697031, + "grad_norm": 0.023331942651126394, + "learning_rate": 3.224399227044517e-06, + "loss": 0.0003, + "step": 23811 + }, + { + "epoch": 9.683611224074827, + "grad_norm": 0.00103418889935793, + "learning_rate": 3.223654094900537e-06, + "loss": 0.0, + "step": 23812 + }, + { + "epoch": 9.684017893452623, + "grad_norm": 0.005757246206866284, + "learning_rate": 3.222909032319157e-06, + "loss": 0.0001, + "step": 23813 + }, + { + "epoch": 9.684424562830419, + "grad_norm": 0.9181108368730648, + "learning_rate": 3.2221640393080235e-06, + "loss": 0.0107, + "step": 23814 + }, + { + "epoch": 9.684831232208214, + "grad_norm": 0.13626553710347195, + "learning_rate": 3.221419115874791e-06, + "loss": 0.0012, + "step": 23815 + }, + { + "epoch": 9.68523790158601, + "grad_norm": 0.021300795729630676, + "learning_rate": 3.2206742620271025e-06, + "loss": 0.0003, + "step": 23816 + }, + { + "epoch": 9.685644570963806, + "grad_norm": 1.8274627301314335, + "learning_rate": 3.219929477772604e-06, + "loss": 0.0126, + "step": 23817 + }, + { + "epoch": 9.686051240341602, + "grad_norm": 0.4072152746960867, + "learning_rate": 3.2191847631189432e-06, + "loss": 0.0037, + "step": 23818 + }, + { + "epoch": 9.686457909719397, + "grad_norm": 0.04099012416272571, + "learning_rate": 3.218440118073762e-06, + "loss": 0.0002, + "step": 23819 + }, + { + "epoch": 9.686864579097193, + "grad_norm": 1.3519876599753176, + "learning_rate": 3.2176955426447075e-06, + "loss": 0.0254, + "step": 23820 + }, + { + "epoch": 9.687271248474989, + "grad_norm": 0.1780554729076311, + "learning_rate": 3.2169510368394187e-06, + "loss": 0.0018, + "step": 23821 + }, + { + "epoch": 9.687677917852785, + "grad_norm": 0.01263572655947456, + "learning_rate": 3.2162066006655435e-06, + "loss": 0.0001, + "step": 23822 + }, + { + "epoch": 9.688084587230582, + "grad_norm": 0.3428232218906701, + "learning_rate": 3.2154622341307217e-06, + "loss": 0.0028, + "step": 23823 + }, + { + "epoch": 9.688491256608378, + "grad_norm": 0.0016324743888124736, + "learning_rate": 3.2147179372425954e-06, + "loss": 0.0, + "step": 23824 + }, + { + "epoch": 9.688897925986174, + "grad_norm": 0.11355425641681556, + "learning_rate": 3.2139737100088043e-06, + "loss": 0.0011, + "step": 23825 + }, + { + "epoch": 9.68930459536397, + "grad_norm": 0.15868204980729128, + "learning_rate": 
3.2132295524369873e-06, + "loss": 0.0017, + "step": 23826 + }, + { + "epoch": 9.689711264741765, + "grad_norm": 0.1335565547961393, + "learning_rate": 3.2124854645347827e-06, + "loss": 0.0012, + "step": 23827 + }, + { + "epoch": 9.690117934119561, + "grad_norm": 0.01742840957257995, + "learning_rate": 3.211741446309834e-06, + "loss": 0.0001, + "step": 23828 + }, + { + "epoch": 9.690524603497357, + "grad_norm": 0.04015444713481874, + "learning_rate": 3.2109974977697755e-06, + "loss": 0.0006, + "step": 23829 + }, + { + "epoch": 9.690931272875153, + "grad_norm": 0.06696982384609072, + "learning_rate": 3.2102536189222445e-06, + "loss": 0.0007, + "step": 23830 + }, + { + "epoch": 9.691337942252948, + "grad_norm": 0.0009266529626569555, + "learning_rate": 3.2095098097748766e-06, + "loss": 0.0, + "step": 23831 + }, + { + "epoch": 9.691744611630744, + "grad_norm": 0.0021789764140355923, + "learning_rate": 3.2087660703353095e-06, + "loss": 0.0, + "step": 23832 + }, + { + "epoch": 9.69215128100854, + "grad_norm": 0.728251729253523, + "learning_rate": 3.208022400611176e-06, + "loss": 0.0059, + "step": 23833 + }, + { + "epoch": 9.692557950386336, + "grad_norm": 0.006260130172304304, + "learning_rate": 3.2072788006101076e-06, + "loss": 0.0, + "step": 23834 + }, + { + "epoch": 9.692964619764131, + "grad_norm": 0.20248492257999046, + "learning_rate": 3.2065352703397436e-06, + "loss": 0.002, + "step": 23835 + }, + { + "epoch": 9.693371289141927, + "grad_norm": 0.41714779345298086, + "learning_rate": 3.2057918098077155e-06, + "loss": 0.0033, + "step": 23836 + }, + { + "epoch": 9.693777958519723, + "grad_norm": 0.08278562654385155, + "learning_rate": 3.2050484190216525e-06, + "loss": 0.0008, + "step": 23837 + }, + { + "epoch": 9.694184627897519, + "grad_norm": 0.028107010381917246, + "learning_rate": 3.204305097989189e-06, + "loss": 0.0004, + "step": 23838 + }, + { + "epoch": 9.694591297275315, + "grad_norm": 0.1209762214152774, + "learning_rate": 3.203561846717952e-06, + "loss": 0.0016, + "step": 23839 + }, + { + "epoch": 9.69499796665311, + "grad_norm": 0.0011969820155742317, + "learning_rate": 3.2028186652155715e-06, + "loss": 0.0, + "step": 23840 + }, + { + "epoch": 9.695404636030906, + "grad_norm": 0.25484005497009515, + "learning_rate": 3.2020755534896807e-06, + "loss": 0.002, + "step": 23841 + }, + { + "epoch": 9.695811305408704, + "grad_norm": 0.003690912038755276, + "learning_rate": 3.201332511547909e-06, + "loss": 0.0, + "step": 23842 + }, + { + "epoch": 9.6962179747865, + "grad_norm": 0.04461401322652713, + "learning_rate": 3.2005895393978782e-06, + "loss": 0.0004, + "step": 23843 + }, + { + "epoch": 9.696624644164295, + "grad_norm": 0.014125645260649505, + "learning_rate": 3.199846637047217e-06, + "loss": 0.0001, + "step": 23844 + }, + { + "epoch": 9.697031313542091, + "grad_norm": 0.004538434857176728, + "learning_rate": 3.1991038045035495e-06, + "loss": 0.0, + "step": 23845 + }, + { + "epoch": 9.697437982919887, + "grad_norm": 1.5321975255030529, + "learning_rate": 3.198361041774508e-06, + "loss": 0.0164, + "step": 23846 + }, + { + "epoch": 9.697844652297682, + "grad_norm": 0.043146889670081186, + "learning_rate": 3.1976183488677126e-06, + "loss": 0.0006, + "step": 23847 + }, + { + "epoch": 9.698251321675478, + "grad_norm": 5.812587324474777, + "learning_rate": 3.19687572579079e-06, + "loss": 0.0521, + "step": 23848 + }, + { + "epoch": 9.698657991053274, + "grad_norm": 0.0005658612254279848, + "learning_rate": 3.196133172551361e-06, + "loss": 0.0, + "step": 23849 + }, + { + "epoch": 
9.69906466043107, + "grad_norm": 0.0015675629799567618, + "learning_rate": 3.1953906891570487e-06, + "loss": 0.0, + "step": 23850 + }, + { + "epoch": 9.699471329808866, + "grad_norm": 0.001558107818101491, + "learning_rate": 3.1946482756154763e-06, + "loss": 0.0, + "step": 23851 + }, + { + "epoch": 9.699877999186661, + "grad_norm": 1.2217240527518858, + "learning_rate": 3.1939059319342613e-06, + "loss": 0.0058, + "step": 23852 + }, + { + "epoch": 9.700284668564457, + "grad_norm": 0.0002295974788982224, + "learning_rate": 3.193163658121031e-06, + "loss": 0.0, + "step": 23853 + }, + { + "epoch": 9.700691337942253, + "grad_norm": 0.16814480016449312, + "learning_rate": 3.192421454183401e-06, + "loss": 0.0009, + "step": 23854 + }, + { + "epoch": 9.701098007320049, + "grad_norm": 0.6484099151646265, + "learning_rate": 3.1916793201289906e-06, + "loss": 0.0065, + "step": 23855 + }, + { + "epoch": 9.701504676697844, + "grad_norm": 8.306743218016884, + "learning_rate": 3.1909372559654193e-06, + "loss": 0.0325, + "step": 23856 + }, + { + "epoch": 9.70191134607564, + "grad_norm": 0.017983992228905215, + "learning_rate": 3.190195261700304e-06, + "loss": 0.0002, + "step": 23857 + }, + { + "epoch": 9.702318015453436, + "grad_norm": 0.6148540111985741, + "learning_rate": 3.1894533373412574e-06, + "loss": 0.0022, + "step": 23858 + }, + { + "epoch": 9.702724684831232, + "grad_norm": 0.6820323735372407, + "learning_rate": 3.188711482895904e-06, + "loss": 0.0053, + "step": 23859 + }, + { + "epoch": 9.703131354209027, + "grad_norm": 0.27713651604319073, + "learning_rate": 3.1879696983718557e-06, + "loss": 0.0027, + "step": 23860 + }, + { + "epoch": 9.703538023586823, + "grad_norm": 0.2651228762115414, + "learning_rate": 3.187227983776726e-06, + "loss": 0.002, + "step": 23861 + }, + { + "epoch": 9.703944692964619, + "grad_norm": 0.00597472518282446, + "learning_rate": 3.18648633911813e-06, + "loss": 0.0001, + "step": 23862 + }, + { + "epoch": 9.704351362342415, + "grad_norm": 0.009859882137646468, + "learning_rate": 3.185744764403681e-06, + "loss": 0.0001, + "step": 23863 + }, + { + "epoch": 9.704758031720212, + "grad_norm": 14.369832042855363, + "learning_rate": 3.185003259640992e-06, + "loss": 0.1687, + "step": 23864 + }, + { + "epoch": 9.705164701098008, + "grad_norm": 0.6335159149034304, + "learning_rate": 3.18426182483767e-06, + "loss": 0.0058, + "step": 23865 + }, + { + "epoch": 9.705571370475804, + "grad_norm": 0.04505740926813122, + "learning_rate": 3.1835204600013347e-06, + "loss": 0.0006, + "step": 23866 + }, + { + "epoch": 9.7059780398536, + "grad_norm": 0.021633499824141048, + "learning_rate": 3.1827791651395935e-06, + "loss": 0.0003, + "step": 23867 + }, + { + "epoch": 9.706384709231395, + "grad_norm": 0.019995033741619195, + "learning_rate": 3.1820379402600533e-06, + "loss": 0.0001, + "step": 23868 + }, + { + "epoch": 9.706791378609191, + "grad_norm": 0.20921145882069328, + "learning_rate": 3.181296785370326e-06, + "loss": 0.002, + "step": 23869 + }, + { + "epoch": 9.707198047986987, + "grad_norm": 0.0033962023534284454, + "learning_rate": 3.1805557004780196e-06, + "loss": 0.0, + "step": 23870 + }, + { + "epoch": 9.707604717364783, + "grad_norm": 0.06437496854859603, + "learning_rate": 3.1798146855907363e-06, + "loss": 0.0005, + "step": 23871 + }, + { + "epoch": 9.708011386742578, + "grad_norm": 0.3625245693869768, + "learning_rate": 3.179073740716092e-06, + "loss": 0.0042, + "step": 23872 + }, + { + "epoch": 9.708418056120374, + "grad_norm": 0.010208250562154492, + "learning_rate": 
3.17833286586169e-06, + "loss": 0.0001, + "step": 23873 + }, + { + "epoch": 9.70882472549817, + "grad_norm": 0.321583022729111, + "learning_rate": 3.1775920610351317e-06, + "loss": 0.0028, + "step": 23874 + }, + { + "epoch": 9.709231394875966, + "grad_norm": 0.12562027978198248, + "learning_rate": 3.176851326244026e-06, + "loss": 0.0008, + "step": 23875 + }, + { + "epoch": 9.709638064253761, + "grad_norm": 0.005870050798132239, + "learning_rate": 3.176110661495976e-06, + "loss": 0.0001, + "step": 23876 + }, + { + "epoch": 9.710044733631557, + "grad_norm": 0.19925562635833777, + "learning_rate": 3.1753700667985833e-06, + "loss": 0.0009, + "step": 23877 + }, + { + "epoch": 9.710451403009353, + "grad_norm": 0.0062718265580386625, + "learning_rate": 3.1746295421594487e-06, + "loss": 0.0001, + "step": 23878 + }, + { + "epoch": 9.710858072387149, + "grad_norm": 0.012843943235205723, + "learning_rate": 3.1738890875861804e-06, + "loss": 0.0002, + "step": 23879 + }, + { + "epoch": 9.711264741764944, + "grad_norm": 0.4954740647233112, + "learning_rate": 3.173148703086376e-06, + "loss": 0.0042, + "step": 23880 + }, + { + "epoch": 9.71167141114274, + "grad_norm": 0.06263908870614898, + "learning_rate": 3.172408388667636e-06, + "loss": 0.0005, + "step": 23881 + }, + { + "epoch": 9.712078080520536, + "grad_norm": 0.1475009806499941, + "learning_rate": 3.171668144337561e-06, + "loss": 0.0014, + "step": 23882 + }, + { + "epoch": 9.712484749898334, + "grad_norm": 0.050303215780788, + "learning_rate": 3.1709279701037476e-06, + "loss": 0.0006, + "step": 23883 + }, + { + "epoch": 9.71289141927613, + "grad_norm": 0.015739931155286928, + "learning_rate": 3.170187865973794e-06, + "loss": 0.0001, + "step": 23884 + }, + { + "epoch": 9.713298088653925, + "grad_norm": 0.17381326613875414, + "learning_rate": 3.169447831955301e-06, + "loss": 0.0013, + "step": 23885 + }, + { + "epoch": 9.71370475803172, + "grad_norm": 8.661765416658636, + "learning_rate": 3.1687078680558648e-06, + "loss": 0.0759, + "step": 23886 + }, + { + "epoch": 9.714111427409517, + "grad_norm": 0.017752137276555176, + "learning_rate": 3.1679679742830806e-06, + "loss": 0.0001, + "step": 23887 + }, + { + "epoch": 9.714518096787312, + "grad_norm": 0.22349956528028914, + "learning_rate": 3.1672281506445423e-06, + "loss": 0.0018, + "step": 23888 + }, + { + "epoch": 9.714924766165108, + "grad_norm": 0.8501961208443071, + "learning_rate": 3.1664883971478465e-06, + "loss": 0.0049, + "step": 23889 + }, + { + "epoch": 9.715331435542904, + "grad_norm": 0.00017156223335863268, + "learning_rate": 3.165748713800587e-06, + "loss": 0.0, + "step": 23890 + }, + { + "epoch": 9.7157381049207, + "grad_norm": 0.003943715306153951, + "learning_rate": 3.1650091006103535e-06, + "loss": 0.0001, + "step": 23891 + }, + { + "epoch": 9.716144774298495, + "grad_norm": 0.001901573857807141, + "learning_rate": 3.164269557584745e-06, + "loss": 0.0, + "step": 23892 + }, + { + "epoch": 9.716551443676291, + "grad_norm": 0.007602825589098211, + "learning_rate": 3.1635300847313487e-06, + "loss": 0.0001, + "step": 23893 + }, + { + "epoch": 9.716958113054087, + "grad_norm": 0.24966192032510057, + "learning_rate": 3.162790682057757e-06, + "loss": 0.0023, + "step": 23894 + }, + { + "epoch": 9.717364782431883, + "grad_norm": 0.00024592692142499397, + "learning_rate": 3.162051349571561e-06, + "loss": 0.0, + "step": 23895 + }, + { + "epoch": 9.717771451809678, + "grad_norm": 0.020827078770474957, + "learning_rate": 3.161312087280348e-06, + "loss": 0.0002, + "step": 23896 + }, + { + "epoch": 
9.718178121187474, + "grad_norm": 1.836077539941136, + "learning_rate": 3.160572895191706e-06, + "loss": 0.0261, + "step": 23897 + }, + { + "epoch": 9.71858479056527, + "grad_norm": 0.019795097426392693, + "learning_rate": 3.1598337733132288e-06, + "loss": 0.0001, + "step": 23898 + }, + { + "epoch": 9.718991459943066, + "grad_norm": 0.23512807376075, + "learning_rate": 3.1590947216525e-06, + "loss": 0.0027, + "step": 23899 + }, + { + "epoch": 9.719398129320862, + "grad_norm": 0.12408866364517206, + "learning_rate": 3.1583557402171073e-06, + "loss": 0.0012, + "step": 23900 + }, + { + "epoch": 9.719804798698657, + "grad_norm": 0.098211732640084, + "learning_rate": 3.157616829014636e-06, + "loss": 0.0005, + "step": 23901 + }, + { + "epoch": 9.720211468076453, + "grad_norm": 0.2652854816857001, + "learning_rate": 3.156877988052671e-06, + "loss": 0.0019, + "step": 23902 + }, + { + "epoch": 9.720618137454249, + "grad_norm": 0.019904935591773497, + "learning_rate": 3.1561392173387984e-06, + "loss": 0.0003, + "step": 23903 + }, + { + "epoch": 9.721024806832045, + "grad_norm": 6.602179918252782, + "learning_rate": 3.155400516880598e-06, + "loss": 0.0754, + "step": 23904 + }, + { + "epoch": 9.721431476209842, + "grad_norm": 0.034711065604866746, + "learning_rate": 3.1546618866856593e-06, + "loss": 0.0003, + "step": 23905 + }, + { + "epoch": 9.721838145587638, + "grad_norm": 0.0019924772817464394, + "learning_rate": 3.153923326761561e-06, + "loss": 0.0, + "step": 23906 + }, + { + "epoch": 9.722244814965434, + "grad_norm": 4.427251265697292, + "learning_rate": 3.1531848371158856e-06, + "loss": 0.0522, + "step": 23907 + }, + { + "epoch": 9.72265148434323, + "grad_norm": 2.939696903353877, + "learning_rate": 3.152446417756213e-06, + "loss": 0.0299, + "step": 23908 + }, + { + "epoch": 9.723058153721025, + "grad_norm": 0.004589283616458892, + "learning_rate": 3.151708068690126e-06, + "loss": 0.0, + "step": 23909 + }, + { + "epoch": 9.723464823098821, + "grad_norm": 0.0006744500921271469, + "learning_rate": 3.150969789925198e-06, + "loss": 0.0, + "step": 23910 + }, + { + "epoch": 9.723871492476617, + "grad_norm": 14.29358609834883, + "learning_rate": 3.1502315814690153e-06, + "loss": 0.3466, + "step": 23911 + }, + { + "epoch": 9.724278161854413, + "grad_norm": 2.2165915639891978, + "learning_rate": 3.1494934433291525e-06, + "loss": 0.0124, + "step": 23912 + }, + { + "epoch": 9.724684831232208, + "grad_norm": 0.0884910534777452, + "learning_rate": 3.1487553755131883e-06, + "loss": 0.0012, + "step": 23913 + }, + { + "epoch": 9.725091500610004, + "grad_norm": 0.1462848435757535, + "learning_rate": 3.148017378028697e-06, + "loss": 0.0013, + "step": 23914 + }, + { + "epoch": 9.7254981699878, + "grad_norm": 0.4108985882966071, + "learning_rate": 3.1472794508832573e-06, + "loss": 0.0042, + "step": 23915 + }, + { + "epoch": 9.725904839365596, + "grad_norm": 0.0003257376824796496, + "learning_rate": 3.146541594084442e-06, + "loss": 0.0, + "step": 23916 + }, + { + "epoch": 9.726311508743391, + "grad_norm": 0.004691086232042441, + "learning_rate": 3.1458038076398232e-06, + "loss": 0.0001, + "step": 23917 + }, + { + "epoch": 9.726718178121187, + "grad_norm": 0.257638670067054, + "learning_rate": 3.145066091556982e-06, + "loss": 0.0032, + "step": 23918 + }, + { + "epoch": 9.727124847498983, + "grad_norm": 0.02970924557278581, + "learning_rate": 3.1443284458434875e-06, + "loss": 0.0004, + "step": 23919 + }, + { + "epoch": 9.727531516876779, + "grad_norm": 1.943317633808364, + "learning_rate": 3.1435908705069107e-06, + 
"loss": 0.0179, + "step": 23920 + }, + { + "epoch": 9.727938186254574, + "grad_norm": 0.004549164879539645, + "learning_rate": 3.1428533655548254e-06, + "loss": 0.0001, + "step": 23921 + }, + { + "epoch": 9.72834485563237, + "grad_norm": 0.15990040373182998, + "learning_rate": 3.142115930994801e-06, + "loss": 0.0012, + "step": 23922 + }, + { + "epoch": 9.728751525010166, + "grad_norm": 0.02791757847749209, + "learning_rate": 3.141378566834404e-06, + "loss": 0.0002, + "step": 23923 + }, + { + "epoch": 9.729158194387963, + "grad_norm": 0.06273410323779079, + "learning_rate": 3.140641273081213e-06, + "loss": 0.0007, + "step": 23924 + }, + { + "epoch": 9.72956486376576, + "grad_norm": 1.5389962365891643, + "learning_rate": 3.1399040497427903e-06, + "loss": 0.0183, + "step": 23925 + }, + { + "epoch": 9.729971533143555, + "grad_norm": 0.03306764118763371, + "learning_rate": 3.139166896826705e-06, + "loss": 0.0004, + "step": 23926 + }, + { + "epoch": 9.73037820252135, + "grad_norm": 0.0007415346862512563, + "learning_rate": 3.1384298143405257e-06, + "loss": 0.0, + "step": 23927 + }, + { + "epoch": 9.730784871899147, + "grad_norm": 0.2910349839647253, + "learning_rate": 3.1376928022918164e-06, + "loss": 0.0019, + "step": 23928 + }, + { + "epoch": 9.731191541276942, + "grad_norm": 0.008003144335817891, + "learning_rate": 3.1369558606881446e-06, + "loss": 0.0001, + "step": 23929 + }, + { + "epoch": 9.731598210654738, + "grad_norm": 0.02871633655498485, + "learning_rate": 3.136218989537072e-06, + "loss": 0.0004, + "step": 23930 + }, + { + "epoch": 9.732004880032534, + "grad_norm": 4.261457762607044, + "learning_rate": 3.1354821888461695e-06, + "loss": 0.0533, + "step": 23931 + }, + { + "epoch": 9.73241154941033, + "grad_norm": 0.4441842331823448, + "learning_rate": 3.1347454586229963e-06, + "loss": 0.0044, + "step": 23932 + }, + { + "epoch": 9.732818218788125, + "grad_norm": 0.0014683514681201682, + "learning_rate": 3.1340087988751167e-06, + "loss": 0.0, + "step": 23933 + }, + { + "epoch": 9.733224888165921, + "grad_norm": 0.006404633678479524, + "learning_rate": 3.133272209610092e-06, + "loss": 0.0001, + "step": 23934 + }, + { + "epoch": 9.733631557543717, + "grad_norm": 0.04101089159864206, + "learning_rate": 3.1325356908354842e-06, + "loss": 0.0004, + "step": 23935 + }, + { + "epoch": 9.734038226921513, + "grad_norm": 0.05449766526018082, + "learning_rate": 3.131799242558854e-06, + "loss": 0.0005, + "step": 23936 + }, + { + "epoch": 9.734444896299308, + "grad_norm": 5.465076596314648, + "learning_rate": 3.1310628647877584e-06, + "loss": 0.0705, + "step": 23937 + }, + { + "epoch": 9.734851565677104, + "grad_norm": 0.024412830059042944, + "learning_rate": 3.1303265575297624e-06, + "loss": 0.0002, + "step": 23938 + }, + { + "epoch": 9.7352582350549, + "grad_norm": 11.654034072830669, + "learning_rate": 3.1295903207924215e-06, + "loss": 0.3976, + "step": 23939 + }, + { + "epoch": 9.735664904432696, + "grad_norm": 0.01276549050330461, + "learning_rate": 3.128854154583294e-06, + "loss": 0.0001, + "step": 23940 + }, + { + "epoch": 9.736071573810491, + "grad_norm": 0.03101890404955397, + "learning_rate": 3.128118058909937e-06, + "loss": 0.0003, + "step": 23941 + }, + { + "epoch": 9.736478243188287, + "grad_norm": 6.245532382766804, + "learning_rate": 3.1273820337799067e-06, + "loss": 0.1023, + "step": 23942 + }, + { + "epoch": 9.736884912566083, + "grad_norm": 6.8339434194850295, + "learning_rate": 3.126646079200758e-06, + "loss": 0.1506, + "step": 23943 + }, + { + "epoch": 9.737291581943879, + 
"grad_norm": 0.1124959090506257, + "learning_rate": 3.1259101951800473e-06, + "loss": 0.0017, + "step": 23944 + }, + { + "epoch": 9.737698251321675, + "grad_norm": 3.202829334426125e-05, + "learning_rate": 3.1251743817253277e-06, + "loss": 0.0, + "step": 23945 + }, + { + "epoch": 9.738104920699472, + "grad_norm": 0.01630424280355168, + "learning_rate": 3.124438638844153e-06, + "loss": 0.0001, + "step": 23946 + }, + { + "epoch": 9.738511590077268, + "grad_norm": 0.07675649128752783, + "learning_rate": 3.1237029665440766e-06, + "loss": 0.0006, + "step": 23947 + }, + { + "epoch": 9.738918259455064, + "grad_norm": 0.9378850950706158, + "learning_rate": 3.1229673648326463e-06, + "loss": 0.0079, + "step": 23948 + }, + { + "epoch": 9.73932492883286, + "grad_norm": 0.4368582099780931, + "learning_rate": 3.1222318337174205e-06, + "loss": 0.0026, + "step": 23949 + }, + { + "epoch": 9.739731598210655, + "grad_norm": 7.410684030868305, + "learning_rate": 3.1214963732059467e-06, + "loss": 0.1409, + "step": 23950 + }, + { + "epoch": 9.740138267588451, + "grad_norm": 0.00810853189065326, + "learning_rate": 3.120760983305774e-06, + "loss": 0.0001, + "step": 23951 + }, + { + "epoch": 9.740544936966247, + "grad_norm": 0.8628052788597276, + "learning_rate": 3.120025664024452e-06, + "loss": 0.0069, + "step": 23952 + }, + { + "epoch": 9.740951606344042, + "grad_norm": 0.10446378020731749, + "learning_rate": 3.1192904153695304e-06, + "loss": 0.0011, + "step": 23953 + }, + { + "epoch": 9.741358275721838, + "grad_norm": 0.2528131352913296, + "learning_rate": 3.118555237348554e-06, + "loss": 0.0017, + "step": 23954 + }, + { + "epoch": 9.741764945099634, + "grad_norm": 0.09921308281248208, + "learning_rate": 3.1178201299690693e-06, + "loss": 0.0011, + "step": 23955 + }, + { + "epoch": 9.74217161447743, + "grad_norm": 0.007425301441775666, + "learning_rate": 3.1170850932386276e-06, + "loss": 0.0001, + "step": 23956 + }, + { + "epoch": 9.742578283855226, + "grad_norm": 0.26560749755758095, + "learning_rate": 3.1163501271647724e-06, + "loss": 0.0014, + "step": 23957 + }, + { + "epoch": 9.742984953233021, + "grad_norm": 0.06264438970869378, + "learning_rate": 3.1156152317550458e-06, + "loss": 0.0005, + "step": 23958 + }, + { + "epoch": 9.743391622610817, + "grad_norm": 0.0019782442832060603, + "learning_rate": 3.1148804070169947e-06, + "loss": 0.0, + "step": 23959 + }, + { + "epoch": 9.743798291988613, + "grad_norm": 0.054221206552754225, + "learning_rate": 3.1141456529581615e-06, + "loss": 0.0005, + "step": 23960 + }, + { + "epoch": 9.744204961366409, + "grad_norm": 0.7883691622789495, + "learning_rate": 3.113410969586086e-06, + "loss": 0.0087, + "step": 23961 + }, + { + "epoch": 9.744611630744204, + "grad_norm": 0.00612879042844543, + "learning_rate": 3.112676356908315e-06, + "loss": 0.0, + "step": 23962 + }, + { + "epoch": 9.745018300122, + "grad_norm": 0.0030479457063848176, + "learning_rate": 3.1119418149323877e-06, + "loss": 0.0, + "step": 23963 + }, + { + "epoch": 9.745424969499796, + "grad_norm": 0.1395027750447083, + "learning_rate": 3.111207343665845e-06, + "loss": 0.0004, + "step": 23964 + }, + { + "epoch": 9.745831638877593, + "grad_norm": 0.5927025219951413, + "learning_rate": 3.110472943116226e-06, + "loss": 0.0041, + "step": 23965 + }, + { + "epoch": 9.74623830825539, + "grad_norm": 0.004600109790511323, + "learning_rate": 3.109738613291069e-06, + "loss": 0.0, + "step": 23966 + }, + { + "epoch": 9.746644977633185, + "grad_norm": 0.2488926114648911, + "learning_rate": 3.109004354197913e-06, + "loss": 
0.0019, + "step": 23967 + }, + { + "epoch": 9.74705164701098, + "grad_norm": 0.0024158372423688013, + "learning_rate": 3.108270165844293e-06, + "loss": 0.0, + "step": 23968 + }, + { + "epoch": 9.747458316388776, + "grad_norm": 0.0918140363234596, + "learning_rate": 3.1075360482377505e-06, + "loss": 0.0011, + "step": 23969 + }, + { + "epoch": 9.747864985766572, + "grad_norm": 0.12684604773711702, + "learning_rate": 3.10680200138582e-06, + "loss": 0.0012, + "step": 23970 + }, + { + "epoch": 9.748271655144368, + "grad_norm": 0.30403175400646537, + "learning_rate": 3.106068025296036e-06, + "loss": 0.0022, + "step": 23971 + }, + { + "epoch": 9.748678324522164, + "grad_norm": 0.17916735467139214, + "learning_rate": 3.1053341199759335e-06, + "loss": 0.002, + "step": 23972 + }, + { + "epoch": 9.74908499389996, + "grad_norm": 0.022742343211493616, + "learning_rate": 3.104600285433046e-06, + "loss": 0.0002, + "step": 23973 + }, + { + "epoch": 9.749491663277755, + "grad_norm": 0.025737674590047457, + "learning_rate": 3.103866521674904e-06, + "loss": 0.0002, + "step": 23974 + }, + { + "epoch": 9.749898332655551, + "grad_norm": 0.01705159640172643, + "learning_rate": 3.1031328287090467e-06, + "loss": 0.0001, + "step": 23975 + }, + { + "epoch": 9.750305002033347, + "grad_norm": 0.49175331037848014, + "learning_rate": 3.1023992065430007e-06, + "loss": 0.0043, + "step": 23976 + }, + { + "epoch": 9.750711671411143, + "grad_norm": 0.7824756808873947, + "learning_rate": 3.1016656551842995e-06, + "loss": 0.0046, + "step": 23977 + }, + { + "epoch": 9.751118340788938, + "grad_norm": 0.03461162658121445, + "learning_rate": 3.100932174640472e-06, + "loss": 0.0002, + "step": 23978 + }, + { + "epoch": 9.751525010166734, + "grad_norm": 0.02607857192806473, + "learning_rate": 3.100198764919048e-06, + "loss": 0.0003, + "step": 23979 + }, + { + "epoch": 9.75193167954453, + "grad_norm": 0.061515172940886785, + "learning_rate": 3.099465426027556e-06, + "loss": 0.0009, + "step": 23980 + }, + { + "epoch": 9.752338348922326, + "grad_norm": 0.016081427141937204, + "learning_rate": 3.098732157973521e-06, + "loss": 0.0002, + "step": 23981 + }, + { + "epoch": 9.752745018300121, + "grad_norm": 0.0016980483384725867, + "learning_rate": 3.0979989607644767e-06, + "loss": 0.0, + "step": 23982 + }, + { + "epoch": 9.753151687677917, + "grad_norm": 0.1640317802275476, + "learning_rate": 3.0972658344079475e-06, + "loss": 0.0011, + "step": 23983 + }, + { + "epoch": 9.753558357055713, + "grad_norm": 0.1949875889617368, + "learning_rate": 3.0965327789114574e-06, + "loss": 0.0021, + "step": 23984 + }, + { + "epoch": 9.753965026433509, + "grad_norm": 3.8878069760136187, + "learning_rate": 3.0957997942825337e-06, + "loss": 0.056, + "step": 23985 + }, + { + "epoch": 9.754371695811304, + "grad_norm": 0.001118802352589326, + "learning_rate": 3.0950668805286997e-06, + "loss": 0.0, + "step": 23986 + }, + { + "epoch": 9.754778365189102, + "grad_norm": 0.004476898072986323, + "learning_rate": 3.094334037657477e-06, + "loss": 0.0, + "step": 23987 + }, + { + "epoch": 9.755185034566898, + "grad_norm": 0.0021651753218134106, + "learning_rate": 3.0936012656763937e-06, + "loss": 0.0, + "step": 23988 + }, + { + "epoch": 9.755591703944694, + "grad_norm": 1.7586762049472011, + "learning_rate": 3.092868564592969e-06, + "loss": 0.0175, + "step": 23989 + }, + { + "epoch": 9.75599837332249, + "grad_norm": 0.08699723148211254, + "learning_rate": 3.092135934414724e-06, + "loss": 0.0011, + "step": 23990 + }, + { + "epoch": 9.756405042700285, + "grad_norm": 
0.34660483218714655, + "learning_rate": 3.0914033751491814e-06, + "loss": 0.0026, + "step": 23991 + }, + { + "epoch": 9.75681171207808, + "grad_norm": 12.095909942282951, + "learning_rate": 3.0906708868038605e-06, + "loss": 0.2229, + "step": 23992 + }, + { + "epoch": 9.757218381455877, + "grad_norm": 0.13646172896582556, + "learning_rate": 3.0899384693862787e-06, + "loss": 0.0009, + "step": 23993 + }, + { + "epoch": 9.757625050833672, + "grad_norm": 28.318143826287958, + "learning_rate": 3.089206122903955e-06, + "loss": 0.2512, + "step": 23994 + }, + { + "epoch": 9.758031720211468, + "grad_norm": 0.15307463395960194, + "learning_rate": 3.0884738473644104e-06, + "loss": 0.0016, + "step": 23995 + }, + { + "epoch": 9.758438389589264, + "grad_norm": 15.004498506559612, + "learning_rate": 3.0877416427751607e-06, + "loss": 0.153, + "step": 23996 + }, + { + "epoch": 9.75884505896706, + "grad_norm": 0.011229961951983376, + "learning_rate": 3.0870095091437224e-06, + "loss": 0.0001, + "step": 23997 + }, + { + "epoch": 9.759251728344855, + "grad_norm": 5.272561451235926, + "learning_rate": 3.0862774464776102e-06, + "loss": 0.043, + "step": 23998 + }, + { + "epoch": 9.759658397722651, + "grad_norm": 0.005259735576680971, + "learning_rate": 3.08554545478434e-06, + "loss": 0.0001, + "step": 23999 + }, + { + "epoch": 9.760065067100447, + "grad_norm": 0.1305170932588221, + "learning_rate": 3.0848135340714212e-06, + "loss": 0.001, + "step": 24000 + }, + { + "epoch": 9.760471736478243, + "grad_norm": 0.006628487252934526, + "learning_rate": 3.0840816843463762e-06, + "loss": 0.0, + "step": 24001 + }, + { + "epoch": 9.760878405856038, + "grad_norm": 0.2658505415327547, + "learning_rate": 3.083349905616714e-06, + "loss": 0.0017, + "step": 24002 + }, + { + "epoch": 9.761285075233834, + "grad_norm": 2.5518029937437787, + "learning_rate": 3.082618197889945e-06, + "loss": 0.0113, + "step": 24003 + }, + { + "epoch": 9.76169174461163, + "grad_norm": 0.00476678471936231, + "learning_rate": 3.0818865611735816e-06, + "loss": 0.0, + "step": 24004 + }, + { + "epoch": 9.762098413989426, + "grad_norm": 0.009231643556987948, + "learning_rate": 3.0811549954751352e-06, + "loss": 0.0001, + "step": 24005 + }, + { + "epoch": 9.762505083367223, + "grad_norm": 3.8454874030501656, + "learning_rate": 3.0804235008021142e-06, + "loss": 0.0532, + "step": 24006 + }, + { + "epoch": 9.762911752745019, + "grad_norm": 0.06254682487811557, + "learning_rate": 3.0796920771620254e-06, + "loss": 0.0007, + "step": 24007 + }, + { + "epoch": 9.763318422122815, + "grad_norm": 1.2998515906279726, + "learning_rate": 3.0789607245623845e-06, + "loss": 0.0101, + "step": 24008 + }, + { + "epoch": 9.76372509150061, + "grad_norm": 0.00041641815013883463, + "learning_rate": 3.078229443010694e-06, + "loss": 0.0, + "step": 24009 + }, + { + "epoch": 9.764131760878406, + "grad_norm": 0.15355168932949237, + "learning_rate": 3.077498232514462e-06, + "loss": 0.0015, + "step": 24010 + }, + { + "epoch": 9.764538430256202, + "grad_norm": 0.09019055019005094, + "learning_rate": 3.0767670930811956e-06, + "loss": 0.001, + "step": 24011 + }, + { + "epoch": 9.764945099633998, + "grad_norm": 0.006280510159426959, + "learning_rate": 3.0760360247183984e-06, + "loss": 0.0001, + "step": 24012 + }, + { + "epoch": 9.765351769011794, + "grad_norm": 0.03874087048808229, + "learning_rate": 3.075305027433574e-06, + "loss": 0.0004, + "step": 24013 + }, + { + "epoch": 9.76575843838959, + "grad_norm": 0.014234479519099586, + "learning_rate": 3.07457410123423e-06, + "loss": 0.0002, + 
"step": 24014 + }, + { + "epoch": 9.766165107767385, + "grad_norm": 0.0011244912122507862, + "learning_rate": 3.07384324612787e-06, + "loss": 0.0, + "step": 24015 + }, + { + "epoch": 9.766571777145181, + "grad_norm": 0.10587596198364145, + "learning_rate": 3.0731124621219943e-06, + "loss": 0.0012, + "step": 24016 + }, + { + "epoch": 9.766978446522977, + "grad_norm": 0.03254426139590817, + "learning_rate": 3.072381749224106e-06, + "loss": 0.0003, + "step": 24017 + }, + { + "epoch": 9.767385115900773, + "grad_norm": 0.00209533338388022, + "learning_rate": 3.0716511074417043e-06, + "loss": 0.0, + "step": 24018 + }, + { + "epoch": 9.767791785278568, + "grad_norm": 0.0007926509751846095, + "learning_rate": 3.0709205367822926e-06, + "loss": 0.0, + "step": 24019 + }, + { + "epoch": 9.768198454656364, + "grad_norm": 0.41856838961554826, + "learning_rate": 3.0701900372533645e-06, + "loss": 0.0018, + "step": 24020 + }, + { + "epoch": 9.76860512403416, + "grad_norm": 0.08902626629544902, + "learning_rate": 3.0694596088624274e-06, + "loss": 0.001, + "step": 24021 + }, + { + "epoch": 9.769011793411956, + "grad_norm": 21.86955943843197, + "learning_rate": 3.0687292516169753e-06, + "loss": 0.5334, + "step": 24022 + }, + { + "epoch": 9.769418462789751, + "grad_norm": 0.9494594318327383, + "learning_rate": 3.0679989655245045e-06, + "loss": 0.0123, + "step": 24023 + }, + { + "epoch": 9.769825132167547, + "grad_norm": 1.2574573730589051, + "learning_rate": 3.0672687505925146e-06, + "loss": 0.0135, + "step": 24024 + }, + { + "epoch": 9.770231801545343, + "grad_norm": 0.0675632229551965, + "learning_rate": 3.0665386068284997e-06, + "loss": 0.0007, + "step": 24025 + }, + { + "epoch": 9.770638470923139, + "grad_norm": 1.1312301636511444, + "learning_rate": 3.0658085342399523e-06, + "loss": 0.0138, + "step": 24026 + }, + { + "epoch": 9.771045140300934, + "grad_norm": 0.14334834098417187, + "learning_rate": 3.0650785328343746e-06, + "loss": 0.0011, + "step": 24027 + }, + { + "epoch": 9.771451809678732, + "grad_norm": 0.019191526697233563, + "learning_rate": 3.0643486026192547e-06, + "loss": 0.0001, + "step": 24028 + }, + { + "epoch": 9.771858479056528, + "grad_norm": 0.12126537548928538, + "learning_rate": 3.063618743602087e-06, + "loss": 0.0017, + "step": 24029 + }, + { + "epoch": 9.772265148434323, + "grad_norm": 0.4717498602705826, + "learning_rate": 3.062888955790364e-06, + "loss": 0.0025, + "step": 24030 + }, + { + "epoch": 9.77267181781212, + "grad_norm": 1.1446219190082083, + "learning_rate": 3.062159239191578e-06, + "loss": 0.0126, + "step": 24031 + }, + { + "epoch": 9.773078487189915, + "grad_norm": 0.10444462276046453, + "learning_rate": 3.0614295938132187e-06, + "loss": 0.0009, + "step": 24032 + }, + { + "epoch": 9.77348515656771, + "grad_norm": 5.2429852834408734, + "learning_rate": 3.0607000196627736e-06, + "loss": 0.0325, + "step": 24033 + }, + { + "epoch": 9.773891825945507, + "grad_norm": 0.35199245992917105, + "learning_rate": 3.059970516747738e-06, + "loss": 0.0031, + "step": 24034 + }, + { + "epoch": 9.774298495323302, + "grad_norm": 0.0810454541057691, + "learning_rate": 3.0592410850755983e-06, + "loss": 0.0008, + "step": 24035 + }, + { + "epoch": 9.774705164701098, + "grad_norm": 0.42530195106998053, + "learning_rate": 3.058511724653842e-06, + "loss": 0.0072, + "step": 24036 + }, + { + "epoch": 9.775111834078894, + "grad_norm": 0.9432779428405085, + "learning_rate": 3.0577824354899566e-06, + "loss": 0.0076, + "step": 24037 + }, + { + "epoch": 9.77551850345669, + "grad_norm": 
0.04074793643931566, + "learning_rate": 3.057053217591428e-06, + "loss": 0.0004, + "step": 24038 + }, + { + "epoch": 9.775925172834485, + "grad_norm": 0.08760519156189238, + "learning_rate": 3.0563240709657426e-06, + "loss": 0.0011, + "step": 24039 + }, + { + "epoch": 9.776331842212281, + "grad_norm": 0.00048336890448612827, + "learning_rate": 3.0555949956203824e-06, + "loss": 0.0, + "step": 24040 + }, + { + "epoch": 9.776738511590077, + "grad_norm": 0.005612496413308643, + "learning_rate": 3.054865991562841e-06, + "loss": 0.0001, + "step": 24041 + }, + { + "epoch": 9.777145180967873, + "grad_norm": 0.061787096747530496, + "learning_rate": 3.0541370588005915e-06, + "loss": 0.0008, + "step": 24042 + }, + { + "epoch": 9.777551850345668, + "grad_norm": 0.02772409496951365, + "learning_rate": 3.0534081973411224e-06, + "loss": 0.0002, + "step": 24043 + }, + { + "epoch": 9.777958519723464, + "grad_norm": 0.08983077386922701, + "learning_rate": 3.052679407191913e-06, + "loss": 0.0009, + "step": 24044 + }, + { + "epoch": 9.77836518910126, + "grad_norm": 0.10354034769122951, + "learning_rate": 3.0519506883604435e-06, + "loss": 0.0007, + "step": 24045 + }, + { + "epoch": 9.778771858479056, + "grad_norm": 0.047155112233793484, + "learning_rate": 3.0512220408541993e-06, + "loss": 0.0006, + "step": 24046 + }, + { + "epoch": 9.779178527856853, + "grad_norm": 0.016282011769662146, + "learning_rate": 3.0504934646806594e-06, + "loss": 0.0002, + "step": 24047 + }, + { + "epoch": 9.779585197234649, + "grad_norm": 0.023895198953789137, + "learning_rate": 3.049764959847301e-06, + "loss": 0.0003, + "step": 24048 + }, + { + "epoch": 9.779991866612445, + "grad_norm": 0.02495375560909171, + "learning_rate": 3.049036526361604e-06, + "loss": 0.0002, + "step": 24049 + }, + { + "epoch": 9.78039853599024, + "grad_norm": 1.3496964746521176, + "learning_rate": 3.048308164231044e-06, + "loss": 0.0146, + "step": 24050 + }, + { + "epoch": 9.780805205368036, + "grad_norm": 0.010885093218519317, + "learning_rate": 3.0475798734630966e-06, + "loss": 0.0001, + "step": 24051 + }, + { + "epoch": 9.781211874745832, + "grad_norm": 0.0007585438950281862, + "learning_rate": 3.046851654065245e-06, + "loss": 0.0, + "step": 24052 + }, + { + "epoch": 9.781618544123628, + "grad_norm": 0.0009903239963381747, + "learning_rate": 3.04612350604496e-06, + "loss": 0.0, + "step": 24053 + }, + { + "epoch": 9.782025213501424, + "grad_norm": 0.7934659337498011, + "learning_rate": 3.045395429409718e-06, + "loss": 0.0081, + "step": 24054 + }, + { + "epoch": 9.78243188287922, + "grad_norm": 0.056332879521338516, + "learning_rate": 3.0446674241669903e-06, + "loss": 0.0005, + "step": 24055 + }, + { + "epoch": 9.782838552257015, + "grad_norm": 0.003127302538188787, + "learning_rate": 3.0439394903242536e-06, + "loss": 0.0, + "step": 24056 + }, + { + "epoch": 9.783245221634811, + "grad_norm": 0.18440104187785325, + "learning_rate": 3.043211627888978e-06, + "loss": 0.0029, + "step": 24057 + }, + { + "epoch": 9.783651891012607, + "grad_norm": 12.844356129839268, + "learning_rate": 3.042483836868634e-06, + "loss": 0.1997, + "step": 24058 + }, + { + "epoch": 9.784058560390402, + "grad_norm": 0.000894663001013781, + "learning_rate": 3.041756117270698e-06, + "loss": 0.0, + "step": 24059 + }, + { + "epoch": 9.784465229768198, + "grad_norm": 0.2069163547974832, + "learning_rate": 3.041028469102636e-06, + "loss": 0.0021, + "step": 24060 + }, + { + "epoch": 9.784871899145994, + "grad_norm": 0.0018560608756980766, + "learning_rate": 3.0403008923719214e-06, + "loss": 
0.0, + "step": 24061 + }, + { + "epoch": 9.78527856852379, + "grad_norm": 0.029349063746508898, + "learning_rate": 3.039573387086019e-06, + "loss": 0.0004, + "step": 24062 + }, + { + "epoch": 9.785685237901586, + "grad_norm": 0.0011064937107042505, + "learning_rate": 3.0388459532523995e-06, + "loss": 0.0, + "step": 24063 + }, + { + "epoch": 9.786091907279381, + "grad_norm": 0.24711728524121332, + "learning_rate": 3.038118590878527e-06, + "loss": 0.0014, + "step": 24064 + }, + { + "epoch": 9.786498576657177, + "grad_norm": 2.63495300380808, + "learning_rate": 3.0373912999718735e-06, + "loss": 0.0339, + "step": 24065 + }, + { + "epoch": 9.786905246034973, + "grad_norm": 0.45899060798658714, + "learning_rate": 3.036664080539904e-06, + "loss": 0.0025, + "step": 24066 + }, + { + "epoch": 9.787311915412769, + "grad_norm": 5.708356283268918, + "learning_rate": 3.035936932590081e-06, + "loss": 0.0235, + "step": 24067 + }, + { + "epoch": 9.787718584790564, + "grad_norm": 0.0026614873082810445, + "learning_rate": 3.0352098561298704e-06, + "loss": 0.0, + "step": 24068 + }, + { + "epoch": 9.788125254168362, + "grad_norm": 0.0021594432904737783, + "learning_rate": 3.0344828511667355e-06, + "loss": 0.0, + "step": 24069 + }, + { + "epoch": 9.788531923546158, + "grad_norm": 2.1637619247219755, + "learning_rate": 3.033755917708141e-06, + "loss": 0.0691, + "step": 24070 + }, + { + "epoch": 9.788938592923953, + "grad_norm": 0.046276474333894514, + "learning_rate": 3.0330290557615437e-06, + "loss": 0.0004, + "step": 24071 + }, + { + "epoch": 9.78934526230175, + "grad_norm": 0.0007866232732343822, + "learning_rate": 3.0323022653344135e-06, + "loss": 0.0, + "step": 24072 + }, + { + "epoch": 9.789751931679545, + "grad_norm": 0.558339868387274, + "learning_rate": 3.0315755464342077e-06, + "loss": 0.0064, + "step": 24073 + }, + { + "epoch": 9.79015860105734, + "grad_norm": 0.04084750014441021, + "learning_rate": 3.0308488990683847e-06, + "loss": 0.0005, + "step": 24074 + }, + { + "epoch": 9.790565270435136, + "grad_norm": 1.076869008022917, + "learning_rate": 3.0301223232444065e-06, + "loss": 0.0088, + "step": 24075 + }, + { + "epoch": 9.790971939812932, + "grad_norm": 0.12636525379122698, + "learning_rate": 3.0293958189697304e-06, + "loss": 0.0009, + "step": 24076 + }, + { + "epoch": 9.791378609190728, + "grad_norm": 0.02616224076433027, + "learning_rate": 3.0286693862518102e-06, + "loss": 0.0003, + "step": 24077 + }, + { + "epoch": 9.791785278568524, + "grad_norm": 0.28171997980866187, + "learning_rate": 3.0279430250981114e-06, + "loss": 0.0025, + "step": 24078 + }, + { + "epoch": 9.79219194794632, + "grad_norm": 0.6412765056066516, + "learning_rate": 3.0272167355160864e-06, + "loss": 0.007, + "step": 24079 + }, + { + "epoch": 9.792598617324115, + "grad_norm": 0.04847663447787403, + "learning_rate": 3.026490517513191e-06, + "loss": 0.0005, + "step": 24080 + }, + { + "epoch": 9.793005286701911, + "grad_norm": 13.004493911805154, + "learning_rate": 3.0257643710968797e-06, + "loss": 0.301, + "step": 24081 + }, + { + "epoch": 9.793411956079707, + "grad_norm": 0.002165642176649555, + "learning_rate": 3.025038296274607e-06, + "loss": 0.0, + "step": 24082 + }, + { + "epoch": 9.793818625457503, + "grad_norm": 0.10792463662807808, + "learning_rate": 3.0243122930538273e-06, + "loss": 0.0008, + "step": 24083 + }, + { + "epoch": 9.794225294835298, + "grad_norm": 0.5307935000972456, + "learning_rate": 3.023586361441989e-06, + "loss": 0.0031, + "step": 24084 + }, + { + "epoch": 9.794631964213094, + "grad_norm": 
0.057639950628100796, + "learning_rate": 3.0228605014465516e-06, + "loss": 0.0004, + "step": 24085 + }, + { + "epoch": 9.79503863359089, + "grad_norm": 0.0013441812365003313, + "learning_rate": 3.0221347130749623e-06, + "loss": 0.0, + "step": 24086 + }, + { + "epoch": 9.795445302968686, + "grad_norm": 0.1685803639159564, + "learning_rate": 3.0214089963346725e-06, + "loss": 0.0009, + "step": 24087 + }, + { + "epoch": 9.795851972346483, + "grad_norm": 0.014302162487148138, + "learning_rate": 3.0206833512331314e-06, + "loss": 0.0002, + "step": 24088 + }, + { + "epoch": 9.796258641724279, + "grad_norm": 0.0032893630788327294, + "learning_rate": 3.019957777777788e-06, + "loss": 0.0, + "step": 24089 + }, + { + "epoch": 9.796665311102075, + "grad_norm": 0.014119334710422123, + "learning_rate": 3.0192322759760874e-06, + "loss": 0.0002, + "step": 24090 + }, + { + "epoch": 9.79707198047987, + "grad_norm": 1.3394022991464094, + "learning_rate": 3.018506845835485e-06, + "loss": 0.0162, + "step": 24091 + }, + { + "epoch": 9.797478649857666, + "grad_norm": 0.04579604710300927, + "learning_rate": 3.0177814873634225e-06, + "loss": 0.0003, + "step": 24092 + }, + { + "epoch": 9.797885319235462, + "grad_norm": 0.011512802058284633, + "learning_rate": 3.0170562005673476e-06, + "loss": 0.0001, + "step": 24093 + }, + { + "epoch": 9.798291988613258, + "grad_norm": 0.06330866339865937, + "learning_rate": 3.0163309854547062e-06, + "loss": 0.0006, + "step": 24094 + }, + { + "epoch": 9.798698657991054, + "grad_norm": 0.004345181828928203, + "learning_rate": 3.0156058420329405e-06, + "loss": 0.0, + "step": 24095 + }, + { + "epoch": 9.79910532736885, + "grad_norm": 0.4825058956027617, + "learning_rate": 3.0148807703094974e-06, + "loss": 0.0026, + "step": 24096 + }, + { + "epoch": 9.799511996746645, + "grad_norm": 0.008556848883576863, + "learning_rate": 3.0141557702918144e-06, + "loss": 0.0001, + "step": 24097 + }, + { + "epoch": 9.79991866612444, + "grad_norm": 0.016494489783743082, + "learning_rate": 3.0134308419873416e-06, + "loss": 0.0001, + "step": 24098 + }, + { + "epoch": 9.800325335502237, + "grad_norm": 0.012855031959558018, + "learning_rate": 3.012705985403518e-06, + "loss": 0.0001, + "step": 24099 + }, + { + "epoch": 9.800732004880032, + "grad_norm": 0.3359818309967844, + "learning_rate": 3.0119812005477844e-06, + "loss": 0.0037, + "step": 24100 + }, + { + "epoch": 9.801138674257828, + "grad_norm": 0.36035355612043757, + "learning_rate": 3.01125648742758e-06, + "loss": 0.0033, + "step": 24101 + }, + { + "epoch": 9.801545343635624, + "grad_norm": 2.984275086119704, + "learning_rate": 3.010531846050345e-06, + "loss": 0.0362, + "step": 24102 + }, + { + "epoch": 9.80195201301342, + "grad_norm": 0.9005517351288334, + "learning_rate": 3.0098072764235166e-06, + "loss": 0.0043, + "step": 24103 + }, + { + "epoch": 9.802358682391215, + "grad_norm": 1.357792078622951, + "learning_rate": 3.009082778554536e-06, + "loss": 0.0117, + "step": 24104 + }, + { + "epoch": 9.802765351769011, + "grad_norm": 0.06551489283734609, + "learning_rate": 3.0083583524508396e-06, + "loss": 0.0009, + "step": 24105 + }, + { + "epoch": 9.803172021146807, + "grad_norm": 0.5696215777969666, + "learning_rate": 3.007633998119864e-06, + "loss": 0.003, + "step": 24106 + }, + { + "epoch": 9.803578690524603, + "grad_norm": 0.12045720681870463, + "learning_rate": 3.006909715569044e-06, + "loss": 0.0008, + "step": 24107 + }, + { + "epoch": 9.803985359902398, + "grad_norm": 0.01712687663047735, + "learning_rate": 3.0061855048058164e-06, + "loss": 
0.0002, + "step": 24108 + }, + { + "epoch": 9.804392029280194, + "grad_norm": 0.0018940780978966378, + "learning_rate": 3.005461365837613e-06, + "loss": 0.0, + "step": 24109 + }, + { + "epoch": 9.804798698657992, + "grad_norm": 0.30047046836353936, + "learning_rate": 3.004737298671867e-06, + "loss": 0.004, + "step": 24110 + }, + { + "epoch": 9.805205368035788, + "grad_norm": 0.007999385551562984, + "learning_rate": 3.0040133033160157e-06, + "loss": 0.0001, + "step": 24111 + }, + { + "epoch": 9.805612037413583, + "grad_norm": 0.7900997512493575, + "learning_rate": 3.0032893797774897e-06, + "loss": 0.0112, + "step": 24112 + }, + { + "epoch": 9.806018706791379, + "grad_norm": 0.00498900505195045, + "learning_rate": 3.002565528063719e-06, + "loss": 0.0, + "step": 24113 + }, + { + "epoch": 9.806425376169175, + "grad_norm": 0.03628182796214992, + "learning_rate": 3.001841748182135e-06, + "loss": 0.0004, + "step": 24114 + }, + { + "epoch": 9.80683204554697, + "grad_norm": 0.015740465836918426, + "learning_rate": 3.001118040140167e-06, + "loss": 0.0002, + "step": 24115 + }, + { + "epoch": 9.807238714924766, + "grad_norm": 0.004997852027913166, + "learning_rate": 3.0003944039452424e-06, + "loss": 0.0001, + "step": 24116 + }, + { + "epoch": 9.807645384302562, + "grad_norm": 0.0012096737610780336, + "learning_rate": 2.999670839604796e-06, + "loss": 0.0, + "step": 24117 + }, + { + "epoch": 9.808052053680358, + "grad_norm": 0.13977597199403685, + "learning_rate": 2.99894734712625e-06, + "loss": 0.0009, + "step": 24118 + }, + { + "epoch": 9.808458723058154, + "grad_norm": 0.0019843394208257418, + "learning_rate": 2.9982239265170333e-06, + "loss": 0.0, + "step": 24119 + }, + { + "epoch": 9.80886539243595, + "grad_norm": 37.875838323952884, + "learning_rate": 2.9975005777845723e-06, + "loss": 0.7129, + "step": 24120 + }, + { + "epoch": 9.809272061813745, + "grad_norm": 0.23048981333630167, + "learning_rate": 2.996777300936292e-06, + "loss": 0.0022, + "step": 24121 + }, + { + "epoch": 9.809678731191541, + "grad_norm": 0.0006216526779911474, + "learning_rate": 2.996054095979618e-06, + "loss": 0.0, + "step": 24122 + }, + { + "epoch": 9.810085400569337, + "grad_norm": 0.019271826552933323, + "learning_rate": 2.9953309629219698e-06, + "loss": 0.0001, + "step": 24123 + }, + { + "epoch": 9.810492069947133, + "grad_norm": 0.010494239702452456, + "learning_rate": 2.9946079017707774e-06, + "loss": 0.0002, + "step": 24124 + }, + { + "epoch": 9.810898739324928, + "grad_norm": 0.9271318361883607, + "learning_rate": 2.993884912533461e-06, + "loss": 0.0084, + "step": 24125 + }, + { + "epoch": 9.811305408702724, + "grad_norm": 0.3096407002157529, + "learning_rate": 2.993161995217443e-06, + "loss": 0.0018, + "step": 24126 + }, + { + "epoch": 9.81171207808052, + "grad_norm": 0.046666688108976936, + "learning_rate": 2.992439149830143e-06, + "loss": 0.0006, + "step": 24127 + }, + { + "epoch": 9.812118747458316, + "grad_norm": 8.408903558866992, + "learning_rate": 2.9917163763789814e-06, + "loss": 0.1549, + "step": 24128 + }, + { + "epoch": 9.812525416836113, + "grad_norm": 0.009041707710744444, + "learning_rate": 2.990993674871376e-06, + "loss": 0.0001, + "step": 24129 + }, + { + "epoch": 9.812932086213909, + "grad_norm": 0.16301604218830018, + "learning_rate": 2.990271045314751e-06, + "loss": 0.0013, + "step": 24130 + }, + { + "epoch": 9.813338755591705, + "grad_norm": 0.0031761931389595074, + "learning_rate": 2.9895484877165214e-06, + "loss": 0.0, + "step": 24131 + }, + { + "epoch": 9.8137454249695, + "grad_norm": 
2.8455887629478336, + "learning_rate": 2.9888260020841055e-06, + "loss": 0.0335, + "step": 24132 + }, + { + "epoch": 9.814152094347296, + "grad_norm": 0.12714930193880125, + "learning_rate": 2.988103588424919e-06, + "loss": 0.001, + "step": 24133 + }, + { + "epoch": 9.814558763725092, + "grad_norm": 0.01237348914242753, + "learning_rate": 2.9873812467463782e-06, + "loss": 0.0001, + "step": 24134 + }, + { + "epoch": 9.814965433102888, + "grad_norm": 0.5659149648144997, + "learning_rate": 2.9866589770558973e-06, + "loss": 0.0072, + "step": 24135 + }, + { + "epoch": 9.815372102480683, + "grad_norm": 0.09862188620850625, + "learning_rate": 2.9859367793608896e-06, + "loss": 0.0012, + "step": 24136 + }, + { + "epoch": 9.81577877185848, + "grad_norm": 0.04461248387657929, + "learning_rate": 2.9852146536687744e-06, + "loss": 0.0004, + "step": 24137 + }, + { + "epoch": 9.816185441236275, + "grad_norm": 0.011174791385966266, + "learning_rate": 2.98449259998696e-06, + "loss": 0.0001, + "step": 24138 + }, + { + "epoch": 9.81659211061407, + "grad_norm": 2.2760301379153733, + "learning_rate": 2.983770618322861e-06, + "loss": 0.0225, + "step": 24139 + }, + { + "epoch": 9.816998779991867, + "grad_norm": 0.09752884210962827, + "learning_rate": 2.9830487086838878e-06, + "loss": 0.0008, + "step": 24140 + }, + { + "epoch": 9.817405449369662, + "grad_norm": 0.1607279851534217, + "learning_rate": 2.982326871077451e-06, + "loss": 0.0016, + "step": 24141 + }, + { + "epoch": 9.817812118747458, + "grad_norm": 0.5951609349462302, + "learning_rate": 2.9816051055109605e-06, + "loss": 0.0049, + "step": 24142 + }, + { + "epoch": 9.818218788125254, + "grad_norm": 0.015253319627576896, + "learning_rate": 2.9808834119918262e-06, + "loss": 0.0002, + "step": 24143 + }, + { + "epoch": 9.81862545750305, + "grad_norm": 0.0023902671573358346, + "learning_rate": 2.980161790527456e-06, + "loss": 0.0, + "step": 24144 + }, + { + "epoch": 9.819032126880845, + "grad_norm": 5.490208753636631, + "learning_rate": 2.9794402411252575e-06, + "loss": 0.0356, + "step": 24145 + }, + { + "epoch": 9.819438796258641, + "grad_norm": 0.0006454606309211089, + "learning_rate": 2.9787187637926394e-06, + "loss": 0.0, + "step": 24146 + }, + { + "epoch": 9.819845465636437, + "grad_norm": 0.02129648145871026, + "learning_rate": 2.977997358537007e-06, + "loss": 0.0003, + "step": 24147 + }, + { + "epoch": 9.820252135014233, + "grad_norm": 0.2780091925934552, + "learning_rate": 2.977276025365762e-06, + "loss": 0.0029, + "step": 24148 + }, + { + "epoch": 9.820658804392028, + "grad_norm": 0.02463708180203989, + "learning_rate": 2.9765547642863156e-06, + "loss": 0.0002, + "step": 24149 + }, + { + "epoch": 9.821065473769824, + "grad_norm": 0.8740106506451225, + "learning_rate": 2.9758335753060706e-06, + "loss": 0.0107, + "step": 24150 + }, + { + "epoch": 9.821472143147622, + "grad_norm": 0.004131269519424873, + "learning_rate": 2.9751124584324287e-06, + "loss": 0.0, + "step": 24151 + }, + { + "epoch": 9.821878812525418, + "grad_norm": 0.13987038557916842, + "learning_rate": 2.9743914136727927e-06, + "loss": 0.0016, + "step": 24152 + }, + { + "epoch": 9.822285481903213, + "grad_norm": 0.009938136328865733, + "learning_rate": 2.973670441034565e-06, + "loss": 0.0001, + "step": 24153 + }, + { + "epoch": 9.822692151281009, + "grad_norm": 0.001333841317680182, + "learning_rate": 2.972949540525144e-06, + "loss": 0.0, + "step": 24154 + }, + { + "epoch": 9.823098820658805, + "grad_norm": 0.03814153689868419, + "learning_rate": 2.9722287121519345e-06, + "loss": 0.0005, + 
"step": 24155 + }, + { + "epoch": 9.8235054900366, + "grad_norm": 29.156043801186723, + "learning_rate": 2.9715079559223346e-06, + "loss": 1.0903, + "step": 24156 + }, + { + "epoch": 9.823912159414396, + "grad_norm": 0.0007125276323302824, + "learning_rate": 2.9707872718437436e-06, + "loss": 0.0, + "step": 24157 + }, + { + "epoch": 9.824318828792192, + "grad_norm": 0.018078831861148236, + "learning_rate": 2.970066659923558e-06, + "loss": 0.0002, + "step": 24158 + }, + { + "epoch": 9.824725498169988, + "grad_norm": 0.7136527755507422, + "learning_rate": 2.9693461201691776e-06, + "loss": 0.0091, + "step": 24159 + }, + { + "epoch": 9.825132167547784, + "grad_norm": 0.13897322113005828, + "learning_rate": 2.9686256525879964e-06, + "loss": 0.0022, + "step": 24160 + }, + { + "epoch": 9.82553883692558, + "grad_norm": 2.6442516558037323, + "learning_rate": 2.967905257187409e-06, + "loss": 0.0213, + "step": 24161 + }, + { + "epoch": 9.825945506303375, + "grad_norm": 0.0019678944400241434, + "learning_rate": 2.967184933974817e-06, + "loss": 0.0, + "step": 24162 + }, + { + "epoch": 9.826352175681171, + "grad_norm": 0.012741062723207205, + "learning_rate": 2.9664646829576104e-06, + "loss": 0.0001, + "step": 24163 + }, + { + "epoch": 9.826758845058967, + "grad_norm": 0.030745744119585725, + "learning_rate": 2.9657445041431845e-06, + "loss": 0.0003, + "step": 24164 + }, + { + "epoch": 9.827165514436762, + "grad_norm": 0.08470466002971484, + "learning_rate": 2.9650243975389305e-06, + "loss": 0.0008, + "step": 24165 + }, + { + "epoch": 9.827572183814558, + "grad_norm": 0.005678294209516623, + "learning_rate": 2.9643043631522427e-06, + "loss": 0.0001, + "step": 24166 + }, + { + "epoch": 9.827978853192354, + "grad_norm": 1.8993794922269138, + "learning_rate": 2.963584400990508e-06, + "loss": 0.0165, + "step": 24167 + }, + { + "epoch": 9.82838552257015, + "grad_norm": 0.01964736432522354, + "learning_rate": 2.962864511061124e-06, + "loss": 0.0002, + "step": 24168 + }, + { + "epoch": 9.828792191947946, + "grad_norm": 0.12586006105170763, + "learning_rate": 2.962144693371477e-06, + "loss": 0.0016, + "step": 24169 + }, + { + "epoch": 9.829198861325743, + "grad_norm": 0.216605051689474, + "learning_rate": 2.961424947928958e-06, + "loss": 0.002, + "step": 24170 + }, + { + "epoch": 9.829605530703539, + "grad_norm": 0.0972843929197361, + "learning_rate": 2.9607052747409537e-06, + "loss": 0.0009, + "step": 24171 + }, + { + "epoch": 9.830012200081335, + "grad_norm": 11.397191342324435, + "learning_rate": 2.9599856738148525e-06, + "loss": 0.1251, + "step": 24172 + }, + { + "epoch": 9.83041886945913, + "grad_norm": 0.009683288927906776, + "learning_rate": 2.9592661451580408e-06, + "loss": 0.0001, + "step": 24173 + }, + { + "epoch": 9.830825538836926, + "grad_norm": 0.04258186273209646, + "learning_rate": 2.9585466887779037e-06, + "loss": 0.0003, + "step": 24174 + }, + { + "epoch": 9.831232208214722, + "grad_norm": 0.008433973768056284, + "learning_rate": 2.957827304681831e-06, + "loss": 0.0001, + "step": 24175 + }, + { + "epoch": 9.831638877592518, + "grad_norm": 0.0003602521357121756, + "learning_rate": 2.957107992877206e-06, + "loss": 0.0, + "step": 24176 + }, + { + "epoch": 9.832045546970313, + "grad_norm": 0.1002333200552726, + "learning_rate": 2.956388753371412e-06, + "loss": 0.001, + "step": 24177 + }, + { + "epoch": 9.83245221634811, + "grad_norm": 0.07232313559640555, + "learning_rate": 2.955669586171832e-06, + "loss": 0.001, + "step": 24178 + }, + { + "epoch": 9.832858885725905, + "grad_norm": 
0.025017290480363438, + "learning_rate": 2.9549504912858484e-06, + "loss": 0.0002, + "step": 24179 + }, + { + "epoch": 9.8332655551037, + "grad_norm": 3.7466912747716843, + "learning_rate": 2.954231468720842e-06, + "loss": 0.0393, + "step": 24180 + }, + { + "epoch": 9.833672224481496, + "grad_norm": 4.728239815998334, + "learning_rate": 2.9535125184841984e-06, + "loss": 0.1151, + "step": 24181 + }, + { + "epoch": 9.834078893859292, + "grad_norm": 0.001777821842370294, + "learning_rate": 2.9527936405832946e-06, + "loss": 0.0, + "step": 24182 + }, + { + "epoch": 9.834485563237088, + "grad_norm": 0.11394903323456068, + "learning_rate": 2.9520748350255113e-06, + "loss": 0.0013, + "step": 24183 + }, + { + "epoch": 9.834892232614884, + "grad_norm": 0.14323545159903267, + "learning_rate": 2.9513561018182268e-06, + "loss": 0.002, + "step": 24184 + }, + { + "epoch": 9.83529890199268, + "grad_norm": 2.3603395038355104, + "learning_rate": 2.950637440968819e-06, + "loss": 0.0431, + "step": 24185 + }, + { + "epoch": 9.835705571370475, + "grad_norm": 0.02236241441810926, + "learning_rate": 2.949918852484667e-06, + "loss": 0.0003, + "step": 24186 + }, + { + "epoch": 9.836112240748271, + "grad_norm": 0.3328770392710256, + "learning_rate": 2.9492003363731413e-06, + "loss": 0.0035, + "step": 24187 + }, + { + "epoch": 9.836518910126067, + "grad_norm": 0.0053794147092985295, + "learning_rate": 2.948481892641627e-06, + "loss": 0.0, + "step": 24188 + }, + { + "epoch": 9.836925579503863, + "grad_norm": 0.024170988122344347, + "learning_rate": 2.9477635212974954e-06, + "loss": 0.0003, + "step": 24189 + }, + { + "epoch": 9.837332248881658, + "grad_norm": 4.321157977242703e-06, + "learning_rate": 2.9470452223481206e-06, + "loss": 0.0, + "step": 24190 + }, + { + "epoch": 9.837738918259454, + "grad_norm": 0.06401502432089144, + "learning_rate": 2.9463269958008754e-06, + "loss": 0.0006, + "step": 24191 + }, + { + "epoch": 9.838145587637252, + "grad_norm": 0.12244275950378822, + "learning_rate": 2.9456088416631336e-06, + "loss": 0.0012, + "step": 24192 + }, + { + "epoch": 9.838552257015047, + "grad_norm": 0.3100704914888137, + "learning_rate": 2.944890759942265e-06, + "loss": 0.0041, + "step": 24193 + }, + { + "epoch": 9.838958926392843, + "grad_norm": 0.11986067190872941, + "learning_rate": 2.9441727506456464e-06, + "loss": 0.0015, + "step": 24194 + }, + { + "epoch": 9.839365595770639, + "grad_norm": 0.024714356872184907, + "learning_rate": 2.943454813780645e-06, + "loss": 0.0003, + "step": 24195 + }, + { + "epoch": 9.839772265148435, + "grad_norm": 0.0019837544765104733, + "learning_rate": 2.9427369493546323e-06, + "loss": 0.0, + "step": 24196 + }, + { + "epoch": 9.84017893452623, + "grad_norm": 0.08639052718347857, + "learning_rate": 2.9420191573749767e-06, + "loss": 0.0012, + "step": 24197 + }, + { + "epoch": 9.840585603904026, + "grad_norm": 0.33376773727533, + "learning_rate": 2.9413014378490456e-06, + "loss": 0.0024, + "step": 24198 + }, + { + "epoch": 9.840992273281822, + "grad_norm": 0.009521392207293648, + "learning_rate": 2.9405837907842082e-06, + "loss": 0.0001, + "step": 24199 + }, + { + "epoch": 9.841398942659618, + "grad_norm": 0.004413957740506139, + "learning_rate": 2.939866216187829e-06, + "loss": 0.0, + "step": 24200 + }, + { + "epoch": 9.841805612037414, + "grad_norm": 0.021727564470400935, + "learning_rate": 2.9391487140672792e-06, + "loss": 0.0002, + "step": 24201 + }, + { + "epoch": 9.84221228141521, + "grad_norm": 1.1958761840905967, + "learning_rate": 2.93843128442992e-06, + "loss": 0.015, + 
"step": 24202 + }, + { + "epoch": 9.842618950793005, + "grad_norm": 0.08054291048715764, + "learning_rate": 2.937713927283119e-06, + "loss": 0.0008, + "step": 24203 + }, + { + "epoch": 9.8430256201708, + "grad_norm": 1.8007193731095195, + "learning_rate": 2.936996642634239e-06, + "loss": 0.0102, + "step": 24204 + }, + { + "epoch": 9.843432289548597, + "grad_norm": 3.0728953217356754, + "learning_rate": 2.936279430490643e-06, + "loss": 0.2304, + "step": 24205 + }, + { + "epoch": 9.843838958926392, + "grad_norm": 0.22567913591705, + "learning_rate": 2.9355622908596903e-06, + "loss": 0.0016, + "step": 24206 + }, + { + "epoch": 9.844245628304188, + "grad_norm": 0.000893384995814187, + "learning_rate": 2.93484522374875e-06, + "loss": 0.0, + "step": 24207 + }, + { + "epoch": 9.844652297681984, + "grad_norm": 0.7398468993843986, + "learning_rate": 2.9341282291651784e-06, + "loss": 0.0067, + "step": 24208 + }, + { + "epoch": 9.84505896705978, + "grad_norm": 0.11047782174998658, + "learning_rate": 2.933411307116337e-06, + "loss": 0.0011, + "step": 24209 + }, + { + "epoch": 9.845465636437575, + "grad_norm": 0.4612447739672417, + "learning_rate": 2.932694457609585e-06, + "loss": 0.0031, + "step": 24210 + }, + { + "epoch": 9.845872305815373, + "grad_norm": 0.08109694004589849, + "learning_rate": 2.9319776806522814e-06, + "loss": 0.0006, + "step": 24211 + }, + { + "epoch": 9.846278975193169, + "grad_norm": 0.19659748922419265, + "learning_rate": 2.9312609762517853e-06, + "loss": 0.0022, + "step": 24212 + }, + { + "epoch": 9.846685644570965, + "grad_norm": 5.171488002364794, + "learning_rate": 2.930544344415448e-06, + "loss": 0.1077, + "step": 24213 + }, + { + "epoch": 9.84709231394876, + "grad_norm": 0.10837499669329215, + "learning_rate": 2.9298277851506353e-06, + "loss": 0.001, + "step": 24214 + }, + { + "epoch": 9.847498983326556, + "grad_norm": 2.1925370607732195, + "learning_rate": 2.9291112984646973e-06, + "loss": 0.0201, + "step": 24215 + }, + { + "epoch": 9.847905652704352, + "grad_norm": 17.997344565388666, + "learning_rate": 2.9283948843649923e-06, + "loss": 0.2689, + "step": 24216 + }, + { + "epoch": 9.848312322082148, + "grad_norm": 0.361319727574533, + "learning_rate": 2.9276785428588717e-06, + "loss": 0.0046, + "step": 24217 + }, + { + "epoch": 9.848718991459943, + "grad_norm": 0.000669727442514398, + "learning_rate": 2.926962273953691e-06, + "loss": 0.0, + "step": 24218 + }, + { + "epoch": 9.849125660837739, + "grad_norm": 0.9176010276073548, + "learning_rate": 2.926246077656798e-06, + "loss": 0.0086, + "step": 24219 + }, + { + "epoch": 9.849532330215535, + "grad_norm": 0.04464853465043183, + "learning_rate": 2.925529953975553e-06, + "loss": 0.0004, + "step": 24220 + }, + { + "epoch": 9.84993899959333, + "grad_norm": 0.0035065418551924795, + "learning_rate": 2.924813902917304e-06, + "loss": 0.0, + "step": 24221 + }, + { + "epoch": 9.850345668971126, + "grad_norm": 0.132878188776874, + "learning_rate": 2.9240979244894006e-06, + "loss": 0.001, + "step": 24222 + }, + { + "epoch": 9.850752338348922, + "grad_norm": 0.02059483822223844, + "learning_rate": 2.9233820186991935e-06, + "loss": 0.0002, + "step": 24223 + }, + { + "epoch": 9.851159007726718, + "grad_norm": 0.02338320115732304, + "learning_rate": 2.9226661855540316e-06, + "loss": 0.0003, + "step": 24224 + }, + { + "epoch": 9.851565677104514, + "grad_norm": 5.166981075406856, + "learning_rate": 2.921950425061264e-06, + "loss": 0.0572, + "step": 24225 + }, + { + "epoch": 9.85197234648231, + "grad_norm": 0.008406286170007745, + 
"learning_rate": 2.9212347372282337e-06, + "loss": 0.0001, + "step": 24226 + }, + { + "epoch": 9.852379015860105, + "grad_norm": 0.08962813970371424, + "learning_rate": 2.920519122062294e-06, + "loss": 0.0007, + "step": 24227 + }, + { + "epoch": 9.852785685237901, + "grad_norm": 0.02118601714983065, + "learning_rate": 2.919803579570789e-06, + "loss": 0.0003, + "step": 24228 + }, + { + "epoch": 9.853192354615697, + "grad_norm": 0.681324044928124, + "learning_rate": 2.9190881097610634e-06, + "loss": 0.0051, + "step": 24229 + }, + { + "epoch": 9.853599023993493, + "grad_norm": 1.922365486242102, + "learning_rate": 2.9183727126404628e-06, + "loss": 0.0216, + "step": 24230 + }, + { + "epoch": 9.854005693371288, + "grad_norm": 2.5956792529945365, + "learning_rate": 2.9176573882163306e-06, + "loss": 0.0352, + "step": 24231 + }, + { + "epoch": 9.854412362749084, + "grad_norm": 0.033575909990053596, + "learning_rate": 2.916942136496006e-06, + "loss": 0.0003, + "step": 24232 + }, + { + "epoch": 9.854819032126882, + "grad_norm": 0.04630679845899295, + "learning_rate": 2.9162269574868385e-06, + "loss": 0.0006, + "step": 24233 + }, + { + "epoch": 9.855225701504677, + "grad_norm": 0.011760033616691031, + "learning_rate": 2.9155118511961654e-06, + "loss": 0.0001, + "step": 24234 + }, + { + "epoch": 9.855632370882473, + "grad_norm": 3.0866499826779483, + "learning_rate": 2.9147968176313303e-06, + "loss": 0.0282, + "step": 24235 + }, + { + "epoch": 9.856039040260269, + "grad_norm": 0.0038702958210972845, + "learning_rate": 2.914081856799671e-06, + "loss": 0.0, + "step": 24236 + }, + { + "epoch": 9.856445709638065, + "grad_norm": 0.0016919169047305005, + "learning_rate": 2.9133669687085273e-06, + "loss": 0.0, + "step": 24237 + }, + { + "epoch": 9.85685237901586, + "grad_norm": 0.03140171262979832, + "learning_rate": 2.91265215336524e-06, + "loss": 0.0003, + "step": 24238 + }, + { + "epoch": 9.857259048393656, + "grad_norm": 0.0032771083505609413, + "learning_rate": 2.9119374107771404e-06, + "loss": 0.0, + "step": 24239 + }, + { + "epoch": 9.857665717771452, + "grad_norm": 0.2026837811384001, + "learning_rate": 2.9112227409515747e-06, + "loss": 0.0019, + "step": 24240 + }, + { + "epoch": 9.858072387149248, + "grad_norm": 0.3717165288595754, + "learning_rate": 2.9105081438958783e-06, + "loss": 0.0036, + "step": 24241 + }, + { + "epoch": 9.858479056527043, + "grad_norm": 0.09218654664553307, + "learning_rate": 2.90979361961738e-06, + "loss": 0.0009, + "step": 24242 + }, + { + "epoch": 9.85888572590484, + "grad_norm": 0.9392989608606167, + "learning_rate": 2.90907916812342e-06, + "loss": 0.0068, + "step": 24243 + }, + { + "epoch": 9.859292395282635, + "grad_norm": 0.07117933944892683, + "learning_rate": 2.908364789421326e-06, + "loss": 0.0009, + "step": 24244 + }, + { + "epoch": 9.85969906466043, + "grad_norm": 0.10184320338505415, + "learning_rate": 2.9076504835184415e-06, + "loss": 0.001, + "step": 24245 + }, + { + "epoch": 9.860105734038227, + "grad_norm": 0.01804347701905545, + "learning_rate": 2.906936250422093e-06, + "loss": 0.0002, + "step": 24246 + }, + { + "epoch": 9.860512403416022, + "grad_norm": 0.025960435292401902, + "learning_rate": 2.9062220901396153e-06, + "loss": 0.0001, + "step": 24247 + }, + { + "epoch": 9.860919072793818, + "grad_norm": 0.04236772950080468, + "learning_rate": 2.9055080026783366e-06, + "loss": 0.0005, + "step": 24248 + }, + { + "epoch": 9.861325742171614, + "grad_norm": 0.10125238693679092, + "learning_rate": 2.90479398804559e-06, + "loss": 0.001, + "step": 24249 + }, + { + 
"epoch": 9.86173241154941, + "grad_norm": 0.046062514247586235, + "learning_rate": 2.904080046248703e-06, + "loss": 0.0006, + "step": 24250 + }, + { + "epoch": 9.862139080927205, + "grad_norm": 2.9944089607461817, + "learning_rate": 2.903366177295004e-06, + "loss": 0.0283, + "step": 24251 + }, + { + "epoch": 9.862545750305003, + "grad_norm": 0.36615027092064445, + "learning_rate": 2.902652381191824e-06, + "loss": 0.0031, + "step": 24252 + }, + { + "epoch": 9.862952419682799, + "grad_norm": 0.03100993545465239, + "learning_rate": 2.9019386579464903e-06, + "loss": 0.0004, + "step": 24253 + }, + { + "epoch": 9.863359089060594, + "grad_norm": 0.09183444489159158, + "learning_rate": 2.9012250075663297e-06, + "loss": 0.0012, + "step": 24254 + }, + { + "epoch": 9.86376575843839, + "grad_norm": 3.8791534034399913, + "learning_rate": 2.9005114300586657e-06, + "loss": 0.0523, + "step": 24255 + }, + { + "epoch": 9.864172427816186, + "grad_norm": 0.012712793896748138, + "learning_rate": 2.8997979254308253e-06, + "loss": 0.0001, + "step": 24256 + }, + { + "epoch": 9.864579097193982, + "grad_norm": 0.010214246242555881, + "learning_rate": 2.8990844936901297e-06, + "loss": 0.0001, + "step": 24257 + }, + { + "epoch": 9.864985766571778, + "grad_norm": 1.0955703847466502, + "learning_rate": 2.898371134843908e-06, + "loss": 0.0118, + "step": 24258 + }, + { + "epoch": 9.865392435949573, + "grad_norm": 0.008304630822805739, + "learning_rate": 2.8976578488994823e-06, + "loss": 0.0001, + "step": 24259 + }, + { + "epoch": 9.865799105327369, + "grad_norm": 0.005589922571992931, + "learning_rate": 2.896944635864172e-06, + "loss": 0.0, + "step": 24260 + }, + { + "epoch": 9.866205774705165, + "grad_norm": 0.8099619637915845, + "learning_rate": 2.8962314957452995e-06, + "loss": 0.0107, + "step": 24261 + }, + { + "epoch": 9.86661244408296, + "grad_norm": 0.002731930321190972, + "learning_rate": 2.895518428550187e-06, + "loss": 0.0, + "step": 24262 + }, + { + "epoch": 9.867019113460756, + "grad_norm": 0.004699040054386188, + "learning_rate": 2.894805434286153e-06, + "loss": 0.0, + "step": 24263 + }, + { + "epoch": 9.867425782838552, + "grad_norm": 0.050237110176906734, + "learning_rate": 2.8940925129605145e-06, + "loss": 0.0004, + "step": 24264 + }, + { + "epoch": 9.867832452216348, + "grad_norm": 0.05224978064003361, + "learning_rate": 2.893379664580596e-06, + "loss": 0.0008, + "step": 24265 + }, + { + "epoch": 9.868239121594144, + "grad_norm": 0.046485417090002056, + "learning_rate": 2.8926668891537114e-06, + "loss": 0.0007, + "step": 24266 + }, + { + "epoch": 9.86864579097194, + "grad_norm": 0.005744190151243677, + "learning_rate": 2.891954186687178e-06, + "loss": 0.0001, + "step": 24267 + }, + { + "epoch": 9.869052460349735, + "grad_norm": 0.06710200836050138, + "learning_rate": 2.8912415571883135e-06, + "loss": 0.0006, + "step": 24268 + }, + { + "epoch": 9.869459129727531, + "grad_norm": 0.2932485279192142, + "learning_rate": 2.8905290006644303e-06, + "loss": 0.0037, + "step": 24269 + }, + { + "epoch": 9.869865799105327, + "grad_norm": 0.00810813424772137, + "learning_rate": 2.889816517122843e-06, + "loss": 0.0001, + "step": 24270 + }, + { + "epoch": 9.870272468483122, + "grad_norm": 0.002703595786810467, + "learning_rate": 2.8891041065708713e-06, + "loss": 0.0, + "step": 24271 + }, + { + "epoch": 9.870679137860918, + "grad_norm": 0.25879333033309554, + "learning_rate": 2.8883917690158237e-06, + "loss": 0.0029, + "step": 24272 + }, + { + "epoch": 9.871085807238714, + "grad_norm": 0.0007657502237716926, + 
"learning_rate": 2.887679504465014e-06, + "loss": 0.0, + "step": 24273 + }, + { + "epoch": 9.871492476616512, + "grad_norm": 0.13086732567474357, + "learning_rate": 2.8869673129257546e-06, + "loss": 0.0016, + "step": 24274 + }, + { + "epoch": 9.871899145994307, + "grad_norm": 0.06455943964519337, + "learning_rate": 2.8862551944053553e-06, + "loss": 0.0007, + "step": 24275 + }, + { + "epoch": 9.872305815372103, + "grad_norm": 0.005386131950101222, + "learning_rate": 2.885543148911126e-06, + "loss": 0.0001, + "step": 24276 + }, + { + "epoch": 9.872712484749899, + "grad_norm": 0.00034387256557446574, + "learning_rate": 2.8848311764503745e-06, + "loss": 0.0, + "step": 24277 + }, + { + "epoch": 9.873119154127695, + "grad_norm": 0.002042883200324094, + "learning_rate": 2.884119277030414e-06, + "loss": 0.0, + "step": 24278 + }, + { + "epoch": 9.87352582350549, + "grad_norm": 0.026477929571557212, + "learning_rate": 2.883407450658552e-06, + "loss": 0.0002, + "step": 24279 + }, + { + "epoch": 9.873932492883286, + "grad_norm": 0.030226763409142886, + "learning_rate": 2.882695697342093e-06, + "loss": 0.0002, + "step": 24280 + }, + { + "epoch": 9.874339162261082, + "grad_norm": 4.127469108179, + "learning_rate": 2.881984017088345e-06, + "loss": 0.0461, + "step": 24281 + }, + { + "epoch": 9.874745831638878, + "grad_norm": 0.03935048046959676, + "learning_rate": 2.8812724099046143e-06, + "loss": 0.0005, + "step": 24282 + }, + { + "epoch": 9.875152501016673, + "grad_norm": 4.359781637715956, + "learning_rate": 2.8805608757982007e-06, + "loss": 0.1092, + "step": 24283 + }, + { + "epoch": 9.87555917039447, + "grad_norm": 6.071090577341571, + "learning_rate": 2.8798494147764165e-06, + "loss": 0.0359, + "step": 24284 + }, + { + "epoch": 9.875965839772265, + "grad_norm": 8.116616138794257, + "learning_rate": 2.8791380268465618e-06, + "loss": 0.2386, + "step": 24285 + }, + { + "epoch": 9.87637250915006, + "grad_norm": 0.09801800117992103, + "learning_rate": 2.8784267120159392e-06, + "loss": 0.0012, + "step": 24286 + }, + { + "epoch": 9.876779178527856, + "grad_norm": 0.07971070892483086, + "learning_rate": 2.8777154702918507e-06, + "loss": 0.0006, + "step": 24287 + }, + { + "epoch": 9.877185847905652, + "grad_norm": 0.33411874751082365, + "learning_rate": 2.8770043016815972e-06, + "loss": 0.0042, + "step": 24288 + }, + { + "epoch": 9.877592517283448, + "grad_norm": 0.014913883480003729, + "learning_rate": 2.8762932061924798e-06, + "loss": 0.0002, + "step": 24289 + }, + { + "epoch": 9.877999186661244, + "grad_norm": 0.001050774642952041, + "learning_rate": 2.8755821838317943e-06, + "loss": 0.0, + "step": 24290 + }, + { + "epoch": 9.87840585603904, + "grad_norm": 0.6018734995397069, + "learning_rate": 2.8748712346068464e-06, + "loss": 0.0052, + "step": 24291 + }, + { + "epoch": 9.878812525416835, + "grad_norm": 0.04808926511354944, + "learning_rate": 2.8741603585249312e-06, + "loss": 0.0004, + "step": 24292 + }, + { + "epoch": 9.879219194794633, + "grad_norm": 0.003620018624804702, + "learning_rate": 2.8734495555933463e-06, + "loss": 0.0001, + "step": 24293 + }, + { + "epoch": 9.879625864172429, + "grad_norm": 8.674921758364855, + "learning_rate": 2.872738825819389e-06, + "loss": 0.0817, + "step": 24294 + }, + { + "epoch": 9.880032533550224, + "grad_norm": 0.0013918740265176136, + "learning_rate": 2.8720281692103545e-06, + "loss": 0.0, + "step": 24295 + }, + { + "epoch": 9.88043920292802, + "grad_norm": 0.0016078052081398221, + "learning_rate": 2.8713175857735356e-06, + "loss": 0.0, + "step": 24296 + }, + { + 
"epoch": 9.880845872305816, + "grad_norm": 15.380852347236118, + "learning_rate": 2.870607075516232e-06, + "loss": 0.3024, + "step": 24297 + }, + { + "epoch": 9.881252541683612, + "grad_norm": 0.3562597983296673, + "learning_rate": 2.869896638445735e-06, + "loss": 0.0021, + "step": 24298 + }, + { + "epoch": 9.881659211061407, + "grad_norm": 0.32039228572460615, + "learning_rate": 2.8691862745693365e-06, + "loss": 0.0027, + "step": 24299 + }, + { + "epoch": 9.882065880439203, + "grad_norm": 0.01030452319674411, + "learning_rate": 2.8684759838943312e-06, + "loss": 0.0001, + "step": 24300 + }, + { + "epoch": 9.882472549816999, + "grad_norm": 0.07436316356442642, + "learning_rate": 2.867765766428009e-06, + "loss": 0.0013, + "step": 24301 + }, + { + "epoch": 9.882879219194795, + "grad_norm": 0.09074955680427434, + "learning_rate": 2.8670556221776603e-06, + "loss": 0.0007, + "step": 24302 + }, + { + "epoch": 9.88328588857259, + "grad_norm": 4.020227491508217, + "learning_rate": 2.866345551150572e-06, + "loss": 0.0329, + "step": 24303 + }, + { + "epoch": 9.883692557950386, + "grad_norm": 0.0287948268625546, + "learning_rate": 2.8656355533540414e-06, + "loss": 0.0002, + "step": 24304 + }, + { + "epoch": 9.884099227328182, + "grad_norm": 0.10659701926037583, + "learning_rate": 2.864925628795352e-06, + "loss": 0.0008, + "step": 24305 + }, + { + "epoch": 9.884505896705978, + "grad_norm": 0.018617303650505714, + "learning_rate": 2.8642157774817915e-06, + "loss": 0.0002, + "step": 24306 + }, + { + "epoch": 9.884912566083774, + "grad_norm": 0.007283137427707712, + "learning_rate": 2.863505999420648e-06, + "loss": 0.0001, + "step": 24307 + }, + { + "epoch": 9.88531923546157, + "grad_norm": 0.11894302134319601, + "learning_rate": 2.8627962946192067e-06, + "loss": 0.0016, + "step": 24308 + }, + { + "epoch": 9.885725904839365, + "grad_norm": 0.3356907982620163, + "learning_rate": 2.862086663084751e-06, + "loss": 0.0033, + "step": 24309 + }, + { + "epoch": 9.88613257421716, + "grad_norm": 10.275199496810696, + "learning_rate": 2.8613771048245707e-06, + "loss": 0.22, + "step": 24310 + }, + { + "epoch": 9.886539243594957, + "grad_norm": 0.031157463577912634, + "learning_rate": 2.860667619845947e-06, + "loss": 0.0003, + "step": 24311 + }, + { + "epoch": 9.886945912972752, + "grad_norm": 0.0020831728903882257, + "learning_rate": 2.859958208156164e-06, + "loss": 0.0, + "step": 24312 + }, + { + "epoch": 9.887352582350548, + "grad_norm": 18.91087549942167, + "learning_rate": 2.859248869762503e-06, + "loss": 0.0807, + "step": 24313 + }, + { + "epoch": 9.887759251728344, + "grad_norm": 2.167519956178779, + "learning_rate": 2.858539604672246e-06, + "loss": 0.0151, + "step": 24314 + }, + { + "epoch": 9.888165921106141, + "grad_norm": 0.023540080803222032, + "learning_rate": 2.8578304128926747e-06, + "loss": 0.0002, + "step": 24315 + }, + { + "epoch": 9.888572590483937, + "grad_norm": 0.21527116857874098, + "learning_rate": 2.8571212944310657e-06, + "loss": 0.0015, + "step": 24316 + }, + { + "epoch": 9.888979259861733, + "grad_norm": 0.08314761235048189, + "learning_rate": 2.8564122492947045e-06, + "loss": 0.0011, + "step": 24317 + }, + { + "epoch": 9.889385929239529, + "grad_norm": 0.15577967656567865, + "learning_rate": 2.8557032774908666e-06, + "loss": 0.0021, + "step": 24318 + }, + { + "epoch": 9.889792598617325, + "grad_norm": 1.315329778533248, + "learning_rate": 2.854994379026831e-06, + "loss": 0.0119, + "step": 24319 + }, + { + "epoch": 9.89019926799512, + "grad_norm": 10.568770710644545, + "learning_rate": 
2.8542855539098734e-06, + "loss": 0.2586, + "step": 24320 + }, + { + "epoch": 9.890605937372916, + "grad_norm": 2.3826280170691447, + "learning_rate": 2.853576802147271e-06, + "loss": 0.0174, + "step": 24321 + }, + { + "epoch": 9.891012606750712, + "grad_norm": 0.0010564458926678724, + "learning_rate": 2.852868123746296e-06, + "loss": 0.0, + "step": 24322 + }, + { + "epoch": 9.891419276128508, + "grad_norm": 3.9130716618714714, + "learning_rate": 2.8521595187142303e-06, + "loss": 0.0324, + "step": 24323 + }, + { + "epoch": 9.891825945506303, + "grad_norm": 1.798896291618538, + "learning_rate": 2.8514509870583452e-06, + "loss": 0.0137, + "step": 24324 + }, + { + "epoch": 9.892232614884099, + "grad_norm": 0.09026062617398943, + "learning_rate": 2.8507425287859125e-06, + "loss": 0.0013, + "step": 24325 + }, + { + "epoch": 9.892639284261895, + "grad_norm": 1.0904957752955802, + "learning_rate": 2.850034143904207e-06, + "loss": 0.0122, + "step": 24326 + }, + { + "epoch": 9.89304595363969, + "grad_norm": 0.32868494062411213, + "learning_rate": 2.8493258324204986e-06, + "loss": 0.0053, + "step": 24327 + }, + { + "epoch": 9.893452623017486, + "grad_norm": 0.008492601737149802, + "learning_rate": 2.8486175943420592e-06, + "loss": 0.0001, + "step": 24328 + }, + { + "epoch": 9.893859292395282, + "grad_norm": 0.006524050300779222, + "learning_rate": 2.847909429676157e-06, + "loss": 0.0001, + "step": 24329 + }, + { + "epoch": 9.894265961773078, + "grad_norm": 1.5351272154384683, + "learning_rate": 2.8472013384300666e-06, + "loss": 0.0134, + "step": 24330 + }, + { + "epoch": 9.894672631150874, + "grad_norm": 1.7696408916499273, + "learning_rate": 2.8464933206110547e-06, + "loss": 0.0148, + "step": 24331 + }, + { + "epoch": 9.89507930052867, + "grad_norm": 0.000865810540490455, + "learning_rate": 2.845785376226389e-06, + "loss": 0.0, + "step": 24332 + }, + { + "epoch": 9.895485969906465, + "grad_norm": 0.010791583331682264, + "learning_rate": 2.8450775052833377e-06, + "loss": 0.0001, + "step": 24333 + }, + { + "epoch": 9.895892639284263, + "grad_norm": 0.1661527892294171, + "learning_rate": 2.8443697077891664e-06, + "loss": 0.0011, + "step": 24334 + }, + { + "epoch": 9.896299308662059, + "grad_norm": 0.018186002846448845, + "learning_rate": 2.843661983751138e-06, + "loss": 0.0002, + "step": 24335 + }, + { + "epoch": 9.896705978039854, + "grad_norm": 0.0702232518176972, + "learning_rate": 2.842954333176524e-06, + "loss": 0.0006, + "step": 24336 + }, + { + "epoch": 9.89711264741765, + "grad_norm": 0.00846970561088337, + "learning_rate": 2.8422467560725863e-06, + "loss": 0.0001, + "step": 24337 + }, + { + "epoch": 9.897519316795446, + "grad_norm": 0.004677478737768812, + "learning_rate": 2.841539252446588e-06, + "loss": 0.0, + "step": 24338 + }, + { + "epoch": 9.897925986173242, + "grad_norm": 0.0007265384220530643, + "learning_rate": 2.8408318223057916e-06, + "loss": 0.0, + "step": 24339 + }, + { + "epoch": 9.898332655551037, + "grad_norm": 0.4263567339792862, + "learning_rate": 2.8401244656574634e-06, + "loss": 0.004, + "step": 24340 + }, + { + "epoch": 9.898739324928833, + "grad_norm": 0.03178963633821435, + "learning_rate": 2.8394171825088548e-06, + "loss": 0.0005, + "step": 24341 + }, + { + "epoch": 9.899145994306629, + "grad_norm": 0.04786132270008275, + "learning_rate": 2.8387099728672353e-06, + "loss": 0.0002, + "step": 24342 + }, + { + "epoch": 9.899552663684425, + "grad_norm": 0.06006522595936675, + "learning_rate": 2.838002836739863e-06, + "loss": 0.0006, + "step": 24343 + }, + { + "epoch": 
9.89995933306222, + "grad_norm": 0.159910061311028, + "learning_rate": 2.8372957741339956e-06, + "loss": 0.0019, + "step": 24344 + }, + { + "epoch": 9.900366002440016, + "grad_norm": 0.0008285753679961265, + "learning_rate": 2.8365887850568918e-06, + "loss": 0.0, + "step": 24345 + }, + { + "epoch": 9.900772671817812, + "grad_norm": 0.0015679947609143483, + "learning_rate": 2.8358818695158096e-06, + "loss": 0.0, + "step": 24346 + }, + { + "epoch": 9.901179341195608, + "grad_norm": 0.07157526654810739, + "learning_rate": 2.8351750275180023e-06, + "loss": 0.001, + "step": 24347 + }, + { + "epoch": 9.901586010573403, + "grad_norm": 0.3022286651358371, + "learning_rate": 2.834468259070733e-06, + "loss": 0.0043, + "step": 24348 + }, + { + "epoch": 9.9019926799512, + "grad_norm": 0.053456506788213815, + "learning_rate": 2.833761564181252e-06, + "loss": 0.0006, + "step": 24349 + }, + { + "epoch": 9.902399349328995, + "grad_norm": 0.06672180705678854, + "learning_rate": 2.8330549428568165e-06, + "loss": 0.0011, + "step": 24350 + }, + { + "epoch": 9.90280601870679, + "grad_norm": 0.05477753851680188, + "learning_rate": 2.832348395104679e-06, + "loss": 0.0008, + "step": 24351 + }, + { + "epoch": 9.903212688084587, + "grad_norm": 0.020347660817281735, + "learning_rate": 2.831641920932092e-06, + "loss": 0.0002, + "step": 24352 + }, + { + "epoch": 9.903619357462382, + "grad_norm": 10.138722102151592, + "learning_rate": 2.8309355203463094e-06, + "loss": 0.3911, + "step": 24353 + }, + { + "epoch": 9.904026026840178, + "grad_norm": 0.01748046594642291, + "learning_rate": 2.8302291933545777e-06, + "loss": 0.0002, + "step": 24354 + }, + { + "epoch": 9.904432696217976, + "grad_norm": 0.25334355108575757, + "learning_rate": 2.8295229399641556e-06, + "loss": 0.0029, + "step": 24355 + }, + { + "epoch": 9.904839365595771, + "grad_norm": 0.009872687098929802, + "learning_rate": 2.8288167601822893e-06, + "loss": 0.0001, + "step": 24356 + }, + { + "epoch": 9.905246034973567, + "grad_norm": 0.004990982585742902, + "learning_rate": 2.8281106540162274e-06, + "loss": 0.0001, + "step": 24357 + }, + { + "epoch": 9.905652704351363, + "grad_norm": 0.00396484685215219, + "learning_rate": 2.82740462147322e-06, + "loss": 0.0, + "step": 24358 + }, + { + "epoch": 9.906059373729159, + "grad_norm": 0.12327853552535283, + "learning_rate": 2.8266986625605143e-06, + "loss": 0.0014, + "step": 24359 + }, + { + "epoch": 9.906466043106954, + "grad_norm": 0.0708730400448933, + "learning_rate": 2.8259927772853524e-06, + "loss": 0.0008, + "step": 24360 + }, + { + "epoch": 9.90687271248475, + "grad_norm": 0.3428713107407171, + "learning_rate": 2.8252869656549897e-06, + "loss": 0.0022, + "step": 24361 + }, + { + "epoch": 9.907279381862546, + "grad_norm": 0.008362373332730027, + "learning_rate": 2.824581227676667e-06, + "loss": 0.0001, + "step": 24362 + }, + { + "epoch": 9.907686051240342, + "grad_norm": 0.15984038161086345, + "learning_rate": 2.8238755633576298e-06, + "loss": 0.0019, + "step": 24363 + }, + { + "epoch": 9.908092720618138, + "grad_norm": 0.04967343224278127, + "learning_rate": 2.823169972705121e-06, + "loss": 0.0007, + "step": 24364 + }, + { + "epoch": 9.908499389995933, + "grad_norm": 0.06620175276315546, + "learning_rate": 2.8224644557263837e-06, + "loss": 0.0007, + "step": 24365 + }, + { + "epoch": 9.908906059373729, + "grad_norm": 7.801642586567084, + "learning_rate": 2.821759012428662e-06, + "loss": 0.1428, + "step": 24366 + }, + { + "epoch": 9.909312728751525, + "grad_norm": 0.03862553407137581, + "learning_rate": 
2.8210536428191937e-06, + "loss": 0.0005, + "step": 24367 + }, + { + "epoch": 9.90971939812932, + "grad_norm": 0.010708025064383191, + "learning_rate": 2.8203483469052264e-06, + "loss": 0.0001, + "step": 24368 + }, + { + "epoch": 9.910126067507116, + "grad_norm": 0.04241536741033295, + "learning_rate": 2.819643124693996e-06, + "loss": 0.0006, + "step": 24369 + }, + { + "epoch": 9.910532736884912, + "grad_norm": 0.010056518408147834, + "learning_rate": 2.8189379761927427e-06, + "loss": 0.0001, + "step": 24370 + }, + { + "epoch": 9.910939406262708, + "grad_norm": 0.0005677469234182815, + "learning_rate": 2.818232901408705e-06, + "loss": 0.0, + "step": 24371 + }, + { + "epoch": 9.911346075640504, + "grad_norm": 0.004785818261014941, + "learning_rate": 2.8175279003491217e-06, + "loss": 0.0001, + "step": 24372 + }, + { + "epoch": 9.9117527450183, + "grad_norm": 0.10774652318007813, + "learning_rate": 2.816822973021226e-06, + "loss": 0.0013, + "step": 24373 + }, + { + "epoch": 9.912159414396095, + "grad_norm": 0.004761801991817116, + "learning_rate": 2.816118119432262e-06, + "loss": 0.0001, + "step": 24374 + }, + { + "epoch": 9.912566083773893, + "grad_norm": 0.02013409227522579, + "learning_rate": 2.815413339589459e-06, + "loss": 0.0002, + "step": 24375 + }, + { + "epoch": 9.912972753151688, + "grad_norm": 0.0014283711040953425, + "learning_rate": 2.814708633500055e-06, + "loss": 0.0, + "step": 24376 + }, + { + "epoch": 9.913379422529484, + "grad_norm": 0.42837729185697504, + "learning_rate": 2.8140040011712833e-06, + "loss": 0.0031, + "step": 24377 + }, + { + "epoch": 9.91378609190728, + "grad_norm": 0.038694479104919514, + "learning_rate": 2.8132994426103776e-06, + "loss": 0.0005, + "step": 24378 + }, + { + "epoch": 9.914192761285076, + "grad_norm": 0.7472334420689609, + "learning_rate": 2.8125949578245694e-06, + "loss": 0.0078, + "step": 24379 + }, + { + "epoch": 9.914599430662872, + "grad_norm": 2.4563986969147207, + "learning_rate": 2.811890546821088e-06, + "loss": 0.0327, + "step": 24380 + }, + { + "epoch": 9.915006100040667, + "grad_norm": 0.5723528832600187, + "learning_rate": 2.811186209607172e-06, + "loss": 0.0046, + "step": 24381 + }, + { + "epoch": 9.915412769418463, + "grad_norm": 0.005674807410594481, + "learning_rate": 2.8104819461900458e-06, + "loss": 0.0, + "step": 24382 + }, + { + "epoch": 9.915819438796259, + "grad_norm": 0.41424453792333293, + "learning_rate": 2.8097777565769425e-06, + "loss": 0.0026, + "step": 24383 + }, + { + "epoch": 9.916226108174055, + "grad_norm": 0.042188862301468046, + "learning_rate": 2.8090736407750885e-06, + "loss": 0.0006, + "step": 24384 + }, + { + "epoch": 9.91663277755185, + "grad_norm": 0.11793874718349732, + "learning_rate": 2.8083695987917125e-06, + "loss": 0.0012, + "step": 24385 + }, + { + "epoch": 9.917039446929646, + "grad_norm": 0.0022895269341990177, + "learning_rate": 2.80766563063404e-06, + "loss": 0.0, + "step": 24386 + }, + { + "epoch": 9.917446116307442, + "grad_norm": 0.5520770249076865, + "learning_rate": 2.8069617363093014e-06, + "loss": 0.0066, + "step": 24387 + }, + { + "epoch": 9.917852785685238, + "grad_norm": 5.450507539022727, + "learning_rate": 2.8062579158247217e-06, + "loss": 0.0757, + "step": 24388 + }, + { + "epoch": 9.918259455063033, + "grad_norm": 0.6310898530318656, + "learning_rate": 2.805554169187524e-06, + "loss": 0.0057, + "step": 24389 + }, + { + "epoch": 9.91866612444083, + "grad_norm": 0.06468922695777166, + "learning_rate": 2.804850496404935e-06, + "loss": 0.0008, + "step": 24390 + }, + { + "epoch": 
9.919072793818625, + "grad_norm": 0.019482838479784954, + "learning_rate": 2.804146897484177e-06, + "loss": 0.0003, + "step": 24391 + }, + { + "epoch": 9.91947946319642, + "grad_norm": 1.2853460454365915, + "learning_rate": 2.8034433724324716e-06, + "loss": 0.0122, + "step": 24392 + }, + { + "epoch": 9.919886132574216, + "grad_norm": 0.005930784585946435, + "learning_rate": 2.8027399212570395e-06, + "loss": 0.0001, + "step": 24393 + }, + { + "epoch": 9.920292801952012, + "grad_norm": 0.09813012924036811, + "learning_rate": 2.8020365439651065e-06, + "loss": 0.0007, + "step": 24394 + }, + { + "epoch": 9.920699471329808, + "grad_norm": 0.3547952033676382, + "learning_rate": 2.8013332405638915e-06, + "loss": 0.0027, + "step": 24395 + }, + { + "epoch": 9.921106140707606, + "grad_norm": 0.1654960278161267, + "learning_rate": 2.8006300110606146e-06, + "loss": 0.0019, + "step": 24396 + }, + { + "epoch": 9.921512810085401, + "grad_norm": 0.015452313028825469, + "learning_rate": 2.7999268554624937e-06, + "loss": 0.0002, + "step": 24397 + }, + { + "epoch": 9.921919479463197, + "grad_norm": 0.015476271077570936, + "learning_rate": 2.799223773776747e-06, + "loss": 0.0001, + "step": 24398 + }, + { + "epoch": 9.922326148840993, + "grad_norm": 0.001952633692370783, + "learning_rate": 2.7985207660105897e-06, + "loss": 0.0, + "step": 24399 + }, + { + "epoch": 9.922732818218789, + "grad_norm": 0.031020485657194417, + "learning_rate": 2.7978178321712434e-06, + "loss": 0.0003, + "step": 24400 + }, + { + "epoch": 9.923139487596584, + "grad_norm": 0.8206849882409785, + "learning_rate": 2.7971149722659217e-06, + "loss": 0.0095, + "step": 24401 + }, + { + "epoch": 9.92354615697438, + "grad_norm": 0.021278036275986366, + "learning_rate": 2.796412186301841e-06, + "loss": 0.0002, + "step": 24402 + }, + { + "epoch": 9.923952826352176, + "grad_norm": 0.0061657391153623195, + "learning_rate": 2.7957094742862146e-06, + "loss": 0.0001, + "step": 24403 + }, + { + "epoch": 9.924359495729972, + "grad_norm": 1.5146652192465666, + "learning_rate": 2.795006836226256e-06, + "loss": 0.0082, + "step": 24404 + }, + { + "epoch": 9.924766165107767, + "grad_norm": 5.906228119062835, + "learning_rate": 2.794304272129178e-06, + "loss": 0.0855, + "step": 24405 + }, + { + "epoch": 9.925172834485563, + "grad_norm": 0.03947171501247484, + "learning_rate": 2.793601782002191e-06, + "loss": 0.0005, + "step": 24406 + }, + { + "epoch": 9.925579503863359, + "grad_norm": 0.20196010772194128, + "learning_rate": 2.7928993658525104e-06, + "loss": 0.0019, + "step": 24407 + }, + { + "epoch": 9.925986173241155, + "grad_norm": 0.01993432747549762, + "learning_rate": 2.7921970236873465e-06, + "loss": 0.0002, + "step": 24408 + }, + { + "epoch": 9.92639284261895, + "grad_norm": 0.03253574773867776, + "learning_rate": 2.791494755513907e-06, + "loss": 0.0003, + "step": 24409 + }, + { + "epoch": 9.926799511996746, + "grad_norm": 9.38415913324044, + "learning_rate": 2.7907925613394015e-06, + "loss": 0.2411, + "step": 24410 + }, + { + "epoch": 9.927206181374542, + "grad_norm": 0.04138269241023927, + "learning_rate": 2.7900904411710394e-06, + "loss": 0.0005, + "step": 24411 + }, + { + "epoch": 9.927612850752338, + "grad_norm": 0.02238318854093008, + "learning_rate": 2.789388395016025e-06, + "loss": 0.0003, + "step": 24412 + }, + { + "epoch": 9.928019520130134, + "grad_norm": 0.015865600000680054, + "learning_rate": 2.788686422881569e-06, + "loss": 0.0002, + "step": 24413 + }, + { + "epoch": 9.92842618950793, + "grad_norm": 0.046948664928169036, + "learning_rate": 
2.7879845247748785e-06, + "loss": 0.0005, + "step": 24414 + }, + { + "epoch": 9.928832858885725, + "grad_norm": 1.7002531709527342, + "learning_rate": 2.7872827007031546e-06, + "loss": 0.0129, + "step": 24415 + }, + { + "epoch": 9.929239528263523, + "grad_norm": 0.31547854696471656, + "learning_rate": 2.7865809506736053e-06, + "loss": 0.0028, + "step": 24416 + }, + { + "epoch": 9.929646197641318, + "grad_norm": 0.0030071099674788245, + "learning_rate": 2.7858792746934314e-06, + "loss": 0.0, + "step": 24417 + }, + { + "epoch": 9.930052867019114, + "grad_norm": 0.05941136086431601, + "learning_rate": 2.785177672769839e-06, + "loss": 0.0006, + "step": 24418 + }, + { + "epoch": 9.93045953639691, + "grad_norm": 0.33393104195841844, + "learning_rate": 2.7844761449100256e-06, + "loss": 0.0034, + "step": 24419 + }, + { + "epoch": 9.930866205774706, + "grad_norm": 0.02306546791806017, + "learning_rate": 2.7837746911211983e-06, + "loss": 0.0002, + "step": 24420 + }, + { + "epoch": 9.931272875152501, + "grad_norm": 0.003947782410135188, + "learning_rate": 2.783073311410557e-06, + "loss": 0.0, + "step": 24421 + }, + { + "epoch": 9.931679544530297, + "grad_norm": 0.031791300999747314, + "learning_rate": 2.782372005785299e-06, + "loss": 0.0003, + "step": 24422 + }, + { + "epoch": 9.932086213908093, + "grad_norm": 0.5121729326713951, + "learning_rate": 2.781670774252626e-06, + "loss": 0.0054, + "step": 24423 + }, + { + "epoch": 9.932492883285889, + "grad_norm": 2.2318079218704714, + "learning_rate": 2.7809696168197344e-06, + "loss": 0.0291, + "step": 24424 + }, + { + "epoch": 9.932899552663685, + "grad_norm": 10.283893254117983, + "learning_rate": 2.7802685334938206e-06, + "loss": 0.142, + "step": 24425 + }, + { + "epoch": 9.93330622204148, + "grad_norm": 0.02581829035998964, + "learning_rate": 2.7795675242820865e-06, + "loss": 0.0003, + "step": 24426 + }, + { + "epoch": 9.933712891419276, + "grad_norm": 0.05888335106278195, + "learning_rate": 2.7788665891917268e-06, + "loss": 0.0004, + "step": 24427 + }, + { + "epoch": 9.934119560797072, + "grad_norm": 0.00692235186676215, + "learning_rate": 2.7781657282299344e-06, + "loss": 0.0001, + "step": 24428 + }, + { + "epoch": 9.934526230174868, + "grad_norm": 0.058810503328674366, + "learning_rate": 2.777464941403906e-06, + "loss": 0.0006, + "step": 24429 + }, + { + "epoch": 9.934932899552663, + "grad_norm": 0.7225074991126325, + "learning_rate": 2.7767642287208353e-06, + "loss": 0.0094, + "step": 24430 + }, + { + "epoch": 9.935339568930459, + "grad_norm": 0.6388767644988165, + "learning_rate": 2.7760635901879154e-06, + "loss": 0.005, + "step": 24431 + }, + { + "epoch": 9.935746238308255, + "grad_norm": 0.8528639171336267, + "learning_rate": 2.7753630258123344e-06, + "loss": 0.0046, + "step": 24432 + }, + { + "epoch": 9.93615290768605, + "grad_norm": 0.0346669669226529, + "learning_rate": 2.774662535601292e-06, + "loss": 0.0005, + "step": 24433 + }, + { + "epoch": 9.936559577063846, + "grad_norm": 0.0037224889268033805, + "learning_rate": 2.773962119561975e-06, + "loss": 0.0, + "step": 24434 + }, + { + "epoch": 9.936966246441642, + "grad_norm": 0.5219963014178703, + "learning_rate": 2.7732617777015736e-06, + "loss": 0.004, + "step": 24435 + }, + { + "epoch": 9.937372915819438, + "grad_norm": 2.6881952135086653, + "learning_rate": 2.7725615100272773e-06, + "loss": 0.0383, + "step": 24436 + }, + { + "epoch": 9.937779585197235, + "grad_norm": 0.015123217545017335, + "learning_rate": 2.771861316546274e-06, + "loss": 0.0002, + "step": 24437 + }, + { + "epoch": 
9.938186254575031, + "grad_norm": 0.005844152959765142, + "learning_rate": 2.7711611972657494e-06, + "loss": 0.0001, + "step": 24438 + }, + { + "epoch": 9.938592923952827, + "grad_norm": 0.3752324620532504, + "learning_rate": 2.7704611521928958e-06, + "loss": 0.0036, + "step": 24439 + }, + { + "epoch": 9.938999593330623, + "grad_norm": 0.0657281974442613, + "learning_rate": 2.7697611813349014e-06, + "loss": 0.0005, + "step": 24440 + }, + { + "epoch": 9.939406262708419, + "grad_norm": 0.0006541701873162598, + "learning_rate": 2.769061284698944e-06, + "loss": 0.0, + "step": 24441 + }, + { + "epoch": 9.939812932086214, + "grad_norm": 0.002101732834102364, + "learning_rate": 2.7683614622922105e-06, + "loss": 0.0, + "step": 24442 + }, + { + "epoch": 9.94021960146401, + "grad_norm": 2.247515939002791, + "learning_rate": 2.7676617141218842e-06, + "loss": 0.0436, + "step": 24443 + }, + { + "epoch": 9.940626270841806, + "grad_norm": 0.0007562279413983232, + "learning_rate": 2.7669620401951537e-06, + "loss": 0.0, + "step": 24444 + }, + { + "epoch": 9.941032940219602, + "grad_norm": 0.19862779255624105, + "learning_rate": 2.7662624405191975e-06, + "loss": 0.0013, + "step": 24445 + }, + { + "epoch": 9.941439609597397, + "grad_norm": 0.0018270572374755058, + "learning_rate": 2.7655629151011977e-06, + "loss": 0.0, + "step": 24446 + }, + { + "epoch": 9.941846278975193, + "grad_norm": 0.1861941115487599, + "learning_rate": 2.764863463948336e-06, + "loss": 0.0019, + "step": 24447 + }, + { + "epoch": 9.942252948352989, + "grad_norm": 29.279306153840576, + "learning_rate": 2.764164087067793e-06, + "loss": 0.1702, + "step": 24448 + }, + { + "epoch": 9.942659617730785, + "grad_norm": 0.14167443817238512, + "learning_rate": 2.763464784466746e-06, + "loss": 0.0003, + "step": 24449 + }, + { + "epoch": 9.94306628710858, + "grad_norm": 0.0020382270857721902, + "learning_rate": 2.7627655561523736e-06, + "loss": 0.0, + "step": 24450 + }, + { + "epoch": 9.943472956486376, + "grad_norm": 1.8576497574685218, + "learning_rate": 2.7620664021318564e-06, + "loss": 0.019, + "step": 24451 + }, + { + "epoch": 9.943879625864172, + "grad_norm": 0.1693083507979997, + "learning_rate": 2.7613673224123714e-06, + "loss": 0.0023, + "step": 24452 + }, + { + "epoch": 9.944286295241968, + "grad_norm": 0.0010966625208012342, + "learning_rate": 2.7606683170010938e-06, + "loss": 0.0, + "step": 24453 + }, + { + "epoch": 9.944692964619763, + "grad_norm": 0.6853163184085889, + "learning_rate": 2.7599693859052e-06, + "loss": 0.0091, + "step": 24454 + }, + { + "epoch": 9.94509963399756, + "grad_norm": 0.03175404395173518, + "learning_rate": 2.7592705291318633e-06, + "loss": 0.0004, + "step": 24455 + }, + { + "epoch": 9.945506303375355, + "grad_norm": 0.0003389891440917931, + "learning_rate": 2.75857174668826e-06, + "loss": 0.0, + "step": 24456 + }, + { + "epoch": 9.945912972753153, + "grad_norm": 0.15048917544526072, + "learning_rate": 2.7578730385815577e-06, + "loss": 0.0009, + "step": 24457 + }, + { + "epoch": 9.946319642130948, + "grad_norm": 0.08081177345128598, + "learning_rate": 2.7571744048189377e-06, + "loss": 0.0006, + "step": 24458 + }, + { + "epoch": 9.946726311508744, + "grad_norm": 0.011125949699073932, + "learning_rate": 2.756475845407567e-06, + "loss": 0.0001, + "step": 24459 + }, + { + "epoch": 9.94713298088654, + "grad_norm": 0.006875352116012057, + "learning_rate": 2.755777360354618e-06, + "loss": 0.0001, + "step": 24460 + }, + { + "epoch": 9.947539650264336, + "grad_norm": 0.09257226592658814, + "learning_rate": 
2.7550789496672602e-06, + "loss": 0.001, + "step": 24461 + }, + { + "epoch": 9.947946319642131, + "grad_norm": 0.0005248220767708374, + "learning_rate": 2.754380613352663e-06, + "loss": 0.0, + "step": 24462 + }, + { + "epoch": 9.948352989019927, + "grad_norm": 0.08917080580944894, + "learning_rate": 2.753682351417992e-06, + "loss": 0.0007, + "step": 24463 + }, + { + "epoch": 9.948759658397723, + "grad_norm": 0.034916613474935035, + "learning_rate": 2.7529841638704225e-06, + "loss": 0.0002, + "step": 24464 + }, + { + "epoch": 9.949166327775519, + "grad_norm": 0.017549427717290372, + "learning_rate": 2.752286050717118e-06, + "loss": 0.0002, + "step": 24465 + }, + { + "epoch": 9.949572997153314, + "grad_norm": 0.0015156441357660714, + "learning_rate": 2.751588011965244e-06, + "loss": 0.0, + "step": 24466 + }, + { + "epoch": 9.94997966653111, + "grad_norm": 0.004870839744742093, + "learning_rate": 2.750890047621968e-06, + "loss": 0.0, + "step": 24467 + }, + { + "epoch": 9.950386335908906, + "grad_norm": 7.333330669298736, + "learning_rate": 2.7501921576944533e-06, + "loss": 0.1179, + "step": 24468 + }, + { + "epoch": 9.950793005286702, + "grad_norm": 0.025362221373252076, + "learning_rate": 2.7494943421898655e-06, + "loss": 0.0003, + "step": 24469 + }, + { + "epoch": 9.951199674664498, + "grad_norm": 0.34343102549276744, + "learning_rate": 2.7487966011153635e-06, + "loss": 0.0039, + "step": 24470 + }, + { + "epoch": 9.951606344042293, + "grad_norm": 0.005676571294900835, + "learning_rate": 2.7480989344781172e-06, + "loss": 0.0001, + "step": 24471 + }, + { + "epoch": 9.952013013420089, + "grad_norm": 0.005055545785688754, + "learning_rate": 2.747401342285285e-06, + "loss": 0.0001, + "step": 24472 + }, + { + "epoch": 9.952419682797885, + "grad_norm": 0.057908274812456924, + "learning_rate": 2.746703824544029e-06, + "loss": 0.0005, + "step": 24473 + }, + { + "epoch": 9.95282635217568, + "grad_norm": 0.029777815098768434, + "learning_rate": 2.746006381261509e-06, + "loss": 0.0004, + "step": 24474 + }, + { + "epoch": 9.953233021553476, + "grad_norm": 3.007969605999734, + "learning_rate": 2.745309012444883e-06, + "loss": 0.0202, + "step": 24475 + }, + { + "epoch": 9.953639690931272, + "grad_norm": 0.07295017805992851, + "learning_rate": 2.744611718101309e-06, + "loss": 0.0011, + "step": 24476 + }, + { + "epoch": 9.954046360309068, + "grad_norm": 0.18180220077771816, + "learning_rate": 2.7439144982379506e-06, + "loss": 0.0019, + "step": 24477 + }, + { + "epoch": 9.954453029686865, + "grad_norm": 0.06272164071867956, + "learning_rate": 2.743217352861961e-06, + "loss": 0.0004, + "step": 24478 + }, + { + "epoch": 9.954859699064661, + "grad_norm": 0.14871517553075342, + "learning_rate": 2.7425202819804976e-06, + "loss": 0.0015, + "step": 24479 + }, + { + "epoch": 9.955266368442457, + "grad_norm": 2.884735834186444, + "learning_rate": 2.7418232856007164e-06, + "loss": 0.0418, + "step": 24480 + }, + { + "epoch": 9.955673037820253, + "grad_norm": 0.004957067582978326, + "learning_rate": 2.7411263637297726e-06, + "loss": 0.0, + "step": 24481 + }, + { + "epoch": 9.956079707198048, + "grad_norm": 0.019701916212102457, + "learning_rate": 2.7404295163748184e-06, + "loss": 0.0002, + "step": 24482 + }, + { + "epoch": 9.956486376575844, + "grad_norm": 0.6389987756302816, + "learning_rate": 2.7397327435430075e-06, + "loss": 0.0086, + "step": 24483 + }, + { + "epoch": 9.95689304595364, + "grad_norm": 0.15486411620431795, + "learning_rate": 2.7390360452414964e-06, + "loss": 0.0014, + "step": 24484 + }, + { + "epoch": 
9.957299715331436, + "grad_norm": 0.23873678899033493, + "learning_rate": 2.738339421477435e-06, + "loss": 0.0021, + "step": 24485 + }, + { + "epoch": 9.957706384709232, + "grad_norm": 0.035894607878099574, + "learning_rate": 2.737642872257974e-06, + "loss": 0.0005, + "step": 24486 + }, + { + "epoch": 9.958113054087027, + "grad_norm": 0.4105122688598149, + "learning_rate": 2.7369463975902634e-06, + "loss": 0.0042, + "step": 24487 + }, + { + "epoch": 9.958519723464823, + "grad_norm": 0.007666109791913475, + "learning_rate": 2.7362499974814537e-06, + "loss": 0.0001, + "step": 24488 + }, + { + "epoch": 9.958926392842619, + "grad_norm": 0.010471343106500264, + "learning_rate": 2.735553671938691e-06, + "loss": 0.0001, + "step": 24489 + }, + { + "epoch": 9.959333062220415, + "grad_norm": 0.01571903665027257, + "learning_rate": 2.7348574209691293e-06, + "loss": 0.0002, + "step": 24490 + }, + { + "epoch": 9.95973973159821, + "grad_norm": 0.027452759588634783, + "learning_rate": 2.7341612445799114e-06, + "loss": 0.0003, + "step": 24491 + }, + { + "epoch": 9.960146400976006, + "grad_norm": 3.147441140632497, + "learning_rate": 2.7334651427781854e-06, + "loss": 0.0446, + "step": 24492 + }, + { + "epoch": 9.960553070353802, + "grad_norm": 0.08186534347572628, + "learning_rate": 2.7327691155710978e-06, + "loss": 0.0008, + "step": 24493 + }, + { + "epoch": 9.960959739731598, + "grad_norm": 0.0014754248118963413, + "learning_rate": 2.732073162965793e-06, + "loss": 0.0, + "step": 24494 + }, + { + "epoch": 9.961366409109393, + "grad_norm": 1.4398233679649879, + "learning_rate": 2.731377284969413e-06, + "loss": 0.0128, + "step": 24495 + }, + { + "epoch": 9.96177307848719, + "grad_norm": 0.01896193827838082, + "learning_rate": 2.7306814815891025e-06, + "loss": 0.0001, + "step": 24496 + }, + { + "epoch": 9.962179747864985, + "grad_norm": 0.002102419268455336, + "learning_rate": 2.729985752832007e-06, + "loss": 0.0, + "step": 24497 + }, + { + "epoch": 9.962586417242782, + "grad_norm": 0.01553073520153139, + "learning_rate": 2.7292900987052673e-06, + "loss": 0.0002, + "step": 24498 + }, + { + "epoch": 9.962993086620578, + "grad_norm": 1.3838339433546112, + "learning_rate": 2.7285945192160236e-06, + "loss": 0.0133, + "step": 24499 + }, + { + "epoch": 9.963399755998374, + "grad_norm": 4.697732157969609, + "learning_rate": 2.727899014371417e-06, + "loss": 0.1216, + "step": 24500 + }, + { + "epoch": 9.96380642537617, + "grad_norm": 11.095067107983928, + "learning_rate": 2.727203584178587e-06, + "loss": 0.2493, + "step": 24501 + }, + { + "epoch": 9.964213094753966, + "grad_norm": 0.07637804728149782, + "learning_rate": 2.72650822864467e-06, + "loss": 0.0007, + "step": 24502 + }, + { + "epoch": 9.964619764131761, + "grad_norm": 0.004879659684937135, + "learning_rate": 2.725812947776808e-06, + "loss": 0.0001, + "step": 24503 + }, + { + "epoch": 9.965026433509557, + "grad_norm": 0.005885221338414924, + "learning_rate": 2.7251177415821385e-06, + "loss": 0.0001, + "step": 24504 + }, + { + "epoch": 9.965433102887353, + "grad_norm": 0.0016211836787179067, + "learning_rate": 2.7244226100677963e-06, + "loss": 0.0, + "step": 24505 + }, + { + "epoch": 9.965839772265149, + "grad_norm": 0.00822165393260609, + "learning_rate": 2.723727553240918e-06, + "loss": 0.0001, + "step": 24506 + }, + { + "epoch": 9.966246441642944, + "grad_norm": 3.7856068391956694e-05, + "learning_rate": 2.7230325711086393e-06, + "loss": 0.0, + "step": 24507 + }, + { + "epoch": 9.96665311102074, + "grad_norm": 0.0034387439596863137, + "learning_rate": 
2.722337663678092e-06, + "loss": 0.0001, + "step": 24508 + }, + { + "epoch": 9.967059780398536, + "grad_norm": 0.16847676724338512, + "learning_rate": 2.721642830956409e-06, + "loss": 0.0009, + "step": 24509 + }, + { + "epoch": 9.967466449776332, + "grad_norm": 0.005234880022898803, + "learning_rate": 2.720948072950729e-06, + "loss": 0.0001, + "step": 24510 + }, + { + "epoch": 9.967873119154127, + "grad_norm": 0.01951242329638871, + "learning_rate": 2.7202533896681805e-06, + "loss": 0.0001, + "step": 24511 + }, + { + "epoch": 9.968279788531923, + "grad_norm": 0.10359100908594017, + "learning_rate": 2.719558781115894e-06, + "loss": 0.0012, + "step": 24512 + }, + { + "epoch": 9.968686457909719, + "grad_norm": 0.014839639898822024, + "learning_rate": 2.7188642473010007e-06, + "loss": 0.0002, + "step": 24513 + }, + { + "epoch": 9.969093127287515, + "grad_norm": 1.7369149803187935, + "learning_rate": 2.718169788230631e-06, + "loss": 0.012, + "step": 24514 + }, + { + "epoch": 9.96949979666531, + "grad_norm": 0.0024538263872084343, + "learning_rate": 2.71747540391191e-06, + "loss": 0.0, + "step": 24515 + }, + { + "epoch": 9.969906466043106, + "grad_norm": 0.03529138695582372, + "learning_rate": 2.7167810943519725e-06, + "loss": 0.0004, + "step": 24516 + }, + { + "epoch": 9.970313135420902, + "grad_norm": 0.03896940584158204, + "learning_rate": 2.7160868595579427e-06, + "loss": 0.0004, + "step": 24517 + }, + { + "epoch": 9.970719804798698, + "grad_norm": 1.4823756784031044, + "learning_rate": 2.7153926995369475e-06, + "loss": 0.0169, + "step": 24518 + }, + { + "epoch": 9.971126474176495, + "grad_norm": 0.8386942429284806, + "learning_rate": 2.714698614296112e-06, + "loss": 0.0053, + "step": 24519 + }, + { + "epoch": 9.971533143554291, + "grad_norm": 0.08289481467115709, + "learning_rate": 2.7140046038425615e-06, + "loss": 0.0007, + "step": 24520 + }, + { + "epoch": 9.971939812932087, + "grad_norm": 2.332964073957159, + "learning_rate": 2.713310668183421e-06, + "loss": 0.0268, + "step": 24521 + }, + { + "epoch": 9.972346482309883, + "grad_norm": 0.022556007892752482, + "learning_rate": 2.712616807325811e-06, + "loss": 0.0002, + "step": 24522 + }, + { + "epoch": 9.972753151687678, + "grad_norm": 1.2416148945575838, + "learning_rate": 2.7119230212768597e-06, + "loss": 0.0137, + "step": 24523 + }, + { + "epoch": 9.973159821065474, + "grad_norm": 0.2789227192436427, + "learning_rate": 2.711229310043687e-06, + "loss": 0.0035, + "step": 24524 + }, + { + "epoch": 9.97356649044327, + "grad_norm": 0.5685470017014337, + "learning_rate": 2.710535673633413e-06, + "loss": 0.0051, + "step": 24525 + }, + { + "epoch": 9.973973159821066, + "grad_norm": 0.019400021699118558, + "learning_rate": 2.7098421120531604e-06, + "loss": 0.0002, + "step": 24526 + }, + { + "epoch": 9.974379829198861, + "grad_norm": 0.002109006058303661, + "learning_rate": 2.7091486253100464e-06, + "loss": 0.0, + "step": 24527 + }, + { + "epoch": 9.974786498576657, + "grad_norm": 0.7554711792787009, + "learning_rate": 2.7084552134111884e-06, + "loss": 0.0071, + "step": 24528 + }, + { + "epoch": 9.975193167954453, + "grad_norm": 0.5616278655710278, + "learning_rate": 2.7077618763637104e-06, + "loss": 0.0066, + "step": 24529 + }, + { + "epoch": 9.975599837332249, + "grad_norm": 0.012649255430169353, + "learning_rate": 2.707068614174727e-06, + "loss": 0.0002, + "step": 24530 + }, + { + "epoch": 9.976006506710045, + "grad_norm": 0.5847474489360699, + "learning_rate": 2.7063754268513554e-06, + "loss": 0.0052, + "step": 24531 + }, + { + "epoch": 
9.97641317608784, + "grad_norm": 0.0031278109063217156, + "learning_rate": 2.7056823144007103e-06, + "loss": 0.0, + "step": 24532 + }, + { + "epoch": 9.976819845465636, + "grad_norm": 0.28191646035633355, + "learning_rate": 2.7049892768299068e-06, + "loss": 0.003, + "step": 24533 + }, + { + "epoch": 9.977226514843432, + "grad_norm": 12.676262852552501, + "learning_rate": 2.704296314146061e-06, + "loss": 0.0756, + "step": 24534 + }, + { + "epoch": 9.977633184221228, + "grad_norm": 0.7180671126793954, + "learning_rate": 2.7036034263562817e-06, + "loss": 0.008, + "step": 24535 + }, + { + "epoch": 9.978039853599023, + "grad_norm": 0.01977877811989736, + "learning_rate": 2.7029106134676887e-06, + "loss": 0.0003, + "step": 24536 + }, + { + "epoch": 9.978446522976819, + "grad_norm": 16.0845035304746, + "learning_rate": 2.7022178754873907e-06, + "loss": 0.5105, + "step": 24537 + }, + { + "epoch": 9.978853192354615, + "grad_norm": 16.787103005186673, + "learning_rate": 2.701525212422499e-06, + "loss": 0.2954, + "step": 24538 + }, + { + "epoch": 9.979259861732412, + "grad_norm": 0.8323186411597151, + "learning_rate": 2.700832624280124e-06, + "loss": 0.0096, + "step": 24539 + }, + { + "epoch": 9.979666531110208, + "grad_norm": 2.2477727648277517, + "learning_rate": 2.7001401110673765e-06, + "loss": 0.0391, + "step": 24540 + }, + { + "epoch": 9.980073200488004, + "grad_norm": 0.0023067013935711973, + "learning_rate": 2.699447672791363e-06, + "loss": 0.0, + "step": 24541 + }, + { + "epoch": 9.9804798698658, + "grad_norm": 0.5231020924340564, + "learning_rate": 2.6987553094591955e-06, + "loss": 0.005, + "step": 24542 + }, + { + "epoch": 9.980886539243595, + "grad_norm": 0.0011680364476777698, + "learning_rate": 2.698063021077978e-06, + "loss": 0.0, + "step": 24543 + }, + { + "epoch": 9.981293208621391, + "grad_norm": 1.4059537009175707, + "learning_rate": 2.69737080765482e-06, + "loss": 0.0141, + "step": 24544 + }, + { + "epoch": 9.981699877999187, + "grad_norm": 0.22085579809304742, + "learning_rate": 2.696678669196825e-06, + "loss": 0.0025, + "step": 24545 + }, + { + "epoch": 9.982106547376983, + "grad_norm": 0.06314533407897935, + "learning_rate": 2.695986605711096e-06, + "loss": 0.0008, + "step": 24546 + }, + { + "epoch": 9.982513216754779, + "grad_norm": 0.0039446409645876816, + "learning_rate": 2.6952946172047432e-06, + "loss": 0.0, + "step": 24547 + }, + { + "epoch": 9.982919886132574, + "grad_norm": 0.0002680832090611756, + "learning_rate": 2.6946027036848687e-06, + "loss": 0.0, + "step": 24548 + }, + { + "epoch": 9.98332655551037, + "grad_norm": 0.06941073309101309, + "learning_rate": 2.693910865158573e-06, + "loss": 0.0006, + "step": 24549 + }, + { + "epoch": 9.983733224888166, + "grad_norm": 0.06509270778150705, + "learning_rate": 2.6932191016329602e-06, + "loss": 0.0006, + "step": 24550 + }, + { + "epoch": 9.984139894265962, + "grad_norm": 1.522049377942788, + "learning_rate": 2.6925274131151303e-06, + "loss": 0.024, + "step": 24551 + }, + { + "epoch": 9.984546563643757, + "grad_norm": 0.024983009260672418, + "learning_rate": 2.6918357996121848e-06, + "loss": 0.0004, + "step": 24552 + }, + { + "epoch": 9.984953233021553, + "grad_norm": 0.36816123340288714, + "learning_rate": 2.691144261131219e-06, + "loss": 0.0042, + "step": 24553 + }, + { + "epoch": 9.985359902399349, + "grad_norm": 0.03882864946695976, + "learning_rate": 2.690452797679339e-06, + "loss": 0.0003, + "step": 24554 + }, + { + "epoch": 9.985766571777145, + "grad_norm": 0.0617098560225613, + "learning_rate": 
2.6897614092636413e-06, + "loss": 0.0006, + "step": 24555 + }, + { + "epoch": 9.98617324115494, + "grad_norm": 1.0313799066462597, + "learning_rate": 2.689070095891221e-06, + "loss": 0.0107, + "step": 24556 + }, + { + "epoch": 9.986579910532736, + "grad_norm": 0.0847988556897356, + "learning_rate": 2.688378857569175e-06, + "loss": 0.0007, + "step": 24557 + }, + { + "epoch": 9.986986579910532, + "grad_norm": 0.023785872871135014, + "learning_rate": 2.6876876943046004e-06, + "loss": 0.0003, + "step": 24558 + }, + { + "epoch": 9.987393249288328, + "grad_norm": 1.2829825565194328, + "learning_rate": 2.6869966061045895e-06, + "loss": 0.0239, + "step": 24559 + }, + { + "epoch": 9.987799918666125, + "grad_norm": 9.195407377140876, + "learning_rate": 2.6863055929762406e-06, + "loss": 0.4695, + "step": 24560 + }, + { + "epoch": 9.988206588043921, + "grad_norm": 0.0503117138922541, + "learning_rate": 2.685614654926647e-06, + "loss": 0.0005, + "step": 24561 + }, + { + "epoch": 9.988613257421717, + "grad_norm": 0.0005357013463320432, + "learning_rate": 2.684923791962899e-06, + "loss": 0.0, + "step": 24562 + }, + { + "epoch": 9.989019926799513, + "grad_norm": 0.08597123917545844, + "learning_rate": 2.6842330040920906e-06, + "loss": 0.001, + "step": 24563 + }, + { + "epoch": 9.989426596177308, + "grad_norm": 4.0819322890909735, + "learning_rate": 2.6835422913213126e-06, + "loss": 0.0655, + "step": 24564 + }, + { + "epoch": 9.989833265555104, + "grad_norm": 0.27342929601602284, + "learning_rate": 2.6828516536576555e-06, + "loss": 0.003, + "step": 24565 + }, + { + "epoch": 9.9902399349329, + "grad_norm": 0.048200583861618615, + "learning_rate": 2.6821610911082054e-06, + "loss": 0.0005, + "step": 24566 + }, + { + "epoch": 9.990646604310696, + "grad_norm": 0.0064992181309084665, + "learning_rate": 2.6814706036800576e-06, + "loss": 0.0001, + "step": 24567 + }, + { + "epoch": 9.991053273688491, + "grad_norm": 0.011395638909373626, + "learning_rate": 2.6807801913802967e-06, + "loss": 0.0002, + "step": 24568 + }, + { + "epoch": 9.991459943066287, + "grad_norm": 0.28665932938163674, + "learning_rate": 2.6800898542160114e-06, + "loss": 0.0008, + "step": 24569 + }, + { + "epoch": 9.991866612444083, + "grad_norm": 0.04102566755788077, + "learning_rate": 2.6793995921942885e-06, + "loss": 0.0004, + "step": 24570 + }, + { + "epoch": 9.992273281821879, + "grad_norm": 1.612597990315356, + "learning_rate": 2.6787094053222118e-06, + "loss": 0.0227, + "step": 24571 + }, + { + "epoch": 9.992679951199674, + "grad_norm": 0.01184269725278923, + "learning_rate": 2.6780192936068695e-06, + "loss": 0.0001, + "step": 24572 + }, + { + "epoch": 9.99308662057747, + "grad_norm": 0.01702525793210788, + "learning_rate": 2.67732925705534e-06, + "loss": 0.0002, + "step": 24573 + }, + { + "epoch": 9.993493289955266, + "grad_norm": 0.016854305352532815, + "learning_rate": 2.6766392956747145e-06, + "loss": 0.0002, + "step": 24574 + }, + { + "epoch": 9.993899959333062, + "grad_norm": 3.9726884046123487, + "learning_rate": 2.675949409472072e-06, + "loss": 0.042, + "step": 24575 + }, + { + "epoch": 9.994306628710858, + "grad_norm": 0.427324049070725, + "learning_rate": 2.675259598454495e-06, + "loss": 0.0025, + "step": 24576 + }, + { + "epoch": 9.994713298088653, + "grad_norm": 0.0023302041076105804, + "learning_rate": 2.674569862629065e-06, + "loss": 0.0, + "step": 24577 + }, + { + "epoch": 9.995119967466449, + "grad_norm": 0.0063552572930746405, + "learning_rate": 2.673880202002862e-06, + "loss": 0.0001, + "step": 24578 + }, + { + "epoch": 
9.995526636844245, + "grad_norm": 0.09705277319615435, + "learning_rate": 2.6731906165829623e-06, + "loss": 0.0007, + "step": 24579 + }, + { + "epoch": 9.995933306222042, + "grad_norm": 0.03135326480570379, + "learning_rate": 2.6725011063764526e-06, + "loss": 0.0003, + "step": 24580 + }, + { + "epoch": 9.996339975599838, + "grad_norm": 0.614225559760013, + "learning_rate": 2.6718116713904065e-06, + "loss": 0.0046, + "step": 24581 + }, + { + "epoch": 9.996746644977634, + "grad_norm": 0.05058022785865677, + "learning_rate": 2.6711223116319017e-06, + "loss": 0.0005, + "step": 24582 + }, + { + "epoch": 9.99715331435543, + "grad_norm": 0.08658078143667532, + "learning_rate": 2.670433027108015e-06, + "loss": 0.001, + "step": 24583 + }, + { + "epoch": 9.997559983733225, + "grad_norm": 0.17408225950826337, + "learning_rate": 2.6697438178258216e-06, + "loss": 0.0018, + "step": 24584 + }, + { + "epoch": 9.997966653111021, + "grad_norm": 0.040228741528822394, + "learning_rate": 2.669054683792398e-06, + "loss": 0.0005, + "step": 24585 + }, + { + "epoch": 9.998373322488817, + "grad_norm": 0.1857683864713744, + "learning_rate": 2.6683656250148136e-06, + "loss": 0.0009, + "step": 24586 + }, + { + "epoch": 9.998779991866613, + "grad_norm": 0.11733578820967645, + "learning_rate": 2.66767664150015e-06, + "loss": 0.0014, + "step": 24587 + }, + { + "epoch": 9.999186661244408, + "grad_norm": 1.6597677774178574, + "learning_rate": 2.666987733255476e-06, + "loss": 0.0079, + "step": 24588 + }, + { + "epoch": 9.999593330622204, + "grad_norm": 0.16918715614002705, + "learning_rate": 2.6662989002878625e-06, + "loss": 0.0018, + "step": 24589 + }, + { + "epoch": 10.0, + "grad_norm": 0.014157319764007276, + "learning_rate": 2.6656101426043822e-06, + "loss": 0.0001, + "step": 24590 + }, + { + "epoch": 10.000406669377796, + "grad_norm": 0.7275898552351999, + "learning_rate": 2.6649214602121055e-06, + "loss": 0.0064, + "step": 24591 + }, + { + "epoch": 10.000813338755592, + "grad_norm": 0.007274581642561443, + "learning_rate": 2.6642328531180983e-06, + "loss": 0.0001, + "step": 24592 + }, + { + "epoch": 10.001220008133387, + "grad_norm": 0.05099772226592966, + "learning_rate": 2.6635443213294367e-06, + "loss": 0.0007, + "step": 24593 + }, + { + "epoch": 10.001626677511183, + "grad_norm": 0.057418901011900975, + "learning_rate": 2.6628558648531845e-06, + "loss": 0.0005, + "step": 24594 + }, + { + "epoch": 10.002033346888979, + "grad_norm": 0.14234400100227393, + "learning_rate": 2.6621674836964097e-06, + "loss": 0.0013, + "step": 24595 + }, + { + "epoch": 10.002440016266775, + "grad_norm": 0.02006673183648532, + "learning_rate": 2.6614791778661776e-06, + "loss": 0.0002, + "step": 24596 + }, + { + "epoch": 10.00284668564457, + "grad_norm": 0.16622514830989987, + "learning_rate": 2.6607909473695557e-06, + "loss": 0.0024, + "step": 24597 + }, + { + "epoch": 10.003253355022366, + "grad_norm": 0.3081203921494798, + "learning_rate": 2.660102792213609e-06, + "loss": 0.0032, + "step": 24598 + }, + { + "epoch": 10.003660024400162, + "grad_norm": 0.006022930243181205, + "learning_rate": 2.6594147124053983e-06, + "loss": 0.0, + "step": 24599 + }, + { + "epoch": 10.004066693777958, + "grad_norm": 0.025808059734853277, + "learning_rate": 2.658726707951993e-06, + "loss": 0.0003, + "step": 24600 + }, + { + "epoch": 10.004473363155755, + "grad_norm": 0.008133101290196768, + "learning_rate": 2.658038778860452e-06, + "loss": 0.0001, + "step": 24601 + }, + { + "epoch": 10.004880032533551, + "grad_norm": 0.29251003947556214, + 
"learning_rate": 2.6573509251378393e-06, + "loss": 0.0032, + "step": 24602 + }, + { + "epoch": 10.005286701911347, + "grad_norm": 0.030663782788334933, + "learning_rate": 2.656663146791214e-06, + "loss": 0.0004, + "step": 24603 + }, + { + "epoch": 10.005693371289142, + "grad_norm": 0.04693839803169689, + "learning_rate": 2.6559754438276376e-06, + "loss": 0.0006, + "step": 24604 + }, + { + "epoch": 10.006100040666938, + "grad_norm": 0.14918048129084124, + "learning_rate": 2.6552878162541663e-06, + "loss": 0.0018, + "step": 24605 + }, + { + "epoch": 10.006506710044734, + "grad_norm": 0.15212939252288715, + "learning_rate": 2.6546002640778644e-06, + "loss": 0.0013, + "step": 24606 + }, + { + "epoch": 10.00691337942253, + "grad_norm": 0.7496374881908113, + "learning_rate": 2.653912787305789e-06, + "loss": 0.0079, + "step": 24607 + }, + { + "epoch": 10.007320048800326, + "grad_norm": 0.006526812668587443, + "learning_rate": 2.653225385944995e-06, + "loss": 0.0001, + "step": 24608 + }, + { + "epoch": 10.007726718178121, + "grad_norm": 0.2333903641555611, + "learning_rate": 2.6525380600025407e-06, + "loss": 0.0023, + "step": 24609 + }, + { + "epoch": 10.008133387555917, + "grad_norm": 0.03356770914847479, + "learning_rate": 2.6518508094854813e-06, + "loss": 0.0003, + "step": 24610 + }, + { + "epoch": 10.008540056933713, + "grad_norm": 0.43608935940077836, + "learning_rate": 2.6511636344008708e-06, + "loss": 0.0039, + "step": 24611 + }, + { + "epoch": 10.008946726311509, + "grad_norm": 0.008841395465108395, + "learning_rate": 2.6504765347557614e-06, + "loss": 0.0001, + "step": 24612 + }, + { + "epoch": 10.009353395689304, + "grad_norm": 0.10092162955265084, + "learning_rate": 2.6497895105572125e-06, + "loss": 0.0008, + "step": 24613 + }, + { + "epoch": 10.0097600650671, + "grad_norm": 0.1648568367077272, + "learning_rate": 2.6491025618122733e-06, + "loss": 0.0016, + "step": 24614 + }, + { + "epoch": 10.010166734444896, + "grad_norm": 0.04918643940997951, + "learning_rate": 2.6484156885279955e-06, + "loss": 0.0007, + "step": 24615 + }, + { + "epoch": 10.010573403822692, + "grad_norm": 0.18748521577721153, + "learning_rate": 2.647728890711432e-06, + "loss": 0.003, + "step": 24616 + }, + { + "epoch": 10.010980073200487, + "grad_norm": 0.006866001976941866, + "learning_rate": 2.6470421683696302e-06, + "loss": 0.0001, + "step": 24617 + }, + { + "epoch": 10.011386742578283, + "grad_norm": 0.0017032633494789453, + "learning_rate": 2.646355521509639e-06, + "loss": 0.0, + "step": 24618 + }, + { + "epoch": 10.011793411956079, + "grad_norm": 0.0057104866967009485, + "learning_rate": 2.6456689501385125e-06, + "loss": 0.0001, + "step": 24619 + }, + { + "epoch": 10.012200081333875, + "grad_norm": 0.05822016461986388, + "learning_rate": 2.6449824542632953e-06, + "loss": 0.0002, + "step": 24620 + }, + { + "epoch": 10.012606750711672, + "grad_norm": 0.14245963275995496, + "learning_rate": 2.644296033891035e-06, + "loss": 0.0013, + "step": 24621 + }, + { + "epoch": 10.013013420089468, + "grad_norm": 1.8783580320155189, + "learning_rate": 2.643609689028778e-06, + "loss": 0.0269, + "step": 24622 + }, + { + "epoch": 10.013420089467264, + "grad_norm": 0.0004934443648960676, + "learning_rate": 2.6429234196835696e-06, + "loss": 0.0, + "step": 24623 + }, + { + "epoch": 10.01382675884506, + "grad_norm": 18.77767517736941, + "learning_rate": 2.6422372258624563e-06, + "loss": 0.2531, + "step": 24624 + }, + { + "epoch": 10.014233428222855, + "grad_norm": 0.006563815009569395, + "learning_rate": 2.6415511075724766e-06, + 
"loss": 0.0001, + "step": 24625 + }, + { + "epoch": 10.014640097600651, + "grad_norm": 0.09882546257821295, + "learning_rate": 2.6408650648206812e-06, + "loss": 0.0009, + "step": 24626 + }, + { + "epoch": 10.015046766978447, + "grad_norm": 0.11569908022715397, + "learning_rate": 2.6401790976141108e-06, + "loss": 0.0015, + "step": 24627 + }, + { + "epoch": 10.015453436356243, + "grad_norm": 0.0009556246203397983, + "learning_rate": 2.6394932059598053e-06, + "loss": 0.0, + "step": 24628 + }, + { + "epoch": 10.015860105734038, + "grad_norm": 0.006448251666227613, + "learning_rate": 2.638807389864807e-06, + "loss": 0.0001, + "step": 24629 + }, + { + "epoch": 10.016266775111834, + "grad_norm": 0.29161579631566725, + "learning_rate": 2.638121649336155e-06, + "loss": 0.0025, + "step": 24630 + }, + { + "epoch": 10.01667344448963, + "grad_norm": 0.11184271013744028, + "learning_rate": 2.637435984380886e-06, + "loss": 0.0012, + "step": 24631 + }, + { + "epoch": 10.017080113867426, + "grad_norm": 0.1006674263751232, + "learning_rate": 2.6367503950060458e-06, + "loss": 0.0009, + "step": 24632 + }, + { + "epoch": 10.017486783245221, + "grad_norm": 0.00989349741231511, + "learning_rate": 2.6360648812186696e-06, + "loss": 0.0002, + "step": 24633 + }, + { + "epoch": 10.017893452623017, + "grad_norm": 0.36899695379931224, + "learning_rate": 2.6353794430257916e-06, + "loss": 0.0046, + "step": 24634 + }, + { + "epoch": 10.018300122000813, + "grad_norm": 0.08392766419020113, + "learning_rate": 2.6346940804344513e-06, + "loss": 0.0011, + "step": 24635 + }, + { + "epoch": 10.018706791378609, + "grad_norm": 0.1675616612260924, + "learning_rate": 2.6340087934516834e-06, + "loss": 0.0016, + "step": 24636 + }, + { + "epoch": 10.019113460756405, + "grad_norm": 0.01577819023609587, + "learning_rate": 2.6333235820845225e-06, + "loss": 0.0002, + "step": 24637 + }, + { + "epoch": 10.0195201301342, + "grad_norm": 0.10453912148372711, + "learning_rate": 2.632638446339999e-06, + "loss": 0.0009, + "step": 24638 + }, + { + "epoch": 10.019926799511996, + "grad_norm": 0.024604361409920953, + "learning_rate": 2.631953386225156e-06, + "loss": 0.0002, + "step": 24639 + }, + { + "epoch": 10.020333468889792, + "grad_norm": 0.04427431578397133, + "learning_rate": 2.6312684017470167e-06, + "loss": 0.0004, + "step": 24640 + }, + { + "epoch": 10.020740138267588, + "grad_norm": 0.0346621807545316, + "learning_rate": 2.6305834929126163e-06, + "loss": 0.0004, + "step": 24641 + }, + { + "epoch": 10.021146807645385, + "grad_norm": 1.4787455820723154, + "learning_rate": 2.6298986597289844e-06, + "loss": 0.0142, + "step": 24642 + }, + { + "epoch": 10.021553477023181, + "grad_norm": 0.009896161806394221, + "learning_rate": 2.6292139022031494e-06, + "loss": 0.0001, + "step": 24643 + }, + { + "epoch": 10.021960146400977, + "grad_norm": 0.15922855929736843, + "learning_rate": 2.6285292203421463e-06, + "loss": 0.0026, + "step": 24644 + }, + { + "epoch": 10.022366815778772, + "grad_norm": 0.7866686918910722, + "learning_rate": 2.627844614153001e-06, + "loss": 0.0051, + "step": 24645 + }, + { + "epoch": 10.022773485156568, + "grad_norm": 0.0023726348181879204, + "learning_rate": 2.6271600836427415e-06, + "loss": 0.0, + "step": 24646 + }, + { + "epoch": 10.023180154534364, + "grad_norm": 0.19847977638135364, + "learning_rate": 2.6264756288183934e-06, + "loss": 0.0019, + "step": 24647 + }, + { + "epoch": 10.02358682391216, + "grad_norm": 0.293051075593966, + "learning_rate": 2.625791249686984e-06, + "loss": 0.0036, + "step": 24648 + }, + { + "epoch": 
10.023993493289955, + "grad_norm": 0.020575336025021585, + "learning_rate": 2.6251069462555368e-06, + "loss": 0.0002, + "step": 24649 + }, + { + "epoch": 10.024400162667751, + "grad_norm": 1.4767423713657142, + "learning_rate": 2.624422718531081e-06, + "loss": 0.0164, + "step": 24650 + }, + { + "epoch": 10.024806832045547, + "grad_norm": 21.468790157745943, + "learning_rate": 2.6237385665206384e-06, + "loss": 0.288, + "step": 24651 + }, + { + "epoch": 10.025213501423343, + "grad_norm": 0.05414445626748402, + "learning_rate": 2.6230544902312305e-06, + "loss": 0.0008, + "step": 24652 + }, + { + "epoch": 10.025620170801139, + "grad_norm": 0.43881293581358016, + "learning_rate": 2.622370489669882e-06, + "loss": 0.0046, + "step": 24653 + }, + { + "epoch": 10.026026840178934, + "grad_norm": 0.03152896872564428, + "learning_rate": 2.621686564843614e-06, + "loss": 0.0004, + "step": 24654 + }, + { + "epoch": 10.02643350955673, + "grad_norm": 0.0330102011091329, + "learning_rate": 2.6210027157594453e-06, + "loss": 0.0003, + "step": 24655 + }, + { + "epoch": 10.026840178934526, + "grad_norm": 0.028251870278658796, + "learning_rate": 2.6203189424243957e-06, + "loss": 0.0004, + "step": 24656 + }, + { + "epoch": 10.027246848312322, + "grad_norm": 0.010087893313906483, + "learning_rate": 2.61963524484549e-06, + "loss": 0.0001, + "step": 24657 + }, + { + "epoch": 10.027653517690117, + "grad_norm": 0.13153508868203043, + "learning_rate": 2.6189516230297416e-06, + "loss": 0.0009, + "step": 24658 + }, + { + "epoch": 10.028060187067913, + "grad_norm": 0.011188631863270921, + "learning_rate": 2.6182680769841695e-06, + "loss": 0.0001, + "step": 24659 + }, + { + "epoch": 10.028466856445709, + "grad_norm": 0.02086377868782013, + "learning_rate": 2.6175846067157915e-06, + "loss": 0.0003, + "step": 24660 + }, + { + "epoch": 10.028873525823505, + "grad_norm": 0.0053450818144329185, + "learning_rate": 2.616901212231623e-06, + "loss": 0.0001, + "step": 24661 + }, + { + "epoch": 10.029280195201302, + "grad_norm": 0.10621895614645827, + "learning_rate": 2.616217893538676e-06, + "loss": 0.001, + "step": 24662 + }, + { + "epoch": 10.029686864579098, + "grad_norm": 0.42256286689654243, + "learning_rate": 2.6155346506439715e-06, + "loss": 0.0062, + "step": 24663 + }, + { + "epoch": 10.030093533956894, + "grad_norm": 0.045022879401013266, + "learning_rate": 2.6148514835545203e-06, + "loss": 0.0006, + "step": 24664 + }, + { + "epoch": 10.03050020333469, + "grad_norm": 0.05190370937963987, + "learning_rate": 2.614168392277335e-06, + "loss": 0.0004, + "step": 24665 + }, + { + "epoch": 10.030906872712485, + "grad_norm": 0.12438219965920645, + "learning_rate": 2.613485376819429e-06, + "loss": 0.0009, + "step": 24666 + }, + { + "epoch": 10.031313542090281, + "grad_norm": 0.02131618776285954, + "learning_rate": 2.6128024371878135e-06, + "loss": 0.0003, + "step": 24667 + }, + { + "epoch": 10.031720211468077, + "grad_norm": 0.12511312864208451, + "learning_rate": 2.612119573389499e-06, + "loss": 0.001, + "step": 24668 + }, + { + "epoch": 10.032126880845873, + "grad_norm": 0.05674798250572788, + "learning_rate": 2.6114367854314913e-06, + "loss": 0.0007, + "step": 24669 + }, + { + "epoch": 10.032533550223668, + "grad_norm": 0.029647919906678727, + "learning_rate": 2.610754073320807e-06, + "loss": 0.0003, + "step": 24670 + }, + { + "epoch": 10.032940219601464, + "grad_norm": 0.17990015933981493, + "learning_rate": 2.610071437064452e-06, + "loss": 0.0012, + "step": 24671 + }, + { + "epoch": 10.03334688897926, + "grad_norm": 
0.0055490523267749635, + "learning_rate": 2.609388876669432e-06, + "loss": 0.0001, + "step": 24672 + }, + { + "epoch": 10.033753558357056, + "grad_norm": 0.034884213536801566, + "learning_rate": 2.6087063921427547e-06, + "loss": 0.0003, + "step": 24673 + }, + { + "epoch": 10.034160227734851, + "grad_norm": 6.634415029393858e-05, + "learning_rate": 2.6080239834914266e-06, + "loss": 0.0, + "step": 24674 + }, + { + "epoch": 10.034566897112647, + "grad_norm": 0.23908709015755045, + "learning_rate": 2.6073416507224523e-06, + "loss": 0.0032, + "step": 24675 + }, + { + "epoch": 10.034973566490443, + "grad_norm": 0.03227794972256575, + "learning_rate": 2.606659393842834e-06, + "loss": 0.0004, + "step": 24676 + }, + { + "epoch": 10.035380235868239, + "grad_norm": 0.03095320292932152, + "learning_rate": 2.6059772128595805e-06, + "loss": 0.0004, + "step": 24677 + }, + { + "epoch": 10.035786905246034, + "grad_norm": 1.2715809083709633, + "learning_rate": 2.605295107779693e-06, + "loss": 0.0026, + "step": 24678 + }, + { + "epoch": 10.03619357462383, + "grad_norm": 0.009998715122416903, + "learning_rate": 2.604613078610173e-06, + "loss": 0.0001, + "step": 24679 + }, + { + "epoch": 10.036600244001626, + "grad_norm": 0.5937500710255963, + "learning_rate": 2.6039311253580212e-06, + "loss": 0.0054, + "step": 24680 + }, + { + "epoch": 10.037006913379422, + "grad_norm": 1.3400410450510094, + "learning_rate": 2.6032492480302385e-06, + "loss": 0.0161, + "step": 24681 + }, + { + "epoch": 10.037413582757218, + "grad_norm": 0.09524900445151513, + "learning_rate": 2.6025674466338226e-06, + "loss": 0.0008, + "step": 24682 + }, + { + "epoch": 10.037820252135015, + "grad_norm": 0.06600271744641137, + "learning_rate": 2.6018857211757773e-06, + "loss": 0.0008, + "step": 24683 + }, + { + "epoch": 10.03822692151281, + "grad_norm": 0.0005161081918438491, + "learning_rate": 2.601204071663099e-06, + "loss": 0.0, + "step": 24684 + }, + { + "epoch": 10.038633590890607, + "grad_norm": 0.13885242363847242, + "learning_rate": 2.6005224981027845e-06, + "loss": 0.0014, + "step": 24685 + }, + { + "epoch": 10.039040260268402, + "grad_norm": 0.026015895869432717, + "learning_rate": 2.5998410005018303e-06, + "loss": 0.0002, + "step": 24686 + }, + { + "epoch": 10.039446929646198, + "grad_norm": 0.008092627420956391, + "learning_rate": 2.599159578867233e-06, + "loss": 0.0001, + "step": 24687 + }, + { + "epoch": 10.039853599023994, + "grad_norm": 0.013272358848130142, + "learning_rate": 2.5984782332059877e-06, + "loss": 0.0001, + "step": 24688 + }, + { + "epoch": 10.04026026840179, + "grad_norm": 0.23408270261752895, + "learning_rate": 2.5977969635250842e-06, + "loss": 0.0012, + "step": 24689 + }, + { + "epoch": 10.040666937779585, + "grad_norm": 0.046833068520789535, + "learning_rate": 2.597115769831524e-06, + "loss": 0.0004, + "step": 24690 + }, + { + "epoch": 10.041073607157381, + "grad_norm": 0.13166128134025928, + "learning_rate": 2.596434652132297e-06, + "loss": 0.002, + "step": 24691 + }, + { + "epoch": 10.041480276535177, + "grad_norm": 0.09370039078940932, + "learning_rate": 2.595753610434394e-06, + "loss": 0.0004, + "step": 24692 + }, + { + "epoch": 10.041886945912973, + "grad_norm": 0.0012672345006038601, + "learning_rate": 2.595072644744806e-06, + "loss": 0.0, + "step": 24693 + }, + { + "epoch": 10.042293615290768, + "grad_norm": 0.06740428866698371, + "learning_rate": 2.5943917550705243e-06, + "loss": 0.0007, + "step": 24694 + }, + { + "epoch": 10.042700284668564, + "grad_norm": 0.6303645195680977, + "learning_rate": 
2.593710941418537e-06, + "loss": 0.0066, + "step": 24695 + }, + { + "epoch": 10.04310695404636, + "grad_norm": 0.21564877152585074, + "learning_rate": 2.5930302037958353e-06, + "loss": 0.0024, + "step": 24696 + }, + { + "epoch": 10.043513623424156, + "grad_norm": 3.309902423258819, + "learning_rate": 2.592349542209407e-06, + "loss": 0.0894, + "step": 24697 + }, + { + "epoch": 10.043920292801952, + "grad_norm": 1.340049854590285, + "learning_rate": 2.591668956666239e-06, + "loss": 0.016, + "step": 24698 + }, + { + "epoch": 10.044326962179747, + "grad_norm": 0.009323522227043418, + "learning_rate": 2.590988447173318e-06, + "loss": 0.0001, + "step": 24699 + }, + { + "epoch": 10.044733631557543, + "grad_norm": 6.7027433437899, + "learning_rate": 2.590308013737629e-06, + "loss": 0.0459, + "step": 24700 + }, + { + "epoch": 10.045140300935339, + "grad_norm": 0.028035536764318594, + "learning_rate": 2.589627656366158e-06, + "loss": 0.0003, + "step": 24701 + }, + { + "epoch": 10.045546970313135, + "grad_norm": 0.07391082457421076, + "learning_rate": 2.5889473750658855e-06, + "loss": 0.0012, + "step": 24702 + }, + { + "epoch": 10.045953639690932, + "grad_norm": 2.803978702576841, + "learning_rate": 2.588267169843801e-06, + "loss": 0.0571, + "step": 24703 + }, + { + "epoch": 10.046360309068728, + "grad_norm": 0.014265366816803841, + "learning_rate": 2.5875870407068837e-06, + "loss": 0.0002, + "step": 24704 + }, + { + "epoch": 10.046766978446524, + "grad_norm": 0.010214914971414118, + "learning_rate": 2.5869069876621165e-06, + "loss": 0.0001, + "step": 24705 + }, + { + "epoch": 10.04717364782432, + "grad_norm": 0.005992530547361681, + "learning_rate": 2.58622701071648e-06, + "loss": 0.0001, + "step": 24706 + }, + { + "epoch": 10.047580317202115, + "grad_norm": 0.0026864979067658203, + "learning_rate": 2.5855471098769546e-06, + "loss": 0.0, + "step": 24707 + }, + { + "epoch": 10.047986986579911, + "grad_norm": 0.06365399771009206, + "learning_rate": 2.5848672851505176e-06, + "loss": 0.0006, + "step": 24708 + }, + { + "epoch": 10.048393655957707, + "grad_norm": 8.281414700840626, + "learning_rate": 2.5841875365441516e-06, + "loss": 0.1589, + "step": 24709 + }, + { + "epoch": 10.048800325335502, + "grad_norm": 0.0006454354028765946, + "learning_rate": 2.5835078640648324e-06, + "loss": 0.0, + "step": 24710 + }, + { + "epoch": 10.049206994713298, + "grad_norm": 0.042193708146252194, + "learning_rate": 2.5828282677195395e-06, + "loss": 0.0004, + "step": 24711 + }, + { + "epoch": 10.049613664091094, + "grad_norm": 0.2078928686375659, + "learning_rate": 2.582148747515245e-06, + "loss": 0.0028, + "step": 24712 + }, + { + "epoch": 10.05002033346889, + "grad_norm": 2.1004588460227627, + "learning_rate": 2.5814693034589287e-06, + "loss": 0.0177, + "step": 24713 + }, + { + "epoch": 10.050427002846686, + "grad_norm": 0.04014086816152888, + "learning_rate": 2.580789935557564e-06, + "loss": 0.0005, + "step": 24714 + }, + { + "epoch": 10.050833672224481, + "grad_norm": 0.022251245259332822, + "learning_rate": 2.58011064381812e-06, + "loss": 0.0003, + "step": 24715 + }, + { + "epoch": 10.051240341602277, + "grad_norm": 0.1081800588447707, + "learning_rate": 2.5794314282475787e-06, + "loss": 0.0007, + "step": 24716 + }, + { + "epoch": 10.051647010980073, + "grad_norm": 0.19319492633423266, + "learning_rate": 2.5787522888529083e-06, + "loss": 0.0024, + "step": 24717 + }, + { + "epoch": 10.052053680357869, + "grad_norm": 0.11488207958982487, + "learning_rate": 2.57807322564108e-06, + "loss": 0.0018, + "step": 24718 + }, + 
{ + "epoch": 10.052460349735664, + "grad_norm": 0.007765566408491853, + "learning_rate": 2.5773942386190665e-06, + "loss": 0.0, + "step": 24719 + }, + { + "epoch": 10.05286701911346, + "grad_norm": 0.07725461061067146, + "learning_rate": 2.5767153277938364e-06, + "loss": 0.0012, + "step": 24720 + }, + { + "epoch": 10.053273688491256, + "grad_norm": 0.004685556431233204, + "learning_rate": 2.5760364931723557e-06, + "loss": 0.0001, + "step": 24721 + }, + { + "epoch": 10.053680357869052, + "grad_norm": 0.0016783220182234263, + "learning_rate": 2.575357734761601e-06, + "loss": 0.0, + "step": 24722 + }, + { + "epoch": 10.054087027246847, + "grad_norm": 0.02695414289762568, + "learning_rate": 2.574679052568536e-06, + "loss": 0.0003, + "step": 24723 + }, + { + "epoch": 10.054493696624645, + "grad_norm": 0.09685146853222346, + "learning_rate": 2.5740004466001266e-06, + "loss": 0.0011, + "step": 24724 + }, + { + "epoch": 10.05490036600244, + "grad_norm": 1.5941297099836482, + "learning_rate": 2.5733219168633407e-06, + "loss": 0.0136, + "step": 24725 + }, + { + "epoch": 10.055307035380237, + "grad_norm": 0.07437835054663283, + "learning_rate": 2.572643463365143e-06, + "loss": 0.0005, + "step": 24726 + }, + { + "epoch": 10.055713704758032, + "grad_norm": 0.0030777535908862574, + "learning_rate": 2.5719650861124977e-06, + "loss": 0.0, + "step": 24727 + }, + { + "epoch": 10.056120374135828, + "grad_norm": 0.001958724135489462, + "learning_rate": 2.571286785112367e-06, + "loss": 0.0, + "step": 24728 + }, + { + "epoch": 10.056527043513624, + "grad_norm": 0.27941928964694057, + "learning_rate": 2.570608560371719e-06, + "loss": 0.0014, + "step": 24729 + }, + { + "epoch": 10.05693371289142, + "grad_norm": 1.121017020360689, + "learning_rate": 2.569930411897513e-06, + "loss": 0.0105, + "step": 24730 + }, + { + "epoch": 10.057340382269215, + "grad_norm": 1.3242762984159238, + "learning_rate": 2.569252339696712e-06, + "loss": 0.0174, + "step": 24731 + }, + { + "epoch": 10.057747051647011, + "grad_norm": 0.28297321867583797, + "learning_rate": 2.5685743437762744e-06, + "loss": 0.0021, + "step": 24732 + }, + { + "epoch": 10.058153721024807, + "grad_norm": 0.03645589412120527, + "learning_rate": 2.5678964241431614e-06, + "loss": 0.0003, + "step": 24733 + }, + { + "epoch": 10.058560390402603, + "grad_norm": 0.06515951525131114, + "learning_rate": 2.567218580804329e-06, + "loss": 0.0006, + "step": 24734 + }, + { + "epoch": 10.058967059780398, + "grad_norm": 27.78967932015744, + "learning_rate": 2.5665408137667423e-06, + "loss": 0.7846, + "step": 24735 + }, + { + "epoch": 10.059373729158194, + "grad_norm": 0.4618927761260623, + "learning_rate": 2.565863123037354e-06, + "loss": 0.004, + "step": 24736 + }, + { + "epoch": 10.05978039853599, + "grad_norm": 0.1662956397739169, + "learning_rate": 2.5651855086231236e-06, + "loss": 0.0012, + "step": 24737 + }, + { + "epoch": 10.060187067913786, + "grad_norm": 0.7988438139105818, + "learning_rate": 2.564507970531006e-06, + "loss": 0.0037, + "step": 24738 + }, + { + "epoch": 10.060593737291581, + "grad_norm": 0.16844308176023357, + "learning_rate": 2.563830508767955e-06, + "loss": 0.0019, + "step": 24739 + }, + { + "epoch": 10.061000406669377, + "grad_norm": 0.07665370810636891, + "learning_rate": 2.563153123340928e-06, + "loss": 0.0008, + "step": 24740 + }, + { + "epoch": 10.061407076047173, + "grad_norm": 0.08538067716200508, + "learning_rate": 2.562475814256876e-06, + "loss": 0.0016, + "step": 24741 + }, + { + "epoch": 10.061813745424969, + "grad_norm": 0.0513753379597687, 
+ "learning_rate": 2.561798581522753e-06, + "loss": 0.0005, + "step": 24742 + }, + { + "epoch": 10.062220414802765, + "grad_norm": 0.036469438539339954, + "learning_rate": 2.5611214251455106e-06, + "loss": 0.0003, + "step": 24743 + }, + { + "epoch": 10.062627084180562, + "grad_norm": 0.2189937087733073, + "learning_rate": 2.5604443451321015e-06, + "loss": 0.0016, + "step": 24744 + }, + { + "epoch": 10.063033753558358, + "grad_norm": 0.015082096529014788, + "learning_rate": 2.5597673414894754e-06, + "loss": 0.0002, + "step": 24745 + }, + { + "epoch": 10.063440422936154, + "grad_norm": 0.238655252821848, + "learning_rate": 2.5590904142245797e-06, + "loss": 0.0026, + "step": 24746 + }, + { + "epoch": 10.06384709231395, + "grad_norm": 0.005958991000318465, + "learning_rate": 2.5584135633443673e-06, + "loss": 0.0, + "step": 24747 + }, + { + "epoch": 10.064253761691745, + "grad_norm": 1.274919733668886, + "learning_rate": 2.5577367888557867e-06, + "loss": 0.0106, + "step": 24748 + }, + { + "epoch": 10.064660431069541, + "grad_norm": 0.04006296910159241, + "learning_rate": 2.557060090765783e-06, + "loss": 0.0005, + "step": 24749 + }, + { + "epoch": 10.065067100447337, + "grad_norm": 0.004187976392485011, + "learning_rate": 2.5563834690813027e-06, + "loss": 0.0, + "step": 24750 + }, + { + "epoch": 10.065473769825132, + "grad_norm": 0.05925258563483669, + "learning_rate": 2.5557069238092923e-06, + "loss": 0.0008, + "step": 24751 + }, + { + "epoch": 10.065880439202928, + "grad_norm": 0.6638065729834565, + "learning_rate": 2.555030454956695e-06, + "loss": 0.0065, + "step": 24752 + }, + { + "epoch": 10.066287108580724, + "grad_norm": 0.01074650655429502, + "learning_rate": 2.5543540625304595e-06, + "loss": 0.0002, + "step": 24753 + }, + { + "epoch": 10.06669377795852, + "grad_norm": 0.0022624148644335394, + "learning_rate": 2.5536777465375273e-06, + "loss": 0.0, + "step": 24754 + }, + { + "epoch": 10.067100447336315, + "grad_norm": 0.0008482168888374169, + "learning_rate": 2.5530015069848402e-06, + "loss": 0.0, + "step": 24755 + }, + { + "epoch": 10.067507116714111, + "grad_norm": 1.4289121774121785, + "learning_rate": 2.5523253438793403e-06, + "loss": 0.025, + "step": 24756 + }, + { + "epoch": 10.067913786091907, + "grad_norm": 2.8096846622537144, + "learning_rate": 2.5516492572279693e-06, + "loss": 0.0276, + "step": 24757 + }, + { + "epoch": 10.068320455469703, + "grad_norm": 0.022323108105187284, + "learning_rate": 2.550973247037668e-06, + "loss": 0.0002, + "step": 24758 + }, + { + "epoch": 10.068727124847499, + "grad_norm": 0.04674670338757551, + "learning_rate": 2.550297313315372e-06, + "loss": 0.0007, + "step": 24759 + }, + { + "epoch": 10.069133794225294, + "grad_norm": 3.267637714743573, + "learning_rate": 2.5496214560680257e-06, + "loss": 0.0473, + "step": 24760 + }, + { + "epoch": 10.06954046360309, + "grad_norm": 0.012482679724970937, + "learning_rate": 2.5489456753025653e-06, + "loss": 0.0001, + "step": 24761 + }, + { + "epoch": 10.069947132980886, + "grad_norm": 0.2249852753968916, + "learning_rate": 2.5482699710259284e-06, + "loss": 0.0033, + "step": 24762 + }, + { + "epoch": 10.070353802358682, + "grad_norm": 0.10293070955005586, + "learning_rate": 2.54759434324505e-06, + "loss": 0.0009, + "step": 24763 + }, + { + "epoch": 10.070760471736477, + "grad_norm": 0.03919617000546628, + "learning_rate": 2.546918791966866e-06, + "loss": 0.0005, + "step": 24764 + }, + { + "epoch": 10.071167141114275, + "grad_norm": 0.22837508096048612, + "learning_rate": 2.5462433171983092e-06, + "loss": 0.002, 
+ "step": 24765 + }, + { + "epoch": 10.07157381049207, + "grad_norm": 0.004661134256680478, + "learning_rate": 2.545567918946319e-06, + "loss": 0.0, + "step": 24766 + }, + { + "epoch": 10.071980479869866, + "grad_norm": 0.187288253802883, + "learning_rate": 2.544892597217826e-06, + "loss": 0.0024, + "step": 24767 + }, + { + "epoch": 10.072387149247662, + "grad_norm": 0.0055257454903832786, + "learning_rate": 2.544217352019762e-06, + "loss": 0.0001, + "step": 24768 + }, + { + "epoch": 10.072793818625458, + "grad_norm": 0.008385867339288082, + "learning_rate": 2.54354218335906e-06, + "loss": 0.0001, + "step": 24769 + }, + { + "epoch": 10.073200488003254, + "grad_norm": 0.060146138482815366, + "learning_rate": 2.54286709124265e-06, + "loss": 0.0006, + "step": 24770 + }, + { + "epoch": 10.07360715738105, + "grad_norm": 0.007724843612314078, + "learning_rate": 2.542192075677462e-06, + "loss": 0.0, + "step": 24771 + }, + { + "epoch": 10.074013826758845, + "grad_norm": 0.5675961042681429, + "learning_rate": 2.541517136670423e-06, + "loss": 0.0061, + "step": 24772 + }, + { + "epoch": 10.074420496136641, + "grad_norm": 0.006385765784936776, + "learning_rate": 2.5408422742284678e-06, + "loss": 0.0001, + "step": 24773 + }, + { + "epoch": 10.074827165514437, + "grad_norm": 0.09476552705713058, + "learning_rate": 2.5401674883585203e-06, + "loss": 0.0009, + "step": 24774 + }, + { + "epoch": 10.075233834892233, + "grad_norm": 0.197538768624234, + "learning_rate": 2.5394927790675083e-06, + "loss": 0.0016, + "step": 24775 + }, + { + "epoch": 10.075640504270028, + "grad_norm": 0.1103061802512285, + "learning_rate": 2.5388181463623574e-06, + "loss": 0.0012, + "step": 24776 + }, + { + "epoch": 10.076047173647824, + "grad_norm": 0.006092740121100408, + "learning_rate": 2.5381435902499927e-06, + "loss": 0.0001, + "step": 24777 + }, + { + "epoch": 10.07645384302562, + "grad_norm": 0.03412459248511524, + "learning_rate": 2.537469110737338e-06, + "loss": 0.0004, + "step": 24778 + }, + { + "epoch": 10.076860512403416, + "grad_norm": 0.016102175171575576, + "learning_rate": 2.5367947078313217e-06, + "loss": 0.0002, + "step": 24779 + }, + { + "epoch": 10.077267181781211, + "grad_norm": 0.02297755758800194, + "learning_rate": 2.5361203815388623e-06, + "loss": 0.0002, + "step": 24780 + }, + { + "epoch": 10.077673851159007, + "grad_norm": 0.15791501281623635, + "learning_rate": 2.5354461318668845e-06, + "loss": 0.0019, + "step": 24781 + }, + { + "epoch": 10.078080520536803, + "grad_norm": 2.3377537119602727, + "learning_rate": 2.5347719588223097e-06, + "loss": 0.0276, + "step": 24782 + }, + { + "epoch": 10.078487189914599, + "grad_norm": 0.11567749431771734, + "learning_rate": 2.534097862412057e-06, + "loss": 0.0011, + "step": 24783 + }, + { + "epoch": 10.078893859292394, + "grad_norm": 0.11983886788486076, + "learning_rate": 2.533423842643048e-06, + "loss": 0.001, + "step": 24784 + }, + { + "epoch": 10.079300528670192, + "grad_norm": 0.008397429639284222, + "learning_rate": 2.532749899522197e-06, + "loss": 0.0001, + "step": 24785 + }, + { + "epoch": 10.079707198047988, + "grad_norm": 0.09812154993497356, + "learning_rate": 2.53207603305643e-06, + "loss": 0.001, + "step": 24786 + }, + { + "epoch": 10.080113867425784, + "grad_norm": 0.003157464782497936, + "learning_rate": 2.531402243252662e-06, + "loss": 0.0, + "step": 24787 + }, + { + "epoch": 10.08052053680358, + "grad_norm": 0.9421363853336393, + "learning_rate": 2.5307285301178076e-06, + "loss": 0.005, + "step": 24788 + }, + { + "epoch": 10.080927206181375, + 
"grad_norm": 0.013304067806418764, + "learning_rate": 2.5300548936587844e-06, + "loss": 0.0002, + "step": 24789 + }, + { + "epoch": 10.08133387555917, + "grad_norm": 4.927387951116904, + "learning_rate": 2.529381333882507e-06, + "loss": 0.0476, + "step": 24790 + }, + { + "epoch": 10.081740544936967, + "grad_norm": 8.13548432732412, + "learning_rate": 2.5287078507958907e-06, + "loss": 0.3409, + "step": 24791 + }, + { + "epoch": 10.082147214314762, + "grad_norm": 0.03890051014056921, + "learning_rate": 2.5280344444058446e-06, + "loss": 0.0004, + "step": 24792 + }, + { + "epoch": 10.082553883692558, + "grad_norm": 0.038956545744270286, + "learning_rate": 2.52736111471929e-06, + "loss": 0.0005, + "step": 24793 + }, + { + "epoch": 10.082960553070354, + "grad_norm": 0.04236776101656982, + "learning_rate": 2.5266878617431336e-06, + "loss": 0.0005, + "step": 24794 + }, + { + "epoch": 10.08336722244815, + "grad_norm": 0.053433840241953495, + "learning_rate": 2.526014685484287e-06, + "loss": 0.0005, + "step": 24795 + }, + { + "epoch": 10.083773891825945, + "grad_norm": 0.00993130584605906, + "learning_rate": 2.525341585949662e-06, + "loss": 0.0001, + "step": 24796 + }, + { + "epoch": 10.084180561203741, + "grad_norm": 0.3708712860757488, + "learning_rate": 2.5246685631461667e-06, + "loss": 0.002, + "step": 24797 + }, + { + "epoch": 10.084587230581537, + "grad_norm": 0.1878041837690546, + "learning_rate": 2.523995617080709e-06, + "loss": 0.0016, + "step": 24798 + }, + { + "epoch": 10.084993899959333, + "grad_norm": 0.019651151366694722, + "learning_rate": 2.5233227477602007e-06, + "loss": 0.0001, + "step": 24799 + }, + { + "epoch": 10.085400569337128, + "grad_norm": 0.9799297653575473, + "learning_rate": 2.5226499551915483e-06, + "loss": 0.0082, + "step": 24800 + }, + { + "epoch": 10.085807238714924, + "grad_norm": 4.068403464457051, + "learning_rate": 2.521977239381657e-06, + "loss": 0.0179, + "step": 24801 + }, + { + "epoch": 10.08621390809272, + "grad_norm": 0.013360760944723242, + "learning_rate": 2.521304600337433e-06, + "loss": 0.0001, + "step": 24802 + }, + { + "epoch": 10.086620577470516, + "grad_norm": 0.08988087405480041, + "learning_rate": 2.520632038065781e-06, + "loss": 0.0008, + "step": 24803 + }, + { + "epoch": 10.087027246848312, + "grad_norm": 0.003672565566034987, + "learning_rate": 2.5199595525736055e-06, + "loss": 0.0, + "step": 24804 + }, + { + "epoch": 10.087433916226107, + "grad_norm": 0.005362178432669624, + "learning_rate": 2.5192871438678067e-06, + "loss": 0.0001, + "step": 24805 + }, + { + "epoch": 10.087840585603905, + "grad_norm": 0.0274950675409055, + "learning_rate": 2.5186148119552943e-06, + "loss": 0.0004, + "step": 24806 + }, + { + "epoch": 10.0882472549817, + "grad_norm": 0.4881148140444498, + "learning_rate": 2.517942556842965e-06, + "loss": 0.0054, + "step": 24807 + }, + { + "epoch": 10.088653924359496, + "grad_norm": 0.4099024659160801, + "learning_rate": 2.517270378537722e-06, + "loss": 0.0041, + "step": 24808 + }, + { + "epoch": 10.089060593737292, + "grad_norm": 0.7374908787839909, + "learning_rate": 2.5165982770464637e-06, + "loss": 0.0026, + "step": 24809 + }, + { + "epoch": 10.089467263115088, + "grad_norm": 0.014814834173045743, + "learning_rate": 2.515926252376091e-06, + "loss": 0.0001, + "step": 24810 + }, + { + "epoch": 10.089873932492884, + "grad_norm": 0.031824625441655444, + "learning_rate": 2.5152543045334987e-06, + "loss": 0.0003, + "step": 24811 + }, + { + "epoch": 10.09028060187068, + "grad_norm": 0.27330299144049985, + "learning_rate": 
2.514582433525591e-06, + "loss": 0.0025, + "step": 24812 + }, + { + "epoch": 10.090687271248475, + "grad_norm": 0.6484993724144412, + "learning_rate": 2.5139106393592618e-06, + "loss": 0.0046, + "step": 24813 + }, + { + "epoch": 10.091093940626271, + "grad_norm": 0.011014287683569508, + "learning_rate": 2.513238922041408e-06, + "loss": 0.0001, + "step": 24814 + }, + { + "epoch": 10.091500610004067, + "grad_norm": 0.030407061363419995, + "learning_rate": 2.512567281578924e-06, + "loss": 0.0004, + "step": 24815 + }, + { + "epoch": 10.091907279381862, + "grad_norm": 0.09519589756267523, + "learning_rate": 2.511895717978705e-06, + "loss": 0.0006, + "step": 24816 + }, + { + "epoch": 10.092313948759658, + "grad_norm": 0.08597524931212122, + "learning_rate": 2.5112242312476454e-06, + "loss": 0.0007, + "step": 24817 + }, + { + "epoch": 10.092720618137454, + "grad_norm": 0.4895974628740003, + "learning_rate": 2.5105528213926354e-06, + "loss": 0.0051, + "step": 24818 + }, + { + "epoch": 10.09312728751525, + "grad_norm": 0.000842193033513831, + "learning_rate": 2.5098814884205735e-06, + "loss": 0.0, + "step": 24819 + }, + { + "epoch": 10.093533956893046, + "grad_norm": 0.027952869497255176, + "learning_rate": 2.509210232338347e-06, + "loss": 0.0003, + "step": 24820 + }, + { + "epoch": 10.093940626270841, + "grad_norm": 0.0597427921440538, + "learning_rate": 2.5085390531528474e-06, + "loss": 0.0007, + "step": 24821 + }, + { + "epoch": 10.094347295648637, + "grad_norm": 1.1373605235814555, + "learning_rate": 2.5078679508709656e-06, + "loss": 0.0109, + "step": 24822 + }, + { + "epoch": 10.094753965026433, + "grad_norm": 0.010291086957741606, + "learning_rate": 2.50719692549959e-06, + "loss": 0.0001, + "step": 24823 + }, + { + "epoch": 10.095160634404229, + "grad_norm": 0.0028236987996989715, + "learning_rate": 2.5065259770456053e-06, + "loss": 0.0, + "step": 24824 + }, + { + "epoch": 10.095567303782024, + "grad_norm": 0.7838624533599267, + "learning_rate": 2.5058551055159054e-06, + "loss": 0.0067, + "step": 24825 + }, + { + "epoch": 10.095973973159822, + "grad_norm": 0.011652119712690692, + "learning_rate": 2.505184310917376e-06, + "loss": 0.0002, + "step": 24826 + }, + { + "epoch": 10.096380642537618, + "grad_norm": 0.0107162140058917, + "learning_rate": 2.504513593256901e-06, + "loss": 0.0001, + "step": 24827 + }, + { + "epoch": 10.096787311915413, + "grad_norm": 0.03194382911090601, + "learning_rate": 2.5038429525413655e-06, + "loss": 0.0003, + "step": 24828 + }, + { + "epoch": 10.09719398129321, + "grad_norm": 0.02474195411241846, + "learning_rate": 2.503172388777656e-06, + "loss": 0.0003, + "step": 24829 + }, + { + "epoch": 10.097600650671005, + "grad_norm": 0.04916435385017417, + "learning_rate": 2.502501901972655e-06, + "loss": 0.0006, + "step": 24830 + }, + { + "epoch": 10.0980073200488, + "grad_norm": 0.10653577015773452, + "learning_rate": 2.501831492133242e-06, + "loss": 0.0011, + "step": 24831 + }, + { + "epoch": 10.098413989426597, + "grad_norm": 0.11753015315252949, + "learning_rate": 2.5011611592663056e-06, + "loss": 0.0012, + "step": 24832 + }, + { + "epoch": 10.098820658804392, + "grad_norm": 0.5856928893205077, + "learning_rate": 2.500490903378724e-06, + "loss": 0.0073, + "step": 24833 + }, + { + "epoch": 10.099227328182188, + "grad_norm": 0.10740938989111302, + "learning_rate": 2.4998207244773776e-06, + "loss": 0.0012, + "step": 24834 + }, + { + "epoch": 10.099633997559984, + "grad_norm": 1.5957110070017277, + "learning_rate": 2.4991506225691464e-06, + "loss": 0.0142, + "step": 24835 
+ }, + { + "epoch": 10.10004066693778, + "grad_norm": 0.08257769346085657, + "learning_rate": 2.498480597660908e-06, + "loss": 0.0007, + "step": 24836 + }, + { + "epoch": 10.100447336315575, + "grad_norm": 0.011551807531475928, + "learning_rate": 2.49781064975954e-06, + "loss": 0.0001, + "step": 24837 + }, + { + "epoch": 10.100854005693371, + "grad_norm": 0.030374988302289994, + "learning_rate": 2.497140778871925e-06, + "loss": 0.0003, + "step": 24838 + }, + { + "epoch": 10.101260675071167, + "grad_norm": 0.1291374815179765, + "learning_rate": 2.496470985004937e-06, + "loss": 0.0014, + "step": 24839 + }, + { + "epoch": 10.101667344448963, + "grad_norm": 0.034985423942501946, + "learning_rate": 2.49580126816545e-06, + "loss": 0.0003, + "step": 24840 + }, + { + "epoch": 10.102074013826758, + "grad_norm": 0.03629983172362047, + "learning_rate": 2.495131628360339e-06, + "loss": 0.0003, + "step": 24841 + }, + { + "epoch": 10.102480683204554, + "grad_norm": 0.002395746236081526, + "learning_rate": 2.494462065596476e-06, + "loss": 0.0, + "step": 24842 + }, + { + "epoch": 10.10288735258235, + "grad_norm": 0.01818912769963339, + "learning_rate": 2.4937925798807394e-06, + "loss": 0.0003, + "step": 24843 + }, + { + "epoch": 10.103294021960146, + "grad_norm": 0.009606403928308627, + "learning_rate": 2.49312317122e-06, + "loss": 0.0001, + "step": 24844 + }, + { + "epoch": 10.103700691337941, + "grad_norm": 0.1037523480225528, + "learning_rate": 2.4924538396211308e-06, + "loss": 0.0008, + "step": 24845 + }, + { + "epoch": 10.104107360715737, + "grad_norm": 0.012188609835558752, + "learning_rate": 2.4917845850909996e-06, + "loss": 0.0001, + "step": 24846 + }, + { + "epoch": 10.104514030093535, + "grad_norm": 0.2539195873925252, + "learning_rate": 2.4911154076364795e-06, + "loss": 0.0033, + "step": 24847 + }, + { + "epoch": 10.10492069947133, + "grad_norm": 0.07159929224518247, + "learning_rate": 2.4904463072644393e-06, + "loss": 0.0006, + "step": 24848 + }, + { + "epoch": 10.105327368849126, + "grad_norm": 0.8743092732595641, + "learning_rate": 2.4897772839817423e-06, + "loss": 0.0095, + "step": 24849 + }, + { + "epoch": 10.105734038226922, + "grad_norm": 0.013144402847066314, + "learning_rate": 2.489108337795266e-06, + "loss": 0.0002, + "step": 24850 + }, + { + "epoch": 10.106140707604718, + "grad_norm": 0.012863189424711603, + "learning_rate": 2.488439468711873e-06, + "loss": 0.0001, + "step": 24851 + }, + { + "epoch": 10.106547376982514, + "grad_norm": 0.015629832656972212, + "learning_rate": 2.487770676738428e-06, + "loss": 0.0001, + "step": 24852 + }, + { + "epoch": 10.10695404636031, + "grad_norm": 0.2636821322217579, + "learning_rate": 2.4871019618817982e-06, + "loss": 0.0018, + "step": 24853 + }, + { + "epoch": 10.107360715738105, + "grad_norm": 0.091324811394497, + "learning_rate": 2.4864333241488493e-06, + "loss": 0.0011, + "step": 24854 + }, + { + "epoch": 10.107767385115901, + "grad_norm": 0.003860430524737374, + "learning_rate": 2.485764763546439e-06, + "loss": 0.0, + "step": 24855 + }, + { + "epoch": 10.108174054493697, + "grad_norm": 0.18744115688345092, + "learning_rate": 2.4850962800814395e-06, + "loss": 0.0024, + "step": 24856 + }, + { + "epoch": 10.108580723871492, + "grad_norm": 0.03880251909643261, + "learning_rate": 2.4844278737607077e-06, + "loss": 0.0003, + "step": 24857 + }, + { + "epoch": 10.108987393249288, + "grad_norm": 0.030873043677747757, + "learning_rate": 2.4837595445911066e-06, + "loss": 0.0004, + "step": 24858 + }, + { + "epoch": 10.109394062627084, + "grad_norm": 
0.09668650587664814, + "learning_rate": 2.483091292579498e-06, + "loss": 0.001, + "step": 24859 + }, + { + "epoch": 10.10980073200488, + "grad_norm": 1.7272317657786593, + "learning_rate": 2.4824231177327386e-06, + "loss": 0.0238, + "step": 24860 + }, + { + "epoch": 10.110207401382675, + "grad_norm": 0.4442685995848421, + "learning_rate": 2.48175502005769e-06, + "loss": 0.0027, + "step": 24861 + }, + { + "epoch": 10.110614070760471, + "grad_norm": 0.030051140248294916, + "learning_rate": 2.4810869995612074e-06, + "loss": 0.0003, + "step": 24862 + }, + { + "epoch": 10.111020740138267, + "grad_norm": 0.1846848712532631, + "learning_rate": 2.480419056250154e-06, + "loss": 0.0018, + "step": 24863 + }, + { + "epoch": 10.111427409516063, + "grad_norm": 2.1167857561466845, + "learning_rate": 2.479751190131383e-06, + "loss": 0.0393, + "step": 24864 + }, + { + "epoch": 10.111834078893859, + "grad_norm": 0.46102646999597763, + "learning_rate": 2.4790834012117514e-06, + "loss": 0.0028, + "step": 24865 + }, + { + "epoch": 10.112240748271654, + "grad_norm": 0.17633576443131124, + "learning_rate": 2.4784156894981137e-06, + "loss": 0.002, + "step": 24866 + }, + { + "epoch": 10.112647417649452, + "grad_norm": 0.22558877242004063, + "learning_rate": 2.477748054997324e-06, + "loss": 0.0016, + "step": 24867 + }, + { + "epoch": 10.113054087027248, + "grad_norm": 0.0005451839250246789, + "learning_rate": 2.4770804977162344e-06, + "loss": 0.0, + "step": 24868 + }, + { + "epoch": 10.113460756405043, + "grad_norm": 0.09758148592009874, + "learning_rate": 2.476413017661702e-06, + "loss": 0.0008, + "step": 24869 + }, + { + "epoch": 10.11386742578284, + "grad_norm": 0.47825597310239837, + "learning_rate": 2.475745614840577e-06, + "loss": 0.0041, + "step": 24870 + }, + { + "epoch": 10.114274095160635, + "grad_norm": 0.004673845297124245, + "learning_rate": 2.4750782892597093e-06, + "loss": 0.0, + "step": 24871 + }, + { + "epoch": 10.11468076453843, + "grad_norm": 0.010491823823157167, + "learning_rate": 2.4744110409259504e-06, + "loss": 0.0001, + "step": 24872 + }, + { + "epoch": 10.115087433916226, + "grad_norm": 0.026867366776988498, + "learning_rate": 2.4737438698461503e-06, + "loss": 0.0003, + "step": 24873 + }, + { + "epoch": 10.115494103294022, + "grad_norm": 0.01851817463548572, + "learning_rate": 2.473076776027157e-06, + "loss": 0.0002, + "step": 24874 + }, + { + "epoch": 10.115900772671818, + "grad_norm": 0.048396692476027375, + "learning_rate": 2.4724097594758154e-06, + "loss": 0.0003, + "step": 24875 + }, + { + "epoch": 10.116307442049614, + "grad_norm": 3.9799580152199043, + "learning_rate": 2.47174282019898e-06, + "loss": 0.0428, + "step": 24876 + }, + { + "epoch": 10.11671411142741, + "grad_norm": 0.001831947179876858, + "learning_rate": 2.4710759582034926e-06, + "loss": 0.0, + "step": 24877 + }, + { + "epoch": 10.117120780805205, + "grad_norm": 0.019802368014470086, + "learning_rate": 2.4704091734961998e-06, + "loss": 0.0002, + "step": 24878 + }, + { + "epoch": 10.117527450183001, + "grad_norm": 0.03157361781083089, + "learning_rate": 2.4697424660839477e-06, + "loss": 0.0003, + "step": 24879 + }, + { + "epoch": 10.117934119560797, + "grad_norm": 0.015546598921625042, + "learning_rate": 2.469075835973578e-06, + "loss": 0.0002, + "step": 24880 + }, + { + "epoch": 10.118340788938593, + "grad_norm": 2.033854474585816, + "learning_rate": 2.468409283171932e-06, + "loss": 0.0089, + "step": 24881 + }, + { + "epoch": 10.118747458316388, + "grad_norm": 0.1048740833603152, + "learning_rate": 
2.467742807685858e-06, + "loss": 0.001, + "step": 24882 + }, + { + "epoch": 10.119154127694184, + "grad_norm": 0.0011015266026999694, + "learning_rate": 2.4670764095221953e-06, + "loss": 0.0, + "step": 24883 + }, + { + "epoch": 10.11956079707198, + "grad_norm": 0.03677612084039935, + "learning_rate": 2.4664100886877852e-06, + "loss": 0.0003, + "step": 24884 + }, + { + "epoch": 10.119967466449776, + "grad_norm": 1.3374056364727815, + "learning_rate": 2.4657438451894657e-06, + "loss": 0.0103, + "step": 24885 + }, + { + "epoch": 10.120374135827571, + "grad_norm": 1.1724515927804968, + "learning_rate": 2.465077679034078e-06, + "loss": 0.0113, + "step": 24886 + }, + { + "epoch": 10.120780805205367, + "grad_norm": 0.17949467021009907, + "learning_rate": 2.4644115902284617e-06, + "loss": 0.0019, + "step": 24887 + }, + { + "epoch": 10.121187474583165, + "grad_norm": 0.2975080861725737, + "learning_rate": 2.4637455787794484e-06, + "loss": 0.0043, + "step": 24888 + }, + { + "epoch": 10.12159414396096, + "grad_norm": 0.0025048736226851525, + "learning_rate": 2.463079644693883e-06, + "loss": 0.0, + "step": 24889 + }, + { + "epoch": 10.122000813338756, + "grad_norm": 0.030621983905274555, + "learning_rate": 2.462413787978598e-06, + "loss": 0.0002, + "step": 24890 + }, + { + "epoch": 10.122407482716552, + "grad_norm": 0.0038158773437712012, + "learning_rate": 2.46174800864043e-06, + "loss": 0.0, + "step": 24891 + }, + { + "epoch": 10.122814152094348, + "grad_norm": 0.00035796364303498213, + "learning_rate": 2.4610823066862123e-06, + "loss": 0.0, + "step": 24892 + }, + { + "epoch": 10.123220821472144, + "grad_norm": 0.11990963523503723, + "learning_rate": 2.460416682122778e-06, + "loss": 0.0013, + "step": 24893 + }, + { + "epoch": 10.12362749084994, + "grad_norm": 0.0039412096621649255, + "learning_rate": 2.4597511349569614e-06, + "loss": 0.0001, + "step": 24894 + }, + { + "epoch": 10.124034160227735, + "grad_norm": 0.005978840996295798, + "learning_rate": 2.459085665195592e-06, + "loss": 0.0001, + "step": 24895 + }, + { + "epoch": 10.12444082960553, + "grad_norm": 0.037001177866914, + "learning_rate": 2.4584202728455065e-06, + "loss": 0.0004, + "step": 24896 + }, + { + "epoch": 10.124847498983327, + "grad_norm": 7.985175246617419, + "learning_rate": 2.4577549579135318e-06, + "loss": 0.1027, + "step": 24897 + }, + { + "epoch": 10.125254168361122, + "grad_norm": 0.00677231044052864, + "learning_rate": 2.4570897204064993e-06, + "loss": 0.0001, + "step": 24898 + }, + { + "epoch": 10.125660837738918, + "grad_norm": 0.04296656907855186, + "learning_rate": 2.4564245603312365e-06, + "loss": 0.0003, + "step": 24899 + }, + { + "epoch": 10.126067507116714, + "grad_norm": 0.007439776677351939, + "learning_rate": 2.4557594776945727e-06, + "loss": 0.0, + "step": 24900 + }, + { + "epoch": 10.12647417649451, + "grad_norm": 0.08391415238822564, + "learning_rate": 2.455094472503331e-06, + "loss": 0.0004, + "step": 24901 + }, + { + "epoch": 10.126880845872305, + "grad_norm": 0.038635040298808515, + "learning_rate": 2.454429544764345e-06, + "loss": 0.0003, + "step": 24902 + }, + { + "epoch": 10.127287515250101, + "grad_norm": 3.7972840946765873, + "learning_rate": 2.4537646944844374e-06, + "loss": 0.0299, + "step": 24903 + }, + { + "epoch": 10.127694184627897, + "grad_norm": 0.022511409283247883, + "learning_rate": 2.4530999216704332e-06, + "loss": 0.0003, + "step": 24904 + }, + { + "epoch": 10.128100854005693, + "grad_norm": 0.20251289649707713, + "learning_rate": 2.452435226329157e-06, + "loss": 0.0025, + "step": 24905 + 
}, + { + "epoch": 10.128507523383488, + "grad_norm": 0.011436992624488062, + "learning_rate": 2.451770608467432e-06, + "loss": 0.0001, + "step": 24906 + }, + { + "epoch": 10.128914192761284, + "grad_norm": 0.11500441671669646, + "learning_rate": 2.4511060680920794e-06, + "loss": 0.0014, + "step": 24907 + }, + { + "epoch": 10.129320862139082, + "grad_norm": 0.012393489569160816, + "learning_rate": 2.45044160520992e-06, + "loss": 0.0002, + "step": 24908 + }, + { + "epoch": 10.129727531516878, + "grad_norm": 0.20419491155243613, + "learning_rate": 2.44977721982778e-06, + "loss": 0.0027, + "step": 24909 + }, + { + "epoch": 10.130134200894673, + "grad_norm": 0.2957378230286481, + "learning_rate": 2.449112911952477e-06, + "loss": 0.0016, + "step": 24910 + }, + { + "epoch": 10.130540870272469, + "grad_norm": 0.33383848128846105, + "learning_rate": 2.4484486815908305e-06, + "loss": 0.003, + "step": 24911 + }, + { + "epoch": 10.130947539650265, + "grad_norm": 0.03207746066072412, + "learning_rate": 2.4477845287496583e-06, + "loss": 0.0003, + "step": 24912 + }, + { + "epoch": 10.13135420902806, + "grad_norm": 0.16704731366695258, + "learning_rate": 2.4471204534357796e-06, + "loss": 0.0014, + "step": 24913 + }, + { + "epoch": 10.131760878405856, + "grad_norm": 0.2928192275425057, + "learning_rate": 2.4464564556560066e-06, + "loss": 0.0027, + "step": 24914 + }, + { + "epoch": 10.132167547783652, + "grad_norm": 0.0021550466649505125, + "learning_rate": 2.4457925354171642e-06, + "loss": 0.0, + "step": 24915 + }, + { + "epoch": 10.132574217161448, + "grad_norm": 0.02112376820120186, + "learning_rate": 2.445128692726062e-06, + "loss": 0.0002, + "step": 24916 + }, + { + "epoch": 10.132980886539244, + "grad_norm": 0.6855471980176169, + "learning_rate": 2.444464927589517e-06, + "loss": 0.0072, + "step": 24917 + }, + { + "epoch": 10.13338755591704, + "grad_norm": 0.025712338290837286, + "learning_rate": 2.443801240014342e-06, + "loss": 0.0003, + "step": 24918 + }, + { + "epoch": 10.133794225294835, + "grad_norm": 0.3475824991140952, + "learning_rate": 2.4431376300073496e-06, + "loss": 0.0032, + "step": 24919 + }, + { + "epoch": 10.134200894672631, + "grad_norm": 0.05186046526110964, + "learning_rate": 2.4424740975753525e-06, + "loss": 0.0005, + "step": 24920 + }, + { + "epoch": 10.134607564050427, + "grad_norm": 0.022411441861745908, + "learning_rate": 2.441810642725161e-06, + "loss": 0.0003, + "step": 24921 + }, + { + "epoch": 10.135014233428222, + "grad_norm": 0.007282583468944563, + "learning_rate": 2.441147265463588e-06, + "loss": 0.0, + "step": 24922 + }, + { + "epoch": 10.135420902806018, + "grad_norm": 0.8506637049357746, + "learning_rate": 2.4404839657974446e-06, + "loss": 0.0083, + "step": 24923 + }, + { + "epoch": 10.135827572183814, + "grad_norm": 0.16429080998021903, + "learning_rate": 2.4398207437335363e-06, + "loss": 0.0016, + "step": 24924 + }, + { + "epoch": 10.13623424156161, + "grad_norm": 0.08032619548083642, + "learning_rate": 2.439157599278674e-06, + "loss": 0.0005, + "step": 24925 + }, + { + "epoch": 10.136640910939406, + "grad_norm": 0.019808284089805797, + "learning_rate": 2.438494532439665e-06, + "loss": 0.0002, + "step": 24926 + }, + { + "epoch": 10.137047580317201, + "grad_norm": 0.004653099816835791, + "learning_rate": 2.4378315432233113e-06, + "loss": 0.0, + "step": 24927 + }, + { + "epoch": 10.137454249694997, + "grad_norm": 0.21777533284595252, + "learning_rate": 2.437168631636425e-06, + "loss": 0.0032, + "step": 24928 + }, + { + "epoch": 10.137860919072795, + "grad_norm": 
0.05783525514869792, + "learning_rate": 2.4365057976858096e-06, + "loss": 0.0004, + "step": 24929 + }, + { + "epoch": 10.13826758845059, + "grad_norm": 0.25338396434795174, + "learning_rate": 2.4358430413782695e-06, + "loss": 0.0041, + "step": 24930 + }, + { + "epoch": 10.138674257828386, + "grad_norm": 0.23919152846144504, + "learning_rate": 2.435180362720607e-06, + "loss": 0.0028, + "step": 24931 + }, + { + "epoch": 10.139080927206182, + "grad_norm": 0.007165982151593434, + "learning_rate": 2.4345177617196257e-06, + "loss": 0.0, + "step": 24932 + }, + { + "epoch": 10.139487596583978, + "grad_norm": 0.5903636874251774, + "learning_rate": 2.4338552383821268e-06, + "loss": 0.0055, + "step": 24933 + }, + { + "epoch": 10.139894265961773, + "grad_norm": 0.10751647477352161, + "learning_rate": 2.433192792714909e-06, + "loss": 0.0013, + "step": 24934 + }, + { + "epoch": 10.14030093533957, + "grad_norm": 0.00724644863062409, + "learning_rate": 2.4325304247247784e-06, + "loss": 0.0001, + "step": 24935 + }, + { + "epoch": 10.140707604717365, + "grad_norm": 0.06248689989522821, + "learning_rate": 2.4318681344185315e-06, + "loss": 0.0004, + "step": 24936 + }, + { + "epoch": 10.14111427409516, + "grad_norm": 0.04840848552278339, + "learning_rate": 2.4312059218029672e-06, + "loss": 0.0004, + "step": 24937 + }, + { + "epoch": 10.141520943472957, + "grad_norm": 0.07029332132645237, + "learning_rate": 2.4305437868848835e-06, + "loss": 0.0004, + "step": 24938 + }, + { + "epoch": 10.141927612850752, + "grad_norm": 2.4694866209612005, + "learning_rate": 2.4298817296710774e-06, + "loss": 0.0185, + "step": 24939 + }, + { + "epoch": 10.142334282228548, + "grad_norm": 0.0028901910112177527, + "learning_rate": 2.4292197501683455e-06, + "loss": 0.0, + "step": 24940 + }, + { + "epoch": 10.142740951606344, + "grad_norm": 0.17012589508804649, + "learning_rate": 2.428557848383484e-06, + "loss": 0.0015, + "step": 24941 + }, + { + "epoch": 10.14314762098414, + "grad_norm": 0.0931752042070216, + "learning_rate": 2.427896024323285e-06, + "loss": 0.0012, + "step": 24942 + }, + { + "epoch": 10.143554290361935, + "grad_norm": 0.35625472072357856, + "learning_rate": 2.427234277994546e-06, + "loss": 0.0041, + "step": 24943 + }, + { + "epoch": 10.143960959739731, + "grad_norm": 0.09113497091728717, + "learning_rate": 2.4265726094040566e-06, + "loss": 0.001, + "step": 24944 + }, + { + "epoch": 10.144367629117527, + "grad_norm": 7.738587747441703, + "learning_rate": 2.4259110185586088e-06, + "loss": 0.2229, + "step": 24945 + }, + { + "epoch": 10.144774298495323, + "grad_norm": 0.03461689309084255, + "learning_rate": 2.4252495054649995e-06, + "loss": 0.0003, + "step": 24946 + }, + { + "epoch": 10.145180967873118, + "grad_norm": 0.02278913047494829, + "learning_rate": 2.424588070130017e-06, + "loss": 0.0002, + "step": 24947 + }, + { + "epoch": 10.145587637250914, + "grad_norm": 0.0013294689696349258, + "learning_rate": 2.423926712560449e-06, + "loss": 0.0, + "step": 24948 + }, + { + "epoch": 10.145994306628712, + "grad_norm": 1.9719205073531398, + "learning_rate": 2.423265432763088e-06, + "loss": 0.02, + "step": 24949 + }, + { + "epoch": 10.146400976006507, + "grad_norm": 0.007264564147528383, + "learning_rate": 2.4226042307447196e-06, + "loss": 0.0001, + "step": 24950 + }, + { + "epoch": 10.146807645384303, + "grad_norm": 0.029309011067816724, + "learning_rate": 2.4219431065121326e-06, + "loss": 0.0004, + "step": 24951 + }, + { + "epoch": 10.147214314762099, + "grad_norm": 0.0019927912090723352, + "learning_rate": 
2.421282060072111e-06, + "loss": 0.0, + "step": 24952 + }, + { + "epoch": 10.147620984139895, + "grad_norm": 0.020549007896545277, + "learning_rate": 2.4206210914314453e-06, + "loss": 0.0001, + "step": 24953 + }, + { + "epoch": 10.14802765351769, + "grad_norm": 0.0008933550278582754, + "learning_rate": 2.419960200596919e-06, + "loss": 0.0, + "step": 24954 + }, + { + "epoch": 10.148434322895486, + "grad_norm": 0.2624289095732205, + "learning_rate": 2.419299387575317e-06, + "loss": 0.003, + "step": 24955 + }, + { + "epoch": 10.148840992273282, + "grad_norm": 0.04111971602465745, + "learning_rate": 2.4186386523734217e-06, + "loss": 0.0003, + "step": 24956 + }, + { + "epoch": 10.149247661651078, + "grad_norm": 0.01668001517864463, + "learning_rate": 2.4179779949980165e-06, + "loss": 0.0001, + "step": 24957 + }, + { + "epoch": 10.149654331028874, + "grad_norm": 0.008463286278007154, + "learning_rate": 2.41731741545588e-06, + "loss": 0.0001, + "step": 24958 + }, + { + "epoch": 10.15006100040667, + "grad_norm": 0.0012756227982104217, + "learning_rate": 2.4166569137537987e-06, + "loss": 0.0, + "step": 24959 + }, + { + "epoch": 10.150467669784465, + "grad_norm": 0.01788460128461659, + "learning_rate": 2.415996489898551e-06, + "loss": 0.0003, + "step": 24960 + }, + { + "epoch": 10.150874339162261, + "grad_norm": 0.5644242146663259, + "learning_rate": 2.4153361438969167e-06, + "loss": 0.0055, + "step": 24961 + }, + { + "epoch": 10.151281008540057, + "grad_norm": 0.5961236259420692, + "learning_rate": 2.4146758757556743e-06, + "loss": 0.0051, + "step": 24962 + }, + { + "epoch": 10.151687677917852, + "grad_norm": 0.0009568453746390447, + "learning_rate": 2.414015685481601e-06, + "loss": 0.0, + "step": 24963 + }, + { + "epoch": 10.152094347295648, + "grad_norm": 0.044388553927038346, + "learning_rate": 2.413355573081475e-06, + "loss": 0.0006, + "step": 24964 + }, + { + "epoch": 10.152501016673444, + "grad_norm": 0.032065144253777836, + "learning_rate": 2.41269553856207e-06, + "loss": 0.0004, + "step": 24965 + }, + { + "epoch": 10.15290768605124, + "grad_norm": 0.5168176916741382, + "learning_rate": 2.4120355819301664e-06, + "loss": 0.0054, + "step": 24966 + }, + { + "epoch": 10.153314355429035, + "grad_norm": 0.013655583499567761, + "learning_rate": 2.4113757031925365e-06, + "loss": 0.0001, + "step": 24967 + }, + { + "epoch": 10.153721024806831, + "grad_norm": 0.0034556662082122452, + "learning_rate": 2.4107159023559545e-06, + "loss": 0.0, + "step": 24968 + }, + { + "epoch": 10.154127694184627, + "grad_norm": 0.7681537809315535, + "learning_rate": 2.4100561794271926e-06, + "loss": 0.0063, + "step": 24969 + }, + { + "epoch": 10.154534363562425, + "grad_norm": 1.2064052447075728, + "learning_rate": 2.4093965344130244e-06, + "loss": 0.0149, + "step": 24970 + }, + { + "epoch": 10.15494103294022, + "grad_norm": 0.16675899995142301, + "learning_rate": 2.408736967320219e-06, + "loss": 0.0016, + "step": 24971 + }, + { + "epoch": 10.155347702318016, + "grad_norm": 0.02022773020596593, + "learning_rate": 2.4080774781555505e-06, + "loss": 0.0002, + "step": 24972 + }, + { + "epoch": 10.155754371695812, + "grad_norm": 0.08941428474489566, + "learning_rate": 2.4074180669257885e-06, + "loss": 0.0007, + "step": 24973 + }, + { + "epoch": 10.156161041073608, + "grad_norm": 0.007588622441124053, + "learning_rate": 2.4067587336377007e-06, + "loss": 0.0001, + "step": 24974 + }, + { + "epoch": 10.156567710451403, + "grad_norm": 0.14652458695799728, + "learning_rate": 2.4060994782980575e-06, + "loss": 0.0019, + "step": 24975 
+ }, + { + "epoch": 10.1569743798292, + "grad_norm": 1.051775089279081, + "learning_rate": 2.405440300913624e-06, + "loss": 0.0097, + "step": 24976 + }, + { + "epoch": 10.157381049206995, + "grad_norm": 0.9423935418308794, + "learning_rate": 2.4047812014911687e-06, + "loss": 0.0084, + "step": 24977 + }, + { + "epoch": 10.15778771858479, + "grad_norm": 0.00518571988158405, + "learning_rate": 2.4041221800374537e-06, + "loss": 0.0001, + "step": 24978 + }, + { + "epoch": 10.158194387962586, + "grad_norm": 0.004012636334009088, + "learning_rate": 2.40346323655925e-06, + "loss": 0.0, + "step": 24979 + }, + { + "epoch": 10.158601057340382, + "grad_norm": 3.765171943129245, + "learning_rate": 2.40280437106332e-06, + "loss": 0.0462, + "step": 24980 + }, + { + "epoch": 10.159007726718178, + "grad_norm": 14.368397511883096, + "learning_rate": 2.4021455835564254e-06, + "loss": 0.1704, + "step": 24981 + }, + { + "epoch": 10.159414396095974, + "grad_norm": 0.018947107722590728, + "learning_rate": 2.401486874045332e-06, + "loss": 0.0002, + "step": 24982 + }, + { + "epoch": 10.15982106547377, + "grad_norm": 0.0006242169677340291, + "learning_rate": 2.400828242536799e-06, + "loss": 0.0, + "step": 24983 + }, + { + "epoch": 10.160227734851565, + "grad_norm": 7.237275541630589, + "learning_rate": 2.400169689037587e-06, + "loss": 0.0529, + "step": 24984 + }, + { + "epoch": 10.160634404229361, + "grad_norm": 1.2677563182399003, + "learning_rate": 2.3995112135544597e-06, + "loss": 0.0062, + "step": 24985 + }, + { + "epoch": 10.161041073607157, + "grad_norm": 0.2705777115136876, + "learning_rate": 2.398852816094176e-06, + "loss": 0.0023, + "step": 24986 + }, + { + "epoch": 10.161447742984953, + "grad_norm": 0.01481172944120563, + "learning_rate": 2.398194496663493e-06, + "loss": 0.0002, + "step": 24987 + }, + { + "epoch": 10.161854412362748, + "grad_norm": 0.0005013713092882512, + "learning_rate": 2.3975362552691695e-06, + "loss": 0.0, + "step": 24988 + }, + { + "epoch": 10.162261081740544, + "grad_norm": 0.0408192112130215, + "learning_rate": 2.3968780919179623e-06, + "loss": 0.0004, + "step": 24989 + }, + { + "epoch": 10.162667751118342, + "grad_norm": 0.005863151390818965, + "learning_rate": 2.3962200066166285e-06, + "loss": 0.0, + "step": 24990 + }, + { + "epoch": 10.163074420496137, + "grad_norm": 2.8804650904082814, + "learning_rate": 2.39556199937192e-06, + "loss": 0.0312, + "step": 24991 + }, + { + "epoch": 10.163481089873933, + "grad_norm": 0.0028598447092762487, + "learning_rate": 2.394904070190599e-06, + "loss": 0.0, + "step": 24992 + }, + { + "epoch": 10.163887759251729, + "grad_norm": 0.09420517543929673, + "learning_rate": 2.3942462190794135e-06, + "loss": 0.0007, + "step": 24993 + }, + { + "epoch": 10.164294428629525, + "grad_norm": 0.007628376409465197, + "learning_rate": 2.39358844604512e-06, + "loss": 0.0001, + "step": 24994 + }, + { + "epoch": 10.16470109800732, + "grad_norm": 0.0011565768981993698, + "learning_rate": 2.392930751094469e-06, + "loss": 0.0, + "step": 24995 + }, + { + "epoch": 10.165107767385116, + "grad_norm": 0.0021793837772151837, + "learning_rate": 2.392273134234211e-06, + "loss": 0.0, + "step": 24996 + }, + { + "epoch": 10.165514436762912, + "grad_norm": 0.01717318175087086, + "learning_rate": 2.391615595471096e-06, + "loss": 0.0001, + "step": 24997 + }, + { + "epoch": 10.165921106140708, + "grad_norm": 0.001539207066829313, + "learning_rate": 2.3909581348118803e-06, + "loss": 0.0, + "step": 24998 + }, + { + "epoch": 10.166327775518504, + "grad_norm": 0.005016412339136246, + 
"learning_rate": 2.390300752263307e-06, + "loss": 0.0001, + "step": 24999 + }, + { + "epoch": 10.1667344448963, + "grad_norm": 0.2322247958710133, + "learning_rate": 2.3896434478321274e-06, + "loss": 0.0023, + "step": 25000 + }, + { + "epoch": 10.167141114274095, + "grad_norm": 0.1018593735676593, + "learning_rate": 2.3889862215250882e-06, + "loss": 0.0008, + "step": 25001 + }, + { + "epoch": 10.16754778365189, + "grad_norm": 0.09606343230229672, + "learning_rate": 2.388329073348936e-06, + "loss": 0.0009, + "step": 25002 + }, + { + "epoch": 10.167954453029687, + "grad_norm": 0.019013767616533714, + "learning_rate": 2.387672003310416e-06, + "loss": 0.0002, + "step": 25003 + }, + { + "epoch": 10.168361122407482, + "grad_norm": 0.18766621038717768, + "learning_rate": 2.3870150114162715e-06, + "loss": 0.0022, + "step": 25004 + }, + { + "epoch": 10.168767791785278, + "grad_norm": 0.009978761353277727, + "learning_rate": 2.3863580976732524e-06, + "loss": 0.0001, + "step": 25005 + }, + { + "epoch": 10.169174461163074, + "grad_norm": 1.2692058133275865, + "learning_rate": 2.385701262088099e-06, + "loss": 0.0087, + "step": 25006 + }, + { + "epoch": 10.16958113054087, + "grad_norm": 0.0007023701975860803, + "learning_rate": 2.3850445046675552e-06, + "loss": 0.0, + "step": 25007 + }, + { + "epoch": 10.169987799918665, + "grad_norm": 0.0031396157513527605, + "learning_rate": 2.3843878254183616e-06, + "loss": 0.0, + "step": 25008 + }, + { + "epoch": 10.170394469296461, + "grad_norm": 0.7840889053361726, + "learning_rate": 2.3837312243472597e-06, + "loss": 0.0057, + "step": 25009 + }, + { + "epoch": 10.170801138674257, + "grad_norm": 0.07652601315539745, + "learning_rate": 2.38307470146099e-06, + "loss": 0.0008, + "step": 25010 + }, + { + "epoch": 10.171207808052054, + "grad_norm": 0.05769777032365702, + "learning_rate": 2.382418256766289e-06, + "loss": 0.0007, + "step": 25011 + }, + { + "epoch": 10.17161447742985, + "grad_norm": 0.0014291354812895369, + "learning_rate": 2.3817618902699014e-06, + "loss": 0.0, + "step": 25012 + }, + { + "epoch": 10.172021146807646, + "grad_norm": 0.002702405176355029, + "learning_rate": 2.3811056019785626e-06, + "loss": 0.0, + "step": 25013 + }, + { + "epoch": 10.172427816185442, + "grad_norm": 0.3940317840511059, + "learning_rate": 2.3804493918990092e-06, + "loss": 0.0046, + "step": 25014 + }, + { + "epoch": 10.172834485563238, + "grad_norm": 0.008015881180854346, + "learning_rate": 2.379793260037977e-06, + "loss": 0.0001, + "step": 25015 + }, + { + "epoch": 10.173241154941033, + "grad_norm": 0.08508829039609635, + "learning_rate": 2.379137206402202e-06, + "loss": 0.0008, + "step": 25016 + }, + { + "epoch": 10.173647824318829, + "grad_norm": 0.007047551297835302, + "learning_rate": 2.3784812309984173e-06, + "loss": 0.0001, + "step": 25017 + }, + { + "epoch": 10.174054493696625, + "grad_norm": 0.0009404856631051761, + "learning_rate": 2.3778253338333612e-06, + "loss": 0.0, + "step": 25018 + }, + { + "epoch": 10.17446116307442, + "grad_norm": 0.20623918007521133, + "learning_rate": 2.377169514913764e-06, + "loss": 0.0023, + "step": 25019 + }, + { + "epoch": 10.174867832452216, + "grad_norm": 0.1100283826535422, + "learning_rate": 2.3765137742463585e-06, + "loss": 0.0011, + "step": 25020 + }, + { + "epoch": 10.175274501830012, + "grad_norm": 0.06979473598290079, + "learning_rate": 2.3758581118378754e-06, + "loss": 0.0007, + "step": 25021 + }, + { + "epoch": 10.175681171207808, + "grad_norm": 0.2844338182252275, + "learning_rate": 2.3752025276950453e-06, + "loss": 0.001, + 
"step": 25022 + }, + { + "epoch": 10.176087840585604, + "grad_norm": 1.7281445770400765, + "learning_rate": 2.3745470218246002e-06, + "loss": 0.0169, + "step": 25023 + }, + { + "epoch": 10.1764945099634, + "grad_norm": 0.702557884430276, + "learning_rate": 2.373891594233264e-06, + "loss": 0.0061, + "step": 25024 + }, + { + "epoch": 10.176901179341195, + "grad_norm": 0.00024251521765017054, + "learning_rate": 2.3732362449277715e-06, + "loss": 0.0, + "step": 25025 + }, + { + "epoch": 10.177307848718991, + "grad_norm": 2.696956448648392, + "learning_rate": 2.3725809739148475e-06, + "loss": 0.0216, + "step": 25026 + }, + { + "epoch": 10.177714518096787, + "grad_norm": 0.22699389384355628, + "learning_rate": 2.37192578120122e-06, + "loss": 0.0026, + "step": 25027 + }, + { + "epoch": 10.178121187474582, + "grad_norm": 0.018346185154737144, + "learning_rate": 2.3712706667936116e-06, + "loss": 0.0001, + "step": 25028 + }, + { + "epoch": 10.178527856852378, + "grad_norm": 2.4803051397336255, + "learning_rate": 2.3706156306987505e-06, + "loss": 0.0369, + "step": 25029 + }, + { + "epoch": 10.178934526230174, + "grad_norm": 0.01823116702809631, + "learning_rate": 2.369960672923356e-06, + "loss": 0.0002, + "step": 25030 + }, + { + "epoch": 10.179341195607972, + "grad_norm": 0.0013693020549679098, + "learning_rate": 2.369305793474158e-06, + "loss": 0.0, + "step": 25031 + }, + { + "epoch": 10.179747864985767, + "grad_norm": 0.2661785524641225, + "learning_rate": 2.368650992357877e-06, + "loss": 0.0014, + "step": 25032 + }, + { + "epoch": 10.180154534363563, + "grad_norm": 0.0011246809592754346, + "learning_rate": 2.3679962695812344e-06, + "loss": 0.0, + "step": 25033 + }, + { + "epoch": 10.180561203741359, + "grad_norm": 0.037670139531545785, + "learning_rate": 2.3673416251509517e-06, + "loss": 0.0003, + "step": 25034 + }, + { + "epoch": 10.180967873119155, + "grad_norm": 0.027420471019465882, + "learning_rate": 2.3666870590737477e-06, + "loss": 0.0003, + "step": 25035 + }, + { + "epoch": 10.18137454249695, + "grad_norm": 0.014486883178535838, + "learning_rate": 2.3660325713563438e-06, + "loss": 0.0002, + "step": 25036 + }, + { + "epoch": 10.181781211874746, + "grad_norm": 0.030965632708245823, + "learning_rate": 2.365378162005454e-06, + "loss": 0.0004, + "step": 25037 + }, + { + "epoch": 10.182187881252542, + "grad_norm": 0.20275382055885274, + "learning_rate": 2.3647238310278053e-06, + "loss": 0.0024, + "step": 25038 + }, + { + "epoch": 10.182594550630338, + "grad_norm": 0.19196488037140982, + "learning_rate": 2.364069578430107e-06, + "loss": 0.0016, + "step": 25039 + }, + { + "epoch": 10.183001220008133, + "grad_norm": 0.025677858770467944, + "learning_rate": 2.363415404219077e-06, + "loss": 0.0002, + "step": 25040 + }, + { + "epoch": 10.18340788938593, + "grad_norm": 0.00063066641892673, + "learning_rate": 2.362761308401432e-06, + "loss": 0.0, + "step": 25041 + }, + { + "epoch": 10.183814558763725, + "grad_norm": 0.11034752887336087, + "learning_rate": 2.3621072909838827e-06, + "loss": 0.0013, + "step": 25042 + }, + { + "epoch": 10.18422122814152, + "grad_norm": 0.0006890769414581709, + "learning_rate": 2.3614533519731486e-06, + "loss": 0.0, + "step": 25043 + }, + { + "epoch": 10.184627897519317, + "grad_norm": 0.2818745903110669, + "learning_rate": 2.3607994913759415e-06, + "loss": 0.0027, + "step": 25044 + }, + { + "epoch": 10.185034566897112, + "grad_norm": 0.14910154656643382, + "learning_rate": 2.360145709198971e-06, + "loss": 0.0008, + "step": 25045 + }, + { + "epoch": 10.185441236274908, + 
"grad_norm": 0.004659002766925661, + "learning_rate": 2.35949200544895e-06, + "loss": 0.0, + "step": 25046 + }, + { + "epoch": 10.185847905652704, + "grad_norm": 0.05487658269158506, + "learning_rate": 2.35883838013259e-06, + "loss": 0.0005, + "step": 25047 + }, + { + "epoch": 10.1862545750305, + "grad_norm": 0.004003891028752034, + "learning_rate": 2.358184833256596e-06, + "loss": 0.0, + "step": 25048 + }, + { + "epoch": 10.186661244408295, + "grad_norm": 0.023332373072906414, + "learning_rate": 2.357531364827683e-06, + "loss": 0.0001, + "step": 25049 + }, + { + "epoch": 10.187067913786091, + "grad_norm": 0.013193693037167274, + "learning_rate": 2.3568779748525583e-06, + "loss": 0.0001, + "step": 25050 + }, + { + "epoch": 10.187474583163887, + "grad_norm": 0.012599909944225308, + "learning_rate": 2.356224663337927e-06, + "loss": 0.0002, + "step": 25051 + }, + { + "epoch": 10.187881252541684, + "grad_norm": 0.0022565094347973377, + "learning_rate": 2.3555714302904966e-06, + "loss": 0.0, + "step": 25052 + }, + { + "epoch": 10.18828792191948, + "grad_norm": 0.0015472817035543, + "learning_rate": 2.354918275716973e-06, + "loss": 0.0, + "step": 25053 + }, + { + "epoch": 10.188694591297276, + "grad_norm": 0.0021209180620070144, + "learning_rate": 2.3542651996240605e-06, + "loss": 0.0, + "step": 25054 + }, + { + "epoch": 10.189101260675072, + "grad_norm": 0.008697842669494667, + "learning_rate": 2.3536122020184617e-06, + "loss": 0.0001, + "step": 25055 + }, + { + "epoch": 10.189507930052867, + "grad_norm": 0.032575152328290946, + "learning_rate": 2.3529592829068848e-06, + "loss": 0.0002, + "step": 25056 + }, + { + "epoch": 10.189914599430663, + "grad_norm": 0.4704702852582684, + "learning_rate": 2.3523064422960297e-06, + "loss": 0.0054, + "step": 25057 + }, + { + "epoch": 10.190321268808459, + "grad_norm": 0.39185474279856275, + "learning_rate": 2.351653680192597e-06, + "loss": 0.0037, + "step": 25058 + }, + { + "epoch": 10.190727938186255, + "grad_norm": 6.386968536771077, + "learning_rate": 2.3510009966032897e-06, + "loss": 0.0774, + "step": 25059 + }, + { + "epoch": 10.19113460756405, + "grad_norm": 0.007389558846971102, + "learning_rate": 2.3503483915348057e-06, + "loss": 0.0001, + "step": 25060 + }, + { + "epoch": 10.191541276941846, + "grad_norm": 11.701444169010447, + "learning_rate": 2.349695864993844e-06, + "loss": 0.1596, + "step": 25061 + }, + { + "epoch": 10.191947946319642, + "grad_norm": 0.422355774443403, + "learning_rate": 2.3490434169871058e-06, + "loss": 0.0039, + "step": 25062 + }, + { + "epoch": 10.192354615697438, + "grad_norm": 0.0032907859196118034, + "learning_rate": 2.348391047521289e-06, + "loss": 0.0, + "step": 25063 + }, + { + "epoch": 10.192761285075234, + "grad_norm": 1.5883854071923549, + "learning_rate": 2.3477387566030883e-06, + "loss": 0.0157, + "step": 25064 + }, + { + "epoch": 10.19316795445303, + "grad_norm": 0.11657651074602375, + "learning_rate": 2.347086544239201e-06, + "loss": 0.0008, + "step": 25065 + }, + { + "epoch": 10.193574623830825, + "grad_norm": 0.017284016622122738, + "learning_rate": 2.3464344104363213e-06, + "loss": 0.0002, + "step": 25066 + }, + { + "epoch": 10.193981293208621, + "grad_norm": 0.2855327266655499, + "learning_rate": 2.345782355201145e-06, + "loss": 0.0018, + "step": 25067 + }, + { + "epoch": 10.194387962586417, + "grad_norm": 0.18689627728301653, + "learning_rate": 2.345130378540362e-06, + "loss": 0.0015, + "step": 25068 + }, + { + "epoch": 10.194794631964212, + "grad_norm": 0.00024294013827483608, + "learning_rate": 
2.344478480460671e-06, + "loss": 0.0, + "step": 25069 + }, + { + "epoch": 10.195201301342008, + "grad_norm": 0.011854368628241725, + "learning_rate": 2.343826660968761e-06, + "loss": 0.0001, + "step": 25070 + }, + { + "epoch": 10.195607970719804, + "grad_norm": 0.022761834704761126, + "learning_rate": 2.343174920071324e-06, + "loss": 0.0003, + "step": 25071 + }, + { + "epoch": 10.196014640097602, + "grad_norm": 0.07353198931963405, + "learning_rate": 2.34252325777505e-06, + "loss": 0.0007, + "step": 25072 + }, + { + "epoch": 10.196421309475397, + "grad_norm": 0.05468603999725038, + "learning_rate": 2.341871674086629e-06, + "loss": 0.0004, + "step": 25073 + }, + { + "epoch": 10.196827978853193, + "grad_norm": 0.01490718127333699, + "learning_rate": 2.341220169012747e-06, + "loss": 0.0001, + "step": 25074 + }, + { + "epoch": 10.197234648230989, + "grad_norm": 0.886352511480678, + "learning_rate": 2.3405687425600963e-06, + "loss": 0.0032, + "step": 25075 + }, + { + "epoch": 10.197641317608785, + "grad_norm": 0.08458740186908434, + "learning_rate": 2.3399173947353627e-06, + "loss": 0.0008, + "step": 25076 + }, + { + "epoch": 10.19804798698658, + "grad_norm": 0.005219506651373123, + "learning_rate": 2.339266125545233e-06, + "loss": 0.0, + "step": 25077 + }, + { + "epoch": 10.198454656364376, + "grad_norm": 0.0009326887361399848, + "learning_rate": 2.3386149349963916e-06, + "loss": 0.0, + "step": 25078 + }, + { + "epoch": 10.198861325742172, + "grad_norm": 0.3940865522177333, + "learning_rate": 2.3379638230955238e-06, + "loss": 0.004, + "step": 25079 + }, + { + "epoch": 10.199267995119968, + "grad_norm": 0.19380141038244975, + "learning_rate": 2.337312789849313e-06, + "loss": 0.0019, + "step": 25080 + }, + { + "epoch": 10.199674664497763, + "grad_norm": 0.003743914330538637, + "learning_rate": 2.336661835264441e-06, + "loss": 0.0, + "step": 25081 + }, + { + "epoch": 10.20008133387556, + "grad_norm": 0.17889418192354936, + "learning_rate": 2.336010959347594e-06, + "loss": 0.0017, + "step": 25082 + }, + { + "epoch": 10.200488003253355, + "grad_norm": 0.1958844033696757, + "learning_rate": 2.3353601621054533e-06, + "loss": 0.002, + "step": 25083 + }, + { + "epoch": 10.20089467263115, + "grad_norm": 0.038199616799439, + "learning_rate": 2.334709443544697e-06, + "loss": 0.0002, + "step": 25084 + }, + { + "epoch": 10.201301342008946, + "grad_norm": 0.9554978368846512, + "learning_rate": 2.334058803672006e-06, + "loss": 0.0077, + "step": 25085 + }, + { + "epoch": 10.201708011386742, + "grad_norm": 0.0010535302491087694, + "learning_rate": 2.3334082424940595e-06, + "loss": 0.0, + "step": 25086 + }, + { + "epoch": 10.202114680764538, + "grad_norm": 0.07932091681288206, + "learning_rate": 2.3327577600175344e-06, + "loss": 0.0007, + "step": 25087 + }, + { + "epoch": 10.202521350142334, + "grad_norm": 0.10473078775972831, + "learning_rate": 2.3321073562491116e-06, + "loss": 0.0006, + "step": 25088 + }, + { + "epoch": 10.20292801952013, + "grad_norm": 0.004801263407451466, + "learning_rate": 2.3314570311954653e-06, + "loss": 0.0001, + "step": 25089 + }, + { + "epoch": 10.203334688897925, + "grad_norm": 0.0033310672454376225, + "learning_rate": 2.3308067848632734e-06, + "loss": 0.0, + "step": 25090 + }, + { + "epoch": 10.203741358275721, + "grad_norm": 0.1395604336700734, + "learning_rate": 2.3301566172592084e-06, + "loss": 0.0014, + "step": 25091 + }, + { + "epoch": 10.204148027653517, + "grad_norm": 0.17090914327114046, + "learning_rate": 2.3295065283899467e-06, + "loss": 0.0015, + "step": 25092 + }, + { + 
"epoch": 10.204554697031314, + "grad_norm": 0.0089471738400545, + "learning_rate": 2.3288565182621604e-06, + "loss": 0.0001, + "step": 25093 + }, + { + "epoch": 10.20496136640911, + "grad_norm": 0.006863339778827529, + "learning_rate": 2.3282065868825198e-06, + "loss": 0.0001, + "step": 25094 + }, + { + "epoch": 10.205368035786906, + "grad_norm": 0.10884691481660319, + "learning_rate": 2.3275567342577033e-06, + "loss": 0.0007, + "step": 25095 + }, + { + "epoch": 10.205774705164702, + "grad_norm": 1.808180979352865, + "learning_rate": 2.3269069603943785e-06, + "loss": 0.0139, + "step": 25096 + }, + { + "epoch": 10.206181374542497, + "grad_norm": 0.10097059442192728, + "learning_rate": 2.3262572652992143e-06, + "loss": 0.0012, + "step": 25097 + }, + { + "epoch": 10.206588043920293, + "grad_norm": 3.5516816359965495, + "learning_rate": 2.3256076489788825e-06, + "loss": 0.0171, + "step": 25098 + }, + { + "epoch": 10.206994713298089, + "grad_norm": 0.010817983745063516, + "learning_rate": 2.324958111440051e-06, + "loss": 0.0001, + "step": 25099 + }, + { + "epoch": 10.207401382675885, + "grad_norm": 0.006666270711385343, + "learning_rate": 2.324308652689383e-06, + "loss": 0.0001, + "step": 25100 + }, + { + "epoch": 10.20780805205368, + "grad_norm": 0.04756178332303628, + "learning_rate": 2.3236592727335538e-06, + "loss": 0.0005, + "step": 25101 + }, + { + "epoch": 10.208214721431476, + "grad_norm": 0.003275101014454815, + "learning_rate": 2.323009971579225e-06, + "loss": 0.0, + "step": 25102 + }, + { + "epoch": 10.208621390809272, + "grad_norm": 0.2583060825724206, + "learning_rate": 2.3223607492330626e-06, + "loss": 0.0017, + "step": 25103 + }, + { + "epoch": 10.209028060187068, + "grad_norm": 0.0027369786431129064, + "learning_rate": 2.321711605701731e-06, + "loss": 0.0, + "step": 25104 + }, + { + "epoch": 10.209434729564864, + "grad_norm": 0.004260197578534726, + "learning_rate": 2.3210625409918952e-06, + "loss": 0.0001, + "step": 25105 + }, + { + "epoch": 10.20984139894266, + "grad_norm": 0.04782792845743789, + "learning_rate": 2.320413555110216e-06, + "loss": 0.0005, + "step": 25106 + }, + { + "epoch": 10.210248068320455, + "grad_norm": 0.0013241190324735632, + "learning_rate": 2.319764648063354e-06, + "loss": 0.0, + "step": 25107 + }, + { + "epoch": 10.21065473769825, + "grad_norm": 0.36574199681168995, + "learning_rate": 2.3191158198579765e-06, + "loss": 0.0041, + "step": 25108 + }, + { + "epoch": 10.211061407076047, + "grad_norm": 1.7236915637126415, + "learning_rate": 2.3184670705007406e-06, + "loss": 0.0199, + "step": 25109 + }, + { + "epoch": 10.211468076453842, + "grad_norm": 0.01425213058956353, + "learning_rate": 2.3178183999983063e-06, + "loss": 0.0002, + "step": 25110 + }, + { + "epoch": 10.211874745831638, + "grad_norm": 0.016497612962974827, + "learning_rate": 2.3171698083573325e-06, + "loss": 0.0002, + "step": 25111 + }, + { + "epoch": 10.212281415209434, + "grad_norm": 0.0010601559951427713, + "learning_rate": 2.3165212955844773e-06, + "loss": 0.0, + "step": 25112 + }, + { + "epoch": 10.212688084587231, + "grad_norm": 0.0038041937932629982, + "learning_rate": 2.315872861686399e-06, + "loss": 0.0, + "step": 25113 + }, + { + "epoch": 10.213094753965027, + "grad_norm": 0.00032567640408816707, + "learning_rate": 2.3152245066697486e-06, + "loss": 0.0, + "step": 25114 + }, + { + "epoch": 10.213501423342823, + "grad_norm": 0.03687752241208065, + "learning_rate": 2.31457623054119e-06, + "loss": 0.0005, + "step": 25115 + }, + { + "epoch": 10.213908092720619, + "grad_norm": 
0.4013839703976688, + "learning_rate": 2.313928033307374e-06, + "loss": 0.0039, + "step": 25116 + }, + { + "epoch": 10.214314762098414, + "grad_norm": 0.013004178182330992, + "learning_rate": 2.3132799149749554e-06, + "loss": 0.0002, + "step": 25117 + }, + { + "epoch": 10.21472143147621, + "grad_norm": 0.011221130727752231, + "learning_rate": 2.3126318755505873e-06, + "loss": 0.0001, + "step": 25118 + }, + { + "epoch": 10.215128100854006, + "grad_norm": 2.188929180443585, + "learning_rate": 2.3119839150409217e-06, + "loss": 0.0202, + "step": 25119 + }, + { + "epoch": 10.215534770231802, + "grad_norm": 0.027939818887987283, + "learning_rate": 2.311336033452608e-06, + "loss": 0.0004, + "step": 25120 + }, + { + "epoch": 10.215941439609598, + "grad_norm": 0.06679084587412208, + "learning_rate": 2.310688230792302e-06, + "loss": 0.0004, + "step": 25121 + }, + { + "epoch": 10.216348108987393, + "grad_norm": 0.02629886122544921, + "learning_rate": 2.310040507066651e-06, + "loss": 0.0001, + "step": 25122 + }, + { + "epoch": 10.216754778365189, + "grad_norm": 0.044704116548507526, + "learning_rate": 2.3093928622823047e-06, + "loss": 0.0003, + "step": 25123 + }, + { + "epoch": 10.217161447742985, + "grad_norm": 0.005834867177246891, + "learning_rate": 2.308745296445911e-06, + "loss": 0.0001, + "step": 25124 + }, + { + "epoch": 10.21756811712078, + "grad_norm": 0.07286935323201024, + "learning_rate": 2.3080978095641193e-06, + "loss": 0.0009, + "step": 25125 + }, + { + "epoch": 10.217974786498576, + "grad_norm": 0.18412965225488467, + "learning_rate": 2.307450401643573e-06, + "loss": 0.0018, + "step": 25126 + }, + { + "epoch": 10.218381455876372, + "grad_norm": 0.004975096402416128, + "learning_rate": 2.3068030726909174e-06, + "loss": 0.0, + "step": 25127 + }, + { + "epoch": 10.218788125254168, + "grad_norm": 0.0175876518718086, + "learning_rate": 2.3061558227128033e-06, + "loss": 0.0002, + "step": 25128 + }, + { + "epoch": 10.219194794631964, + "grad_norm": 0.01800256916817072, + "learning_rate": 2.305508651715872e-06, + "loss": 0.0003, + "step": 25129 + }, + { + "epoch": 10.21960146400976, + "grad_norm": 0.07061937144091274, + "learning_rate": 2.304861559706767e-06, + "loss": 0.0006, + "step": 25130 + }, + { + "epoch": 10.220008133387555, + "grad_norm": 0.01998920650075079, + "learning_rate": 2.3042145466921317e-06, + "loss": 0.0003, + "step": 25131 + }, + { + "epoch": 10.220414802765351, + "grad_norm": 0.005388169411997051, + "learning_rate": 2.3035676126786065e-06, + "loss": 0.0001, + "step": 25132 + }, + { + "epoch": 10.220821472143147, + "grad_norm": 0.06168221716113768, + "learning_rate": 2.3029207576728294e-06, + "loss": 0.0002, + "step": 25133 + }, + { + "epoch": 10.221228141520944, + "grad_norm": 0.037554327639294525, + "learning_rate": 2.302273981681449e-06, + "loss": 0.0006, + "step": 25134 + }, + { + "epoch": 10.22163481089874, + "grad_norm": 0.07776064682561988, + "learning_rate": 2.3016272847111e-06, + "loss": 0.0008, + "step": 25135 + }, + { + "epoch": 10.222041480276536, + "grad_norm": 0.03594589706620574, + "learning_rate": 2.3009806667684207e-06, + "loss": 0.0004, + "step": 25136 + }, + { + "epoch": 10.222448149654332, + "grad_norm": 1.0149251512467137, + "learning_rate": 2.300334127860051e-06, + "loss": 0.0103, + "step": 25137 + }, + { + "epoch": 10.222854819032127, + "grad_norm": 0.013512257914327799, + "learning_rate": 2.299687667992625e-06, + "loss": 0.0001, + "step": 25138 + }, + { + "epoch": 10.223261488409923, + "grad_norm": 0.007386796863172839, + "learning_rate": 
2.299041287172782e-06, + "loss": 0.0001, + "step": 25139 + }, + { + "epoch": 10.223668157787719, + "grad_norm": 0.25280112891848433, + "learning_rate": 2.2983949854071554e-06, + "loss": 0.0027, + "step": 25140 + }, + { + "epoch": 10.224074827165515, + "grad_norm": 5.745437335704349, + "learning_rate": 2.297748762702381e-06, + "loss": 0.1323, + "step": 25141 + }, + { + "epoch": 10.22448149654331, + "grad_norm": 1.3362274698754752, + "learning_rate": 2.2971026190650914e-06, + "loss": 0.0096, + "step": 25142 + }, + { + "epoch": 10.224888165921106, + "grad_norm": 0.009919862461407636, + "learning_rate": 2.29645655450192e-06, + "loss": 0.0001, + "step": 25143 + }, + { + "epoch": 10.225294835298902, + "grad_norm": 0.0326759430266546, + "learning_rate": 2.295810569019499e-06, + "loss": 0.0002, + "step": 25144 + }, + { + "epoch": 10.225701504676698, + "grad_norm": 0.007632668382400337, + "learning_rate": 2.2951646626244572e-06, + "loss": 0.0001, + "step": 25145 + }, + { + "epoch": 10.226108174054493, + "grad_norm": 0.013092541370514172, + "learning_rate": 2.2945188353234315e-06, + "loss": 0.0001, + "step": 25146 + }, + { + "epoch": 10.22651484343229, + "grad_norm": 0.019336658536689838, + "learning_rate": 2.2938730871230473e-06, + "loss": 0.0001, + "step": 25147 + }, + { + "epoch": 10.226921512810085, + "grad_norm": 0.05014834266454419, + "learning_rate": 2.293227418029935e-06, + "loss": 0.0007, + "step": 25148 + }, + { + "epoch": 10.22732818218788, + "grad_norm": 0.009190637058717671, + "learning_rate": 2.2925818280507216e-06, + "loss": 0.0001, + "step": 25149 + }, + { + "epoch": 10.227734851565677, + "grad_norm": 0.004574232698680207, + "learning_rate": 2.291936317192035e-06, + "loss": 0.0, + "step": 25150 + }, + { + "epoch": 10.228141520943472, + "grad_norm": 0.030614221275567118, + "learning_rate": 2.2912908854604986e-06, + "loss": 0.0004, + "step": 25151 + }, + { + "epoch": 10.228548190321268, + "grad_norm": 0.11292092393146198, + "learning_rate": 2.290645532862743e-06, + "loss": 0.0015, + "step": 25152 + }, + { + "epoch": 10.228954859699064, + "grad_norm": 0.15121993420315985, + "learning_rate": 2.2900002594053915e-06, + "loss": 0.0011, + "step": 25153 + }, + { + "epoch": 10.229361529076861, + "grad_norm": 0.01042326405003812, + "learning_rate": 2.289355065095068e-06, + "loss": 0.0001, + "step": 25154 + }, + { + "epoch": 10.229768198454657, + "grad_norm": 0.5491553929836072, + "learning_rate": 2.288709949938396e-06, + "loss": 0.0053, + "step": 25155 + }, + { + "epoch": 10.230174867832453, + "grad_norm": 0.007745074745876639, + "learning_rate": 2.288064913941996e-06, + "loss": 0.0001, + "step": 25156 + }, + { + "epoch": 10.230581537210249, + "grad_norm": 0.03614215737005333, + "learning_rate": 2.287419957112492e-06, + "loss": 0.0004, + "step": 25157 + }, + { + "epoch": 10.230988206588044, + "grad_norm": 0.0037207249293117156, + "learning_rate": 2.2867750794565014e-06, + "loss": 0.0, + "step": 25158 + }, + { + "epoch": 10.23139487596584, + "grad_norm": 2.3134700622885944, + "learning_rate": 2.2861302809806483e-06, + "loss": 0.0337, + "step": 25159 + }, + { + "epoch": 10.231801545343636, + "grad_norm": 0.01712700626120134, + "learning_rate": 2.2854855616915507e-06, + "loss": 0.0002, + "step": 25160 + }, + { + "epoch": 10.232208214721432, + "grad_norm": 0.31095274291617314, + "learning_rate": 2.2848409215958257e-06, + "loss": 0.0022, + "step": 25161 + }, + { + "epoch": 10.232614884099227, + "grad_norm": 0.04203430925669046, + "learning_rate": 2.2841963607000926e-06, + "loss": 0.0006, + "step": 
25162 + }, + { + "epoch": 10.233021553477023, + "grad_norm": 0.011908244099036997, + "learning_rate": 2.2835518790109666e-06, + "loss": 0.0001, + "step": 25163 + }, + { + "epoch": 10.233428222854819, + "grad_norm": 0.0013563322028047494, + "learning_rate": 2.282907476535061e-06, + "loss": 0.0, + "step": 25164 + }, + { + "epoch": 10.233834892232615, + "grad_norm": 0.06148706587549987, + "learning_rate": 2.2822631532789985e-06, + "loss": 0.0005, + "step": 25165 + }, + { + "epoch": 10.23424156161041, + "grad_norm": 0.24424767437590428, + "learning_rate": 2.281618909249388e-06, + "loss": 0.0022, + "step": 25166 + }, + { + "epoch": 10.234648230988206, + "grad_norm": 0.11933606939280887, + "learning_rate": 2.280974744452843e-06, + "loss": 0.0016, + "step": 25167 + }, + { + "epoch": 10.235054900366002, + "grad_norm": 0.05036123146502037, + "learning_rate": 2.2803306588959784e-06, + "loss": 0.0004, + "step": 25168 + }, + { + "epoch": 10.235461569743798, + "grad_norm": 0.12915423723628244, + "learning_rate": 2.2796866525854046e-06, + "loss": 0.0012, + "step": 25169 + }, + { + "epoch": 10.235868239121594, + "grad_norm": 0.9023612822714577, + "learning_rate": 2.2790427255277324e-06, + "loss": 0.003, + "step": 25170 + }, + { + "epoch": 10.23627490849939, + "grad_norm": 0.006253319429077117, + "learning_rate": 2.2783988777295695e-06, + "loss": 0.0001, + "step": 25171 + }, + { + "epoch": 10.236681577877185, + "grad_norm": 0.09211137778415261, + "learning_rate": 2.2777551091975325e-06, + "loss": 0.0009, + "step": 25172 + }, + { + "epoch": 10.237088247254981, + "grad_norm": 0.006233135373428621, + "learning_rate": 2.2771114199382247e-06, + "loss": 0.0001, + "step": 25173 + }, + { + "epoch": 10.237494916632777, + "grad_norm": 0.002981156787513434, + "learning_rate": 2.276467809958255e-06, + "loss": 0.0, + "step": 25174 + }, + { + "epoch": 10.237901586010574, + "grad_norm": 0.000952130520546988, + "learning_rate": 2.2758242792642306e-06, + "loss": 0.0, + "step": 25175 + }, + { + "epoch": 10.23830825538837, + "grad_norm": 0.26836703913133125, + "learning_rate": 2.2751808278627575e-06, + "loss": 0.0026, + "step": 25176 + }, + { + "epoch": 10.238714924766166, + "grad_norm": 0.032429244314098105, + "learning_rate": 2.2745374557604382e-06, + "loss": 0.0002, + "step": 25177 + }, + { + "epoch": 10.239121594143962, + "grad_norm": 2.502601414984579, + "learning_rate": 2.2738941629638834e-06, + "loss": 0.0267, + "step": 25178 + }, + { + "epoch": 10.239528263521757, + "grad_norm": 0.1500483212829425, + "learning_rate": 2.2732509494796927e-06, + "loss": 0.0015, + "step": 25179 + }, + { + "epoch": 10.239934932899553, + "grad_norm": 0.003046242588566382, + "learning_rate": 2.2726078153144704e-06, + "loss": 0.0, + "step": 25180 + }, + { + "epoch": 10.240341602277349, + "grad_norm": 0.18013692034552475, + "learning_rate": 2.2719647604748175e-06, + "loss": 0.0019, + "step": 25181 + }, + { + "epoch": 10.240748271655145, + "grad_norm": 0.012193381392859215, + "learning_rate": 2.271321784967336e-06, + "loss": 0.0002, + "step": 25182 + }, + { + "epoch": 10.24115494103294, + "grad_norm": 0.02079187172307357, + "learning_rate": 2.2706788887986265e-06, + "loss": 0.0002, + "step": 25183 + }, + { + "epoch": 10.241561610410736, + "grad_norm": 29.145066793603775, + "learning_rate": 2.2700360719752847e-06, + "loss": 0.0832, + "step": 25184 + }, + { + "epoch": 10.241968279788532, + "grad_norm": 0.03407062669657634, + "learning_rate": 2.2693933345039164e-06, + "loss": 0.0004, + "step": 25185 + }, + { + "epoch": 10.242374949166328, + 
"grad_norm": 0.02135331629869575, + "learning_rate": 2.268750676391116e-06, + "loss": 0.0001, + "step": 25186 + }, + { + "epoch": 10.242781618544123, + "grad_norm": 0.010283509319894685, + "learning_rate": 2.268108097643481e-06, + "loss": 0.0001, + "step": 25187 + }, + { + "epoch": 10.24318828792192, + "grad_norm": 0.06188063884127905, + "learning_rate": 2.2674655982676084e-06, + "loss": 0.0007, + "step": 25188 + }, + { + "epoch": 10.243594957299715, + "grad_norm": 0.01857241900014226, + "learning_rate": 2.266823178270092e-06, + "loss": 0.0002, + "step": 25189 + }, + { + "epoch": 10.24400162667751, + "grad_norm": 0.038063943318083035, + "learning_rate": 2.2661808376575257e-06, + "loss": 0.0002, + "step": 25190 + }, + { + "epoch": 10.244408296055306, + "grad_norm": 0.3059902938457662, + "learning_rate": 2.2655385764365078e-06, + "loss": 0.0049, + "step": 25191 + }, + { + "epoch": 10.244814965433102, + "grad_norm": 0.0007107738534563738, + "learning_rate": 2.264896394613629e-06, + "loss": 0.0, + "step": 25192 + }, + { + "epoch": 10.245221634810898, + "grad_norm": 0.0027190932079434833, + "learning_rate": 2.2642542921954824e-06, + "loss": 0.0, + "step": 25193 + }, + { + "epoch": 10.245628304188694, + "grad_norm": 0.054608221047305686, + "learning_rate": 2.2636122691886585e-06, + "loss": 0.0005, + "step": 25194 + }, + { + "epoch": 10.246034973566491, + "grad_norm": 0.007130821030317036, + "learning_rate": 2.262970325599748e-06, + "loss": 0.0001, + "step": 25195 + }, + { + "epoch": 10.246441642944287, + "grad_norm": 0.026590304781452606, + "learning_rate": 2.262328461435341e-06, + "loss": 0.0003, + "step": 25196 + }, + { + "epoch": 10.246848312322083, + "grad_norm": 0.0050363468474949, + "learning_rate": 2.261686676702025e-06, + "loss": 0.0001, + "step": 25197 + }, + { + "epoch": 10.247254981699879, + "grad_norm": 0.010588888300617804, + "learning_rate": 2.261044971406391e-06, + "loss": 0.0001, + "step": 25198 + }, + { + "epoch": 10.247661651077674, + "grad_norm": 0.024134125820540062, + "learning_rate": 2.2604033455550265e-06, + "loss": 0.0002, + "step": 25199 + }, + { + "epoch": 10.24806832045547, + "grad_norm": 0.00013310799415282363, + "learning_rate": 2.259761799154516e-06, + "loss": 0.0, + "step": 25200 + }, + { + "epoch": 10.248474989833266, + "grad_norm": 0.012713400628701011, + "learning_rate": 2.2591203322114475e-06, + "loss": 0.0001, + "step": 25201 + }, + { + "epoch": 10.248881659211062, + "grad_norm": 0.014674449511593298, + "learning_rate": 2.258478944732404e-06, + "loss": 0.0002, + "step": 25202 + }, + { + "epoch": 10.249288328588857, + "grad_norm": 0.12419695282076472, + "learning_rate": 2.2578376367239675e-06, + "loss": 0.0012, + "step": 25203 + }, + { + "epoch": 10.249694997966653, + "grad_norm": 0.0355037468448031, + "learning_rate": 2.2571964081927265e-06, + "loss": 0.0003, + "step": 25204 + }, + { + "epoch": 10.250101667344449, + "grad_norm": 3.082797846563541, + "learning_rate": 2.256555259145262e-06, + "loss": 0.0328, + "step": 25205 + }, + { + "epoch": 10.250508336722245, + "grad_norm": 0.13156714504571826, + "learning_rate": 2.2559141895881543e-06, + "loss": 0.001, + "step": 25206 + }, + { + "epoch": 10.25091500610004, + "grad_norm": 0.2401527598600935, + "learning_rate": 2.2552731995279854e-06, + "loss": 0.0024, + "step": 25207 + }, + { + "epoch": 10.251321675477836, + "grad_norm": 0.0017971720483391663, + "learning_rate": 2.2546322889713344e-06, + "loss": 0.0, + "step": 25208 + }, + { + "epoch": 10.251728344855632, + "grad_norm": 0.08482746737633785, + 
"learning_rate": 2.2539914579247823e-06, + "loss": 0.0006, + "step": 25209 + }, + { + "epoch": 10.252135014233428, + "grad_norm": 0.0439397588323079, + "learning_rate": 2.2533507063949033e-06, + "loss": 0.0006, + "step": 25210 + }, + { + "epoch": 10.252541683611224, + "grad_norm": 0.0008621399114545779, + "learning_rate": 2.25271003438828e-06, + "loss": 0.0, + "step": 25211 + }, + { + "epoch": 10.25294835298902, + "grad_norm": 0.045213562819450395, + "learning_rate": 2.2520694419114875e-06, + "loss": 0.0006, + "step": 25212 + }, + { + "epoch": 10.253355022366815, + "grad_norm": 0.0009247978364010406, + "learning_rate": 2.251428928971102e-06, + "loss": 0.0, + "step": 25213 + }, + { + "epoch": 10.25376169174461, + "grad_norm": 0.6128934863752653, + "learning_rate": 2.250788495573698e-06, + "loss": 0.0056, + "step": 25214 + }, + { + "epoch": 10.254168361122407, + "grad_norm": 0.012062465307936185, + "learning_rate": 2.250148141725851e-06, + "loss": 0.0001, + "step": 25215 + }, + { + "epoch": 10.254575030500204, + "grad_norm": 0.00014197618087537814, + "learning_rate": 2.2495078674341307e-06, + "loss": 0.0, + "step": 25216 + }, + { + "epoch": 10.254981699878, + "grad_norm": 0.001991958137420403, + "learning_rate": 2.248867672705115e-06, + "loss": 0.0, + "step": 25217 + }, + { + "epoch": 10.255388369255796, + "grad_norm": 0.015494838433545328, + "learning_rate": 2.248227557545374e-06, + "loss": 0.0001, + "step": 25218 + }, + { + "epoch": 10.255795038633591, + "grad_norm": 0.21885515151344795, + "learning_rate": 2.2475875219614784e-06, + "loss": 0.0021, + "step": 25219 + }, + { + "epoch": 10.256201708011387, + "grad_norm": 0.00435230203057012, + "learning_rate": 2.2469475659599993e-06, + "loss": 0.0001, + "step": 25220 + }, + { + "epoch": 10.256608377389183, + "grad_norm": 1.8357395265435439, + "learning_rate": 2.2463076895475046e-06, + "loss": 0.0234, + "step": 25221 + }, + { + "epoch": 10.257015046766979, + "grad_norm": 1.0250391219159325, + "learning_rate": 2.245667892730564e-06, + "loss": 0.01, + "step": 25222 + }, + { + "epoch": 10.257421716144774, + "grad_norm": 0.012011387496187251, + "learning_rate": 2.2450281755157435e-06, + "loss": 0.0001, + "step": 25223 + }, + { + "epoch": 10.25782838552257, + "grad_norm": 0.01747708706905156, + "learning_rate": 2.244388537909613e-06, + "loss": 0.0002, + "step": 25224 + }, + { + "epoch": 10.258235054900366, + "grad_norm": 0.07648021950758842, + "learning_rate": 2.2437489799187382e-06, + "loss": 0.0008, + "step": 25225 + }, + { + "epoch": 10.258641724278162, + "grad_norm": 0.0049448271170879415, + "learning_rate": 2.243109501549684e-06, + "loss": 0.0001, + "step": 25226 + }, + { + "epoch": 10.259048393655958, + "grad_norm": 0.5787412059762801, + "learning_rate": 2.2424701028090133e-06, + "loss": 0.0046, + "step": 25227 + }, + { + "epoch": 10.259455063033753, + "grad_norm": 0.23762809751493866, + "learning_rate": 2.2418307837032926e-06, + "loss": 0.0018, + "step": 25228 + }, + { + "epoch": 10.259861732411549, + "grad_norm": 0.09453871657309183, + "learning_rate": 2.241191544239083e-06, + "loss": 0.0007, + "step": 25229 + }, + { + "epoch": 10.260268401789345, + "grad_norm": 0.10639381345860816, + "learning_rate": 2.240552384422945e-06, + "loss": 0.0012, + "step": 25230 + }, + { + "epoch": 10.26067507116714, + "grad_norm": 0.08580813692049676, + "learning_rate": 2.2399133042614443e-06, + "loss": 0.0009, + "step": 25231 + }, + { + "epoch": 10.261081740544936, + "grad_norm": 1.191810138829259, + "learning_rate": 2.23927430376114e-06, + "loss": 0.0114, + 
"step": 25232 + }, + { + "epoch": 10.261488409922732, + "grad_norm": 0.02601038701973738, + "learning_rate": 2.238635382928589e-06, + "loss": 0.0003, + "step": 25233 + }, + { + "epoch": 10.261895079300528, + "grad_norm": 0.015821814223134043, + "learning_rate": 2.2379965417703532e-06, + "loss": 0.0002, + "step": 25234 + }, + { + "epoch": 10.262301748678324, + "grad_norm": 0.0034842641874518465, + "learning_rate": 2.2373577802929892e-06, + "loss": 0.0, + "step": 25235 + }, + { + "epoch": 10.262708418056121, + "grad_norm": 0.022784839061184888, + "learning_rate": 2.2367190985030517e-06, + "loss": 0.0002, + "step": 25236 + }, + { + "epoch": 10.263115087433917, + "grad_norm": 0.05644008556581004, + "learning_rate": 2.2360804964071036e-06, + "loss": 0.0006, + "step": 25237 + }, + { + "epoch": 10.263521756811713, + "grad_norm": 0.5501518670210501, + "learning_rate": 2.235441974011698e-06, + "loss": 0.008, + "step": 25238 + }, + { + "epoch": 10.263928426189509, + "grad_norm": 0.10116414863601297, + "learning_rate": 2.234803531323386e-06, + "loss": 0.001, + "step": 25239 + }, + { + "epoch": 10.264335095567304, + "grad_norm": 0.006831311316869032, + "learning_rate": 2.2341651683487243e-06, + "loss": 0.0001, + "step": 25240 + }, + { + "epoch": 10.2647417649451, + "grad_norm": 0.16631774620791395, + "learning_rate": 2.233526885094263e-06, + "loss": 0.0015, + "step": 25241 + }, + { + "epoch": 10.265148434322896, + "grad_norm": 0.0023852838619491425, + "learning_rate": 2.2328886815665585e-06, + "loss": 0.0, + "step": 25242 + }, + { + "epoch": 10.265555103700692, + "grad_norm": 0.08019043796748294, + "learning_rate": 2.232250557772162e-06, + "loss": 0.001, + "step": 25243 + }, + { + "epoch": 10.265961773078487, + "grad_norm": 0.00391753849996964, + "learning_rate": 2.2316125137176227e-06, + "loss": 0.0, + "step": 25244 + }, + { + "epoch": 10.266368442456283, + "grad_norm": 0.4066276127292603, + "learning_rate": 2.2309745494094903e-06, + "loss": 0.003, + "step": 25245 + }, + { + "epoch": 10.266775111834079, + "grad_norm": 0.014367031862479217, + "learning_rate": 2.2303366648543145e-06, + "loss": 0.0001, + "step": 25246 + }, + { + "epoch": 10.267181781211875, + "grad_norm": 0.00043328140111212587, + "learning_rate": 2.229698860058643e-06, + "loss": 0.0, + "step": 25247 + }, + { + "epoch": 10.26758845058967, + "grad_norm": 0.008008141741751093, + "learning_rate": 2.2290611350290216e-06, + "loss": 0.0001, + "step": 25248 + }, + { + "epoch": 10.267995119967466, + "grad_norm": 0.0002540888123628386, + "learning_rate": 2.2284234897720003e-06, + "loss": 0.0, + "step": 25249 + }, + { + "epoch": 10.268401789345262, + "grad_norm": 0.009926176759620988, + "learning_rate": 2.2277859242941236e-06, + "loss": 0.0001, + "step": 25250 + }, + { + "epoch": 10.268808458723058, + "grad_norm": 0.0016722579970757413, + "learning_rate": 2.227148438601936e-06, + "loss": 0.0, + "step": 25251 + }, + { + "epoch": 10.269215128100853, + "grad_norm": 0.0022178262184993905, + "learning_rate": 2.2265110327019823e-06, + "loss": 0.0, + "step": 25252 + }, + { + "epoch": 10.26962179747865, + "grad_norm": 0.002061550540162815, + "learning_rate": 2.225873706600804e-06, + "loss": 0.0, + "step": 25253 + }, + { + "epoch": 10.270028466856445, + "grad_norm": 0.05909886310635823, + "learning_rate": 2.2252364603049425e-06, + "loss": 0.0009, + "step": 25254 + }, + { + "epoch": 10.27043513623424, + "grad_norm": 0.7720003345146402, + "learning_rate": 2.224599293820945e-06, + "loss": 0.0067, + "step": 25255 + }, + { + "epoch": 10.270841805612037, + 
"grad_norm": 0.03498737617137877, + "learning_rate": 2.2239622071553493e-06, + "loss": 0.0003, + "step": 25256 + }, + { + "epoch": 10.271248474989834, + "grad_norm": 0.06493755831376136, + "learning_rate": 2.2233252003146942e-06, + "loss": 0.0007, + "step": 25257 + }, + { + "epoch": 10.27165514436763, + "grad_norm": 0.002972322310934576, + "learning_rate": 2.2226882733055197e-06, + "loss": 0.0, + "step": 25258 + }, + { + "epoch": 10.272061813745426, + "grad_norm": 0.7308583139270439, + "learning_rate": 2.222051426134365e-06, + "loss": 0.0059, + "step": 25259 + }, + { + "epoch": 10.272468483123221, + "grad_norm": 0.8116794321417745, + "learning_rate": 2.2214146588077667e-06, + "loss": 0.0088, + "step": 25260 + }, + { + "epoch": 10.272875152501017, + "grad_norm": 2.623241457908159, + "learning_rate": 2.2207779713322586e-06, + "loss": 0.0582, + "step": 25261 + }, + { + "epoch": 10.273281821878813, + "grad_norm": 0.013594119203121865, + "learning_rate": 2.2201413637143833e-06, + "loss": 0.0001, + "step": 25262 + }, + { + "epoch": 10.273688491256609, + "grad_norm": 0.03685785235248619, + "learning_rate": 2.219504835960672e-06, + "loss": 0.0003, + "step": 25263 + }, + { + "epoch": 10.274095160634404, + "grad_norm": 0.00035581591502687463, + "learning_rate": 2.2188683880776595e-06, + "loss": 0.0, + "step": 25264 + }, + { + "epoch": 10.2745018300122, + "grad_norm": 0.9960533473408694, + "learning_rate": 2.2182320200718787e-06, + "loss": 0.0076, + "step": 25265 + }, + { + "epoch": 10.274908499389996, + "grad_norm": 0.3341992271706582, + "learning_rate": 2.217595731949863e-06, + "loss": 0.0026, + "step": 25266 + }, + { + "epoch": 10.275315168767792, + "grad_norm": 0.0313566531139048, + "learning_rate": 2.216959523718142e-06, + "loss": 0.0002, + "step": 25267 + }, + { + "epoch": 10.275721838145587, + "grad_norm": 0.00825614039550946, + "learning_rate": 2.2163233953832508e-06, + "loss": 0.0001, + "step": 25268 + }, + { + "epoch": 10.276128507523383, + "grad_norm": 0.24391639950829686, + "learning_rate": 2.215687346951717e-06, + "loss": 0.0023, + "step": 25269 + }, + { + "epoch": 10.276535176901179, + "grad_norm": 0.017894104007211017, + "learning_rate": 2.215051378430071e-06, + "loss": 0.0002, + "step": 25270 + }, + { + "epoch": 10.276941846278975, + "grad_norm": 0.004528750715582923, + "learning_rate": 2.2144154898248404e-06, + "loss": 0.0001, + "step": 25271 + }, + { + "epoch": 10.27734851565677, + "grad_norm": 0.05466795697794829, + "learning_rate": 2.213779681142553e-06, + "loss": 0.0006, + "step": 25272 + }, + { + "epoch": 10.277755185034566, + "grad_norm": 0.2151727966160636, + "learning_rate": 2.2131439523897358e-06, + "loss": 0.0014, + "step": 25273 + }, + { + "epoch": 10.278161854412362, + "grad_norm": 0.03449933477782099, + "learning_rate": 2.212508303572912e-06, + "loss": 0.0004, + "step": 25274 + }, + { + "epoch": 10.278568523790158, + "grad_norm": 0.13679014674934717, + "learning_rate": 2.2118727346986136e-06, + "loss": 0.0013, + "step": 25275 + }, + { + "epoch": 10.278975193167954, + "grad_norm": 0.10494485541432559, + "learning_rate": 2.21123724577336e-06, + "loss": 0.0007, + "step": 25276 + }, + { + "epoch": 10.279381862545751, + "grad_norm": 0.006371601947509781, + "learning_rate": 2.2106018368036775e-06, + "loss": 0.0001, + "step": 25277 + }, + { + "epoch": 10.279788531923547, + "grad_norm": 0.028039103121526476, + "learning_rate": 2.209966507796086e-06, + "loss": 0.0002, + "step": 25278 + }, + { + "epoch": 10.280195201301343, + "grad_norm": 0.015374528187243747, + "learning_rate": 
2.2093312587571105e-06, + "loss": 0.0002, + "step": 25279 + }, + { + "epoch": 10.280601870679138, + "grad_norm": 0.037263808750656484, + "learning_rate": 2.208696089693266e-06, + "loss": 0.0004, + "step": 25280 + }, + { + "epoch": 10.281008540056934, + "grad_norm": 0.0012298624652613856, + "learning_rate": 2.2080610006110814e-06, + "loss": 0.0, + "step": 25281 + }, + { + "epoch": 10.28141520943473, + "grad_norm": 0.004391211611576875, + "learning_rate": 2.2074259915170727e-06, + "loss": 0.0, + "step": 25282 + }, + { + "epoch": 10.281821878812526, + "grad_norm": 0.6168449269647455, + "learning_rate": 2.206791062417757e-06, + "loss": 0.0043, + "step": 25283 + }, + { + "epoch": 10.282228548190322, + "grad_norm": 0.010001170307126052, + "learning_rate": 2.2061562133196533e-06, + "loss": 0.0001, + "step": 25284 + }, + { + "epoch": 10.282635217568117, + "grad_norm": 0.24532638192405853, + "learning_rate": 2.2055214442292793e-06, + "loss": 0.0029, + "step": 25285 + }, + { + "epoch": 10.283041886945913, + "grad_norm": 0.17416792802607936, + "learning_rate": 2.2048867551531505e-06, + "loss": 0.0014, + "step": 25286 + }, + { + "epoch": 10.283448556323709, + "grad_norm": 0.06938198600701619, + "learning_rate": 2.2042521460977794e-06, + "loss": 0.0007, + "step": 25287 + }, + { + "epoch": 10.283855225701505, + "grad_norm": 0.033926339648886264, + "learning_rate": 2.2036176170696865e-06, + "loss": 0.0004, + "step": 25288 + }, + { + "epoch": 10.2842618950793, + "grad_norm": 1.2495261360032115, + "learning_rate": 2.202983168075382e-06, + "loss": 0.01, + "step": 25289 + }, + { + "epoch": 10.284668564457096, + "grad_norm": 0.2365448569727257, + "learning_rate": 2.20234879912138e-06, + "loss": 0.0013, + "step": 25290 + }, + { + "epoch": 10.285075233834892, + "grad_norm": 3.7034234960785524, + "learning_rate": 2.2017145102141913e-06, + "loss": 0.0357, + "step": 25291 + }, + { + "epoch": 10.285481903212688, + "grad_norm": 0.07119503195097425, + "learning_rate": 2.2010803013603287e-06, + "loss": 0.0006, + "step": 25292 + }, + { + "epoch": 10.285888572590483, + "grad_norm": 0.25908970788846647, + "learning_rate": 2.2004461725662995e-06, + "loss": 0.0022, + "step": 25293 + }, + { + "epoch": 10.28629524196828, + "grad_norm": 0.105960295433843, + "learning_rate": 2.1998121238386174e-06, + "loss": 0.001, + "step": 25294 + }, + { + "epoch": 10.286701911346075, + "grad_norm": 0.0032368321654628615, + "learning_rate": 2.199178155183791e-06, + "loss": 0.0, + "step": 25295 + }, + { + "epoch": 10.28710858072387, + "grad_norm": 0.1332556175677377, + "learning_rate": 2.1985442666083256e-06, + "loss": 0.0016, + "step": 25296 + }, + { + "epoch": 10.287515250101666, + "grad_norm": 0.1327912379978662, + "learning_rate": 2.1979104581187305e-06, + "loss": 0.0019, + "step": 25297 + }, + { + "epoch": 10.287921919479464, + "grad_norm": 0.019648340192200683, + "learning_rate": 2.19727672972151e-06, + "loss": 0.0002, + "step": 25298 + }, + { + "epoch": 10.28832858885726, + "grad_norm": 0.36530586943157006, + "learning_rate": 2.196643081423172e-06, + "loss": 0.0021, + "step": 25299 + }, + { + "epoch": 10.288735258235056, + "grad_norm": 0.0007849854736622653, + "learning_rate": 2.196009513230217e-06, + "loss": 0.0, + "step": 25300 + }, + { + "epoch": 10.289141927612851, + "grad_norm": 0.07367662901247884, + "learning_rate": 2.195376025149156e-06, + "loss": 0.0008, + "step": 25301 + }, + { + "epoch": 10.289548596990647, + "grad_norm": 0.7283659780421129, + "learning_rate": 2.1947426171864873e-06, + "loss": 0.0069, + "step": 25302 + }, + 
{ + "epoch": 10.289955266368443, + "grad_norm": 0.002597482835003452, + "learning_rate": 2.1941092893487137e-06, + "loss": 0.0, + "step": 25303 + }, + { + "epoch": 10.290361935746239, + "grad_norm": 0.2808919129726178, + "learning_rate": 2.193476041642336e-06, + "loss": 0.002, + "step": 25304 + }, + { + "epoch": 10.290768605124034, + "grad_norm": 0.03851179055093317, + "learning_rate": 2.1928428740738573e-06, + "loss": 0.0003, + "step": 25305 + }, + { + "epoch": 10.29117527450183, + "grad_norm": 0.23631684203475609, + "learning_rate": 2.1922097866497718e-06, + "loss": 0.0029, + "step": 25306 + }, + { + "epoch": 10.291581943879626, + "grad_norm": 0.11820149102925528, + "learning_rate": 2.1915767793765862e-06, + "loss": 0.0009, + "step": 25307 + }, + { + "epoch": 10.291988613257422, + "grad_norm": 1.2576912199419812, + "learning_rate": 2.190943852260794e-06, + "loss": 0.0152, + "step": 25308 + }, + { + "epoch": 10.292395282635217, + "grad_norm": 0.0027921124064802317, + "learning_rate": 2.190311005308894e-06, + "loss": 0.0, + "step": 25309 + }, + { + "epoch": 10.292801952013013, + "grad_norm": 0.0009195516077007969, + "learning_rate": 2.189678238527382e-06, + "loss": 0.0, + "step": 25310 + }, + { + "epoch": 10.293208621390809, + "grad_norm": 0.045222240797388645, + "learning_rate": 2.1890455519227525e-06, + "loss": 0.0003, + "step": 25311 + }, + { + "epoch": 10.293615290768605, + "grad_norm": 0.030105132275690075, + "learning_rate": 2.1884129455015023e-06, + "loss": 0.0003, + "step": 25312 + }, + { + "epoch": 10.2940219601464, + "grad_norm": 0.027450619568741532, + "learning_rate": 2.1877804192701225e-06, + "loss": 0.0002, + "step": 25313 + }, + { + "epoch": 10.294428629524196, + "grad_norm": 0.004721951605460137, + "learning_rate": 2.187147973235111e-06, + "loss": 0.0001, + "step": 25314 + }, + { + "epoch": 10.294835298901992, + "grad_norm": 1.5196677020997538, + "learning_rate": 2.1865156074029583e-06, + "loss": 0.0149, + "step": 25315 + }, + { + "epoch": 10.295241968279788, + "grad_norm": 0.005443596585192108, + "learning_rate": 2.1858833217801547e-06, + "loss": 0.0001, + "step": 25316 + }, + { + "epoch": 10.295648637657584, + "grad_norm": 0.004371531830455804, + "learning_rate": 2.1852511163731915e-06, + "loss": 0.0, + "step": 25317 + }, + { + "epoch": 10.296055307035381, + "grad_norm": 0.004297964520429802, + "learning_rate": 2.1846189911885586e-06, + "loss": 0.0001, + "step": 25318 + }, + { + "epoch": 10.296461976413177, + "grad_norm": 0.026351722633845664, + "learning_rate": 2.1839869462327423e-06, + "loss": 0.0003, + "step": 25319 + }, + { + "epoch": 10.296868645790973, + "grad_norm": 0.010631077061399484, + "learning_rate": 2.1833549815122366e-06, + "loss": 0.0001, + "step": 25320 + }, + { + "epoch": 10.297275315168768, + "grad_norm": 0.012794943772854206, + "learning_rate": 2.1827230970335257e-06, + "loss": 0.0001, + "step": 25321 + }, + { + "epoch": 10.297681984546564, + "grad_norm": 0.09529247665247213, + "learning_rate": 2.1820912928030978e-06, + "loss": 0.0008, + "step": 25322 + }, + { + "epoch": 10.29808865392436, + "grad_norm": 0.07810980000922656, + "learning_rate": 2.1814595688274365e-06, + "loss": 0.0007, + "step": 25323 + }, + { + "epoch": 10.298495323302156, + "grad_norm": 0.005430834161248904, + "learning_rate": 2.180827925113027e-06, + "loss": 0.0001, + "step": 25324 + }, + { + "epoch": 10.298901992679951, + "grad_norm": 0.027002948498327073, + "learning_rate": 2.180196361666356e-06, + "loss": 0.0003, + "step": 25325 + }, + { + "epoch": 10.299308662057747, + 
"grad_norm": 0.047921781397463566, + "learning_rate": 2.1795648784939004e-06, + "loss": 0.0004, + "step": 25326 + }, + { + "epoch": 10.299715331435543, + "grad_norm": 0.0643514530835669, + "learning_rate": 2.178933475602152e-06, + "loss": 0.0008, + "step": 25327 + }, + { + "epoch": 10.300122000813339, + "grad_norm": 14.786988724566658, + "learning_rate": 2.1783021529975856e-06, + "loss": 0.4364, + "step": 25328 + }, + { + "epoch": 10.300528670191134, + "grad_norm": 0.040703956715268046, + "learning_rate": 2.1776709106866854e-06, + "loss": 0.0003, + "step": 25329 + }, + { + "epoch": 10.30093533956893, + "grad_norm": 0.003446654414442295, + "learning_rate": 2.17703974867593e-06, + "loss": 0.0, + "step": 25330 + }, + { + "epoch": 10.301342008946726, + "grad_norm": 0.0005338481815971338, + "learning_rate": 2.1764086669717987e-06, + "loss": 0.0, + "step": 25331 + }, + { + "epoch": 10.301748678324522, + "grad_norm": 0.02605303069056891, + "learning_rate": 2.175777665580767e-06, + "loss": 0.0003, + "step": 25332 + }, + { + "epoch": 10.302155347702318, + "grad_norm": 0.09454478571439827, + "learning_rate": 2.1751467445093187e-06, + "loss": 0.0011, + "step": 25333 + }, + { + "epoch": 10.302562017080113, + "grad_norm": 0.059260138257962995, + "learning_rate": 2.1745159037639275e-06, + "loss": 0.0008, + "step": 25334 + }, + { + "epoch": 10.302968686457909, + "grad_norm": 0.06880484444404174, + "learning_rate": 2.1738851433510677e-06, + "loss": 0.0008, + "step": 25335 + }, + { + "epoch": 10.303375355835705, + "grad_norm": 0.00042235521141668957, + "learning_rate": 2.1732544632772166e-06, + "loss": 0.0, + "step": 25336 + }, + { + "epoch": 10.3037820252135, + "grad_norm": 0.003054206312889654, + "learning_rate": 2.1726238635488507e-06, + "loss": 0.0, + "step": 25337 + }, + { + "epoch": 10.304188694591296, + "grad_norm": 0.019063526552775087, + "learning_rate": 2.171993344172434e-06, + "loss": 0.0001, + "step": 25338 + }, + { + "epoch": 10.304595363969094, + "grad_norm": 0.012811617476458915, + "learning_rate": 2.171362905154448e-06, + "loss": 0.0002, + "step": 25339 + }, + { + "epoch": 10.30500203334689, + "grad_norm": 0.2725542256043014, + "learning_rate": 2.170732546501363e-06, + "loss": 0.0021, + "step": 25340 + }, + { + "epoch": 10.305408702724685, + "grad_norm": 0.4896020127648621, + "learning_rate": 2.1701022682196483e-06, + "loss": 0.0043, + "step": 25341 + }, + { + "epoch": 10.305815372102481, + "grad_norm": 0.05374579570321497, + "learning_rate": 2.1694720703157733e-06, + "loss": 0.0005, + "step": 25342 + }, + { + "epoch": 10.306222041480277, + "grad_norm": 0.01205662961710722, + "learning_rate": 2.16884195279621e-06, + "loss": 0.0001, + "step": 25343 + }, + { + "epoch": 10.306628710858073, + "grad_norm": 0.025306622262606226, + "learning_rate": 2.168211915667422e-06, + "loss": 0.0003, + "step": 25344 + }, + { + "epoch": 10.307035380235869, + "grad_norm": 0.019827393800432743, + "learning_rate": 2.167581958935884e-06, + "loss": 0.0002, + "step": 25345 + }, + { + "epoch": 10.307442049613664, + "grad_norm": 0.036691494842994235, + "learning_rate": 2.166952082608058e-06, + "loss": 0.0004, + "step": 25346 + }, + { + "epoch": 10.30784871899146, + "grad_norm": 0.027293290832780866, + "learning_rate": 2.166322286690411e-06, + "loss": 0.0003, + "step": 25347 + }, + { + "epoch": 10.308255388369256, + "grad_norm": 0.05103442050006217, + "learning_rate": 2.1656925711894094e-06, + "loss": 0.0005, + "step": 25348 + }, + { + "epoch": 10.308662057747052, + "grad_norm": 0.0005957111417380598, + 
"learning_rate": 2.165062936111516e-06, + "loss": 0.0, + "step": 25349 + }, + { + "epoch": 10.309068727124847, + "grad_norm": 0.00039720981453607047, + "learning_rate": 2.1644333814631947e-06, + "loss": 0.0, + "step": 25350 + }, + { + "epoch": 10.309475396502643, + "grad_norm": 0.006796740943243928, + "learning_rate": 2.1638039072509054e-06, + "loss": 0.0001, + "step": 25351 + }, + { + "epoch": 10.309882065880439, + "grad_norm": 0.04123386410423183, + "learning_rate": 2.1631745134811156e-06, + "loss": 0.0004, + "step": 25352 + }, + { + "epoch": 10.310288735258235, + "grad_norm": 0.4878582039686515, + "learning_rate": 2.162545200160284e-06, + "loss": 0.0041, + "step": 25353 + }, + { + "epoch": 10.31069540463603, + "grad_norm": 0.019496329076123396, + "learning_rate": 2.161915967294871e-06, + "loss": 0.0002, + "step": 25354 + }, + { + "epoch": 10.311102074013826, + "grad_norm": 0.07247192192696883, + "learning_rate": 2.1612868148913347e-06, + "loss": 0.0007, + "step": 25355 + }, + { + "epoch": 10.311508743391622, + "grad_norm": 2.9103234195458936, + "learning_rate": 2.160657742956134e-06, + "loss": 0.0351, + "step": 25356 + }, + { + "epoch": 10.311915412769418, + "grad_norm": 0.6053829995217794, + "learning_rate": 2.160028751495724e-06, + "loss": 0.0079, + "step": 25357 + }, + { + "epoch": 10.312322082147213, + "grad_norm": 2.3209636218974707, + "learning_rate": 2.1593998405165684e-06, + "loss": 0.0233, + "step": 25358 + }, + { + "epoch": 10.312728751525011, + "grad_norm": 0.00047565223509527215, + "learning_rate": 2.1587710100251192e-06, + "loss": 0.0, + "step": 25359 + }, + { + "epoch": 10.313135420902807, + "grad_norm": 0.251927748228517, + "learning_rate": 2.1581422600278324e-06, + "loss": 0.003, + "step": 25360 + }, + { + "epoch": 10.313542090280603, + "grad_norm": 0.001795048085287431, + "learning_rate": 2.1575135905311617e-06, + "loss": 0.0, + "step": 25361 + }, + { + "epoch": 10.313948759658398, + "grad_norm": 0.0008807265989460043, + "learning_rate": 2.156885001541561e-06, + "loss": 0.0, + "step": 25362 + }, + { + "epoch": 10.314355429036194, + "grad_norm": 0.4767852139599732, + "learning_rate": 2.156256493065483e-06, + "loss": 0.0063, + "step": 25363 + }, + { + "epoch": 10.31476209841399, + "grad_norm": 0.029184343825406488, + "learning_rate": 2.1556280651093763e-06, + "loss": 0.0003, + "step": 25364 + }, + { + "epoch": 10.315168767791786, + "grad_norm": 0.11493337844762462, + "learning_rate": 2.1549997176797e-06, + "loss": 0.001, + "step": 25365 + }, + { + "epoch": 10.315575437169581, + "grad_norm": 0.041462041672292206, + "learning_rate": 2.1543714507828982e-06, + "loss": 0.0008, + "step": 25366 + }, + { + "epoch": 10.315982106547377, + "grad_norm": 0.011688497314491229, + "learning_rate": 2.153743264425423e-06, + "loss": 0.0001, + "step": 25367 + }, + { + "epoch": 10.316388775925173, + "grad_norm": 0.24596163175499516, + "learning_rate": 2.1531151586137223e-06, + "loss": 0.003, + "step": 25368 + }, + { + "epoch": 10.316795445302969, + "grad_norm": 0.029171352087957802, + "learning_rate": 2.152487133354243e-06, + "loss": 0.0003, + "step": 25369 + }, + { + "epoch": 10.317202114680764, + "grad_norm": 0.0016695044805671146, + "learning_rate": 2.1518591886534303e-06, + "loss": 0.0, + "step": 25370 + }, + { + "epoch": 10.31760878405856, + "grad_norm": 1.8870239490701883, + "learning_rate": 2.151231324517736e-06, + "loss": 0.0368, + "step": 25371 + }, + { + "epoch": 10.318015453436356, + "grad_norm": 0.02174954649241752, + "learning_rate": 2.150603540953602e-06, + "loss": 0.0002, + 
"step": 25372 + }, + { + "epoch": 10.318422122814152, + "grad_norm": 0.13830355987232848, + "learning_rate": 2.149975837967474e-06, + "loss": 0.0013, + "step": 25373 + }, + { + "epoch": 10.318828792191947, + "grad_norm": 0.036294834560030034, + "learning_rate": 2.1493482155657943e-06, + "loss": 0.0003, + "step": 25374 + }, + { + "epoch": 10.319235461569743, + "grad_norm": 0.057858102276047874, + "learning_rate": 2.148720673755006e-06, + "loss": 0.0006, + "step": 25375 + }, + { + "epoch": 10.319642130947539, + "grad_norm": 0.012469913781867174, + "learning_rate": 2.148093212541552e-06, + "loss": 0.0002, + "step": 25376 + }, + { + "epoch": 10.320048800325335, + "grad_norm": 0.1166697423697891, + "learning_rate": 2.1474658319318697e-06, + "loss": 0.0014, + "step": 25377 + }, + { + "epoch": 10.32045546970313, + "grad_norm": 0.3222876513243654, + "learning_rate": 2.146838531932406e-06, + "loss": 0.0032, + "step": 25378 + }, + { + "epoch": 10.320862139080926, + "grad_norm": 0.00014083350053192993, + "learning_rate": 2.1462113125495963e-06, + "loss": 0.0, + "step": 25379 + }, + { + "epoch": 10.321268808458724, + "grad_norm": 0.014198060508210176, + "learning_rate": 2.145584173789881e-06, + "loss": 0.0001, + "step": 25380 + }, + { + "epoch": 10.32167547783652, + "grad_norm": 0.004330129692270328, + "learning_rate": 2.144957115659697e-06, + "loss": 0.0, + "step": 25381 + }, + { + "epoch": 10.322082147214315, + "grad_norm": 1.1636494080427764, + "learning_rate": 2.1443301381654812e-06, + "loss": 0.0107, + "step": 25382 + }, + { + "epoch": 10.322488816592111, + "grad_norm": 0.02120148834199892, + "learning_rate": 2.1437032413136683e-06, + "loss": 0.0002, + "step": 25383 + }, + { + "epoch": 10.322895485969907, + "grad_norm": 0.7698082711190138, + "learning_rate": 2.1430764251106973e-06, + "loss": 0.0071, + "step": 25384 + }, + { + "epoch": 10.323302155347703, + "grad_norm": 0.0005707507958389447, + "learning_rate": 2.1424496895630022e-06, + "loss": 0.0, + "step": 25385 + }, + { + "epoch": 10.323708824725498, + "grad_norm": 0.01418363963392113, + "learning_rate": 2.141823034677015e-06, + "loss": 0.0001, + "step": 25386 + }, + { + "epoch": 10.324115494103294, + "grad_norm": 0.16823286183994965, + "learning_rate": 2.1411964604591696e-06, + "loss": 0.0019, + "step": 25387 + }, + { + "epoch": 10.32452216348109, + "grad_norm": 0.001477127521273913, + "learning_rate": 2.1405699669158976e-06, + "loss": 0.0, + "step": 25388 + }, + { + "epoch": 10.324928832858886, + "grad_norm": 0.06346062715735111, + "learning_rate": 2.1399435540536306e-06, + "loss": 0.0006, + "step": 25389 + }, + { + "epoch": 10.325335502236682, + "grad_norm": 0.017505935838367872, + "learning_rate": 2.139317221878796e-06, + "loss": 0.0001, + "step": 25390 + }, + { + "epoch": 10.325742171614477, + "grad_norm": 2.000460289421081, + "learning_rate": 2.1386909703978285e-06, + "loss": 0.024, + "step": 25391 + }, + { + "epoch": 10.326148840992273, + "grad_norm": 0.14322184537069338, + "learning_rate": 2.1380647996171556e-06, + "loss": 0.0014, + "step": 25392 + }, + { + "epoch": 10.326555510370069, + "grad_norm": 0.012545933833250604, + "learning_rate": 2.1374387095432037e-06, + "loss": 0.0001, + "step": 25393 + }, + { + "epoch": 10.326962179747865, + "grad_norm": 0.00627219580906164, + "learning_rate": 2.1368127001824013e-06, + "loss": 0.0, + "step": 25394 + }, + { + "epoch": 10.32736884912566, + "grad_norm": 0.008341174170772063, + "learning_rate": 2.136186771541173e-06, + "loss": 0.0001, + "step": 25395 + }, + { + "epoch": 10.327775518503456, + 
"grad_norm": 8.2580531148181e-05, + "learning_rate": 2.1355609236259435e-06, + "loss": 0.0, + "step": 25396 + }, + { + "epoch": 10.328182187881252, + "grad_norm": 0.0812127378516442, + "learning_rate": 2.134935156443142e-06, + "loss": 0.0011, + "step": 25397 + }, + { + "epoch": 10.328588857259048, + "grad_norm": 0.011252846583769465, + "learning_rate": 2.1343094699991897e-06, + "loss": 0.0002, + "step": 25398 + }, + { + "epoch": 10.328995526636843, + "grad_norm": 0.00670082689728963, + "learning_rate": 2.1336838643005086e-06, + "loss": 0.0001, + "step": 25399 + }, + { + "epoch": 10.329402196014641, + "grad_norm": 0.009967238152536294, + "learning_rate": 2.133058339353522e-06, + "loss": 0.0001, + "step": 25400 + }, + { + "epoch": 10.329808865392437, + "grad_norm": 0.060012677051067075, + "learning_rate": 2.13243289516465e-06, + "loss": 0.0006, + "step": 25401 + }, + { + "epoch": 10.330215534770232, + "grad_norm": 0.021494623928589234, + "learning_rate": 2.1318075317403152e-06, + "loss": 0.0002, + "step": 25402 + }, + { + "epoch": 10.330622204148028, + "grad_norm": 0.0927839103376895, + "learning_rate": 2.1311822490869326e-06, + "loss": 0.0009, + "step": 25403 + }, + { + "epoch": 10.331028873525824, + "grad_norm": 0.0634215263764939, + "learning_rate": 2.1305570472109283e-06, + "loss": 0.0004, + "step": 25404 + }, + { + "epoch": 10.33143554290362, + "grad_norm": 0.3660759852026396, + "learning_rate": 2.1299319261187156e-06, + "loss": 0.0034, + "step": 25405 + }, + { + "epoch": 10.331842212281416, + "grad_norm": 0.03189850108342935, + "learning_rate": 2.1293068858167134e-06, + "loss": 0.0002, + "step": 25406 + }, + { + "epoch": 10.332248881659211, + "grad_norm": 0.00707867938030564, + "learning_rate": 2.1286819263113358e-06, + "loss": 0.0, + "step": 25407 + }, + { + "epoch": 10.332655551037007, + "grad_norm": 0.0551579776219093, + "learning_rate": 2.128057047609001e-06, + "loss": 0.0005, + "step": 25408 + }, + { + "epoch": 10.333062220414803, + "grad_norm": 0.17590682090271448, + "learning_rate": 2.127432249716119e-06, + "loss": 0.0009, + "step": 25409 + }, + { + "epoch": 10.333468889792599, + "grad_norm": 0.003568760814603508, + "learning_rate": 2.1268075326391104e-06, + "loss": 0.0, + "step": 25410 + }, + { + "epoch": 10.333875559170394, + "grad_norm": 0.18670576410744513, + "learning_rate": 2.1261828963843843e-06, + "loss": 0.0017, + "step": 25411 + }, + { + "epoch": 10.33428222854819, + "grad_norm": 0.010128539483521137, + "learning_rate": 2.1255583409583548e-06, + "loss": 0.0001, + "step": 25412 + }, + { + "epoch": 10.334688897925986, + "grad_norm": 1.7976531339358925, + "learning_rate": 2.1249338663674315e-06, + "loss": 0.0078, + "step": 25413 + }, + { + "epoch": 10.335095567303782, + "grad_norm": 0.020005778842637705, + "learning_rate": 2.1243094726180247e-06, + "loss": 0.0001, + "step": 25414 + }, + { + "epoch": 10.335502236681577, + "grad_norm": 0.01381716436311868, + "learning_rate": 2.123685159716545e-06, + "loss": 0.0001, + "step": 25415 + }, + { + "epoch": 10.335908906059373, + "grad_norm": 0.0042715877671938305, + "learning_rate": 2.1230609276694003e-06, + "loss": 0.0, + "step": 25416 + }, + { + "epoch": 10.336315575437169, + "grad_norm": 0.3670192861864432, + "learning_rate": 2.122436776483e-06, + "loss": 0.0027, + "step": 25417 + }, + { + "epoch": 10.336722244814965, + "grad_norm": 0.547236598265773, + "learning_rate": 2.121812706163753e-06, + "loss": 0.0048, + "step": 25418 + }, + { + "epoch": 10.33712891419276, + "grad_norm": 0.1968585123533119, + "learning_rate": 
2.1211887167180623e-06, + "loss": 0.0018, + "step": 25419 + }, + { + "epoch": 10.337535583570556, + "grad_norm": 0.04039953838078435, + "learning_rate": 2.1205648081523345e-06, + "loss": 0.0004, + "step": 25420 + }, + { + "epoch": 10.337942252948354, + "grad_norm": 0.0043122643074229436, + "learning_rate": 2.1199409804729754e-06, + "loss": 0.0, + "step": 25421 + }, + { + "epoch": 10.33834892232615, + "grad_norm": 0.30828900071146037, + "learning_rate": 2.119317233686384e-06, + "loss": 0.0034, + "step": 25422 + }, + { + "epoch": 10.338755591703945, + "grad_norm": 0.007264163875696998, + "learning_rate": 2.1186935677989717e-06, + "loss": 0.0001, + "step": 25423 + }, + { + "epoch": 10.339162261081741, + "grad_norm": 0.0011237734245168674, + "learning_rate": 2.118069982817136e-06, + "loss": 0.0, + "step": 25424 + }, + { + "epoch": 10.339568930459537, + "grad_norm": 0.03886461882465121, + "learning_rate": 2.1174464787472792e-06, + "loss": 0.0004, + "step": 25425 + }, + { + "epoch": 10.339975599837333, + "grad_norm": 0.003070735679000218, + "learning_rate": 2.1168230555958014e-06, + "loss": 0.0, + "step": 25426 + }, + { + "epoch": 10.340382269215128, + "grad_norm": 2.272214189408855, + "learning_rate": 2.1161997133691013e-06, + "loss": 0.0255, + "step": 25427 + }, + { + "epoch": 10.340788938592924, + "grad_norm": 0.08990434178491402, + "learning_rate": 2.11557645207358e-06, + "loss": 0.0006, + "step": 25428 + }, + { + "epoch": 10.34119560797072, + "grad_norm": 0.04466013301054461, + "learning_rate": 2.114953271715632e-06, + "loss": 0.0004, + "step": 25429 + }, + { + "epoch": 10.341602277348516, + "grad_norm": 0.04673044154667407, + "learning_rate": 2.1143301723016598e-06, + "loss": 0.0005, + "step": 25430 + }, + { + "epoch": 10.342008946726311, + "grad_norm": 0.14447634788546482, + "learning_rate": 2.113707153838056e-06, + "loss": 0.0011, + "step": 25431 + }, + { + "epoch": 10.342415616104107, + "grad_norm": 0.0033307018580097194, + "learning_rate": 2.1130842163312183e-06, + "loss": 0.0, + "step": 25432 + }, + { + "epoch": 10.342822285481903, + "grad_norm": 0.029156257599484287, + "learning_rate": 2.112461359787541e-06, + "loss": 0.0002, + "step": 25433 + }, + { + "epoch": 10.343228954859699, + "grad_norm": 0.009274286152220045, + "learning_rate": 2.111838584213417e-06, + "loss": 0.0001, + "step": 25434 + }, + { + "epoch": 10.343635624237494, + "grad_norm": 0.030656547122614137, + "learning_rate": 2.111215889615237e-06, + "loss": 0.0002, + "step": 25435 + }, + { + "epoch": 10.34404229361529, + "grad_norm": 0.02472471231117532, + "learning_rate": 2.1105932759994e-06, + "loss": 0.0002, + "step": 25436 + }, + { + "epoch": 10.344448962993086, + "grad_norm": 0.016122699283198424, + "learning_rate": 2.109970743372295e-06, + "loss": 0.0001, + "step": 25437 + }, + { + "epoch": 10.344855632370882, + "grad_norm": 1.0645049526563763, + "learning_rate": 2.1093482917403087e-06, + "loss": 0.008, + "step": 25438 + }, + { + "epoch": 10.345262301748678, + "grad_norm": 0.012289116560440216, + "learning_rate": 2.1087259211098345e-06, + "loss": 0.0001, + "step": 25439 + }, + { + "epoch": 10.345668971126473, + "grad_norm": 0.002393391051993031, + "learning_rate": 2.1081036314872594e-06, + "loss": 0.0, + "step": 25440 + }, + { + "epoch": 10.34607564050427, + "grad_norm": 0.012456460586407344, + "learning_rate": 2.1074814228789698e-06, + "loss": 0.0001, + "step": 25441 + }, + { + "epoch": 10.346482309882067, + "grad_norm": 0.7500543007954584, + "learning_rate": 2.106859295291359e-06, + "loss": 0.0069, + "step": 25442 
+ }, + { + "epoch": 10.346888979259862, + "grad_norm": 0.43893965691569997, + "learning_rate": 2.1062372487308093e-06, + "loss": 0.0053, + "step": 25443 + }, + { + "epoch": 10.347295648637658, + "grad_norm": 0.003363778230038794, + "learning_rate": 2.105615283203708e-06, + "loss": 0.0, + "step": 25444 + }, + { + "epoch": 10.347702318015454, + "grad_norm": 0.39925371934242504, + "learning_rate": 2.1049933987164384e-06, + "loss": 0.0033, + "step": 25445 + }, + { + "epoch": 10.34810898739325, + "grad_norm": 0.017308915372286206, + "learning_rate": 2.1043715952753853e-06, + "loss": 0.0002, + "step": 25446 + }, + { + "epoch": 10.348515656771045, + "grad_norm": 0.4103913253711121, + "learning_rate": 2.1037498728869277e-06, + "loss": 0.0023, + "step": 25447 + }, + { + "epoch": 10.348922326148841, + "grad_norm": 0.04067919844211137, + "learning_rate": 2.103128231557455e-06, + "loss": 0.0003, + "step": 25448 + }, + { + "epoch": 10.349328995526637, + "grad_norm": 0.006851380521740174, + "learning_rate": 2.102506671293346e-06, + "loss": 0.0, + "step": 25449 + }, + { + "epoch": 10.349735664904433, + "grad_norm": 0.03167360901511783, + "learning_rate": 2.101885192100981e-06, + "loss": 0.0004, + "step": 25450 + }, + { + "epoch": 10.350142334282229, + "grad_norm": 0.03707142450886089, + "learning_rate": 2.1012637939867387e-06, + "loss": 0.0003, + "step": 25451 + }, + { + "epoch": 10.350549003660024, + "grad_norm": 0.021531986535766098, + "learning_rate": 2.100642476957e-06, + "loss": 0.0002, + "step": 25452 + }, + { + "epoch": 10.35095567303782, + "grad_norm": 0.0019183181765170738, + "learning_rate": 2.100021241018141e-06, + "loss": 0.0, + "step": 25453 + }, + { + "epoch": 10.351362342415616, + "grad_norm": 0.09548277693513348, + "learning_rate": 2.099400086176537e-06, + "loss": 0.0009, + "step": 25454 + }, + { + "epoch": 10.351769011793412, + "grad_norm": 0.01179625207869764, + "learning_rate": 2.098779012438571e-06, + "loss": 0.0001, + "step": 25455 + }, + { + "epoch": 10.352175681171207, + "grad_norm": 0.05723903071100323, + "learning_rate": 2.0981580198106143e-06, + "loss": 0.0007, + "step": 25456 + }, + { + "epoch": 10.352582350549003, + "grad_norm": 0.010295230248469765, + "learning_rate": 2.0975371082990426e-06, + "loss": 0.0001, + "step": 25457 + }, + { + "epoch": 10.352989019926799, + "grad_norm": 0.015491984879986577, + "learning_rate": 2.0969162779102303e-06, + "loss": 0.0002, + "step": 25458 + }, + { + "epoch": 10.353395689304595, + "grad_norm": 0.0035415071436737723, + "learning_rate": 2.0962955286505493e-06, + "loss": 0.0, + "step": 25459 + }, + { + "epoch": 10.35380235868239, + "grad_norm": 0.0037761324547494024, + "learning_rate": 2.09567486052637e-06, + "loss": 0.0001, + "step": 25460 + }, + { + "epoch": 10.354209028060186, + "grad_norm": 0.013083005783381426, + "learning_rate": 2.0950542735440693e-06, + "loss": 0.0001, + "step": 25461 + }, + { + "epoch": 10.354615697437984, + "grad_norm": 0.020871001618929193, + "learning_rate": 2.0944337677100148e-06, + "loss": 0.0002, + "step": 25462 + }, + { + "epoch": 10.35502236681578, + "grad_norm": 0.0051874182695636, + "learning_rate": 2.093813343030576e-06, + "loss": 0.0, + "step": 25463 + }, + { + "epoch": 10.355429036193575, + "grad_norm": 0.016676191130246657, + "learning_rate": 2.0931929995121236e-06, + "loss": 0.0002, + "step": 25464 + }, + { + "epoch": 10.355835705571371, + "grad_norm": 0.02152717586118993, + "learning_rate": 2.0925727371610237e-06, + "loss": 0.0002, + "step": 25465 + }, + { + "epoch": 10.356242374949167, + "grad_norm": 
0.48355495630943407, + "learning_rate": 2.091952555983644e-06, + "loss": 0.0073, + "step": 25466 + }, + { + "epoch": 10.356649044326963, + "grad_norm": 0.0015377037508299007, + "learning_rate": 2.091332455986349e-06, + "loss": 0.0, + "step": 25467 + }, + { + "epoch": 10.357055713704758, + "grad_norm": 0.0011926942379159125, + "learning_rate": 2.0907124371755093e-06, + "loss": 0.0, + "step": 25468 + }, + { + "epoch": 10.357462383082554, + "grad_norm": 0.025131507016246387, + "learning_rate": 2.090092499557488e-06, + "loss": 0.0002, + "step": 25469 + }, + { + "epoch": 10.35786905246035, + "grad_norm": 0.016270665665571178, + "learning_rate": 2.089472643138647e-06, + "loss": 0.0002, + "step": 25470 + }, + { + "epoch": 10.358275721838146, + "grad_norm": 0.0022180891378933473, + "learning_rate": 2.0888528679253507e-06, + "loss": 0.0, + "step": 25471 + }, + { + "epoch": 10.358682391215941, + "grad_norm": 0.0006775738118468045, + "learning_rate": 2.0882331739239615e-06, + "loss": 0.0, + "step": 25472 + }, + { + "epoch": 10.359089060593737, + "grad_norm": 0.01644612185831794, + "learning_rate": 2.087613561140839e-06, + "loss": 0.0002, + "step": 25473 + }, + { + "epoch": 10.359495729971533, + "grad_norm": 0.0991200890015408, + "learning_rate": 2.0869940295823465e-06, + "loss": 0.0006, + "step": 25474 + }, + { + "epoch": 10.359902399349329, + "grad_norm": 0.057861552003510855, + "learning_rate": 2.086374579254844e-06, + "loss": 0.0007, + "step": 25475 + }, + { + "epoch": 10.360309068727124, + "grad_norm": 5.412932968421375, + "learning_rate": 2.085755210164688e-06, + "loss": 0.0625, + "step": 25476 + }, + { + "epoch": 10.36071573810492, + "grad_norm": 0.1048463888970225, + "learning_rate": 2.08513592231824e-06, + "loss": 0.0008, + "step": 25477 + }, + { + "epoch": 10.361122407482716, + "grad_norm": 0.0062394157053461815, + "learning_rate": 2.0845167157218536e-06, + "loss": 0.0001, + "step": 25478 + }, + { + "epoch": 10.361529076860512, + "grad_norm": 0.0028871935287824915, + "learning_rate": 2.083897590381887e-06, + "loss": 0.0, + "step": 25479 + }, + { + "epoch": 10.361935746238307, + "grad_norm": 0.14106014412681067, + "learning_rate": 2.0832785463046933e-06, + "loss": 0.0016, + "step": 25480 + }, + { + "epoch": 10.362342415616103, + "grad_norm": 0.00961482955288136, + "learning_rate": 2.0826595834966324e-06, + "loss": 0.0001, + "step": 25481 + }, + { + "epoch": 10.3627490849939, + "grad_norm": 0.03758842552192192, + "learning_rate": 2.0820407019640563e-06, + "loss": 0.0003, + "step": 25482 + }, + { + "epoch": 10.363155754371697, + "grad_norm": 0.005611818268808952, + "learning_rate": 2.0814219017133163e-06, + "loss": 0.0, + "step": 25483 + }, + { + "epoch": 10.363562423749492, + "grad_norm": 0.19858639586946347, + "learning_rate": 2.0808031827507668e-06, + "loss": 0.0023, + "step": 25484 + }, + { + "epoch": 10.363969093127288, + "grad_norm": 0.07673594839580931, + "learning_rate": 2.0801845450827574e-06, + "loss": 0.0006, + "step": 25485 + }, + { + "epoch": 10.364375762505084, + "grad_norm": 0.7700795949212622, + "learning_rate": 2.079565988715637e-06, + "loss": 0.0055, + "step": 25486 + }, + { + "epoch": 10.36478243188288, + "grad_norm": 0.011654429037869906, + "learning_rate": 2.0789475136557615e-06, + "loss": 0.0001, + "step": 25487 + }, + { + "epoch": 10.365189101260675, + "grad_norm": 0.11795406551303747, + "learning_rate": 2.0783291199094757e-06, + "loss": 0.0013, + "step": 25488 + }, + { + "epoch": 10.365595770638471, + "grad_norm": 0.017504172501221788, + "learning_rate": 
2.077710807483129e-06, + "loss": 0.0001, + "step": 25489 + }, + { + "epoch": 10.366002440016267, + "grad_norm": 0.009912473059083729, + "learning_rate": 2.0770925763830674e-06, + "loss": 0.0001, + "step": 25490 + }, + { + "epoch": 10.366409109394063, + "grad_norm": 0.08289250481405157, + "learning_rate": 2.0764744266156377e-06, + "loss": 0.0008, + "step": 25491 + }, + { + "epoch": 10.366815778771858, + "grad_norm": 0.014126742655849749, + "learning_rate": 2.0758563581871873e-06, + "loss": 0.0002, + "step": 25492 + }, + { + "epoch": 10.367222448149654, + "grad_norm": 0.013540260262857581, + "learning_rate": 2.075238371104056e-06, + "loss": 0.0002, + "step": 25493 + }, + { + "epoch": 10.36762911752745, + "grad_norm": 0.0020085367682000756, + "learning_rate": 2.0746204653725933e-06, + "loss": 0.0, + "step": 25494 + }, + { + "epoch": 10.368035786905246, + "grad_norm": 0.01973662348508615, + "learning_rate": 2.074002640999141e-06, + "loss": 0.0002, + "step": 25495 + }, + { + "epoch": 10.368442456283042, + "grad_norm": 0.18355130321251398, + "learning_rate": 2.0733848979900406e-06, + "loss": 0.0015, + "step": 25496 + }, + { + "epoch": 10.368849125660837, + "grad_norm": 0.04752715973669873, + "learning_rate": 2.0727672363516326e-06, + "loss": 0.0005, + "step": 25497 + }, + { + "epoch": 10.369255795038633, + "grad_norm": 0.07976121420421597, + "learning_rate": 2.07214965609026e-06, + "loss": 0.0006, + "step": 25498 + }, + { + "epoch": 10.369662464416429, + "grad_norm": 4.395173559932165, + "learning_rate": 2.071532157212258e-06, + "loss": 0.0417, + "step": 25499 + }, + { + "epoch": 10.370069133794225, + "grad_norm": 0.0020989843432319922, + "learning_rate": 2.0709147397239715e-06, + "loss": 0.0, + "step": 25500 + }, + { + "epoch": 10.37047580317202, + "grad_norm": 0.016262204537872353, + "learning_rate": 2.0702974036317357e-06, + "loss": 0.0002, + "step": 25501 + }, + { + "epoch": 10.370882472549816, + "grad_norm": 0.0124333877786343, + "learning_rate": 2.069680148941888e-06, + "loss": 0.0001, + "step": 25502 + }, + { + "epoch": 10.371289141927614, + "grad_norm": 0.4471786215335396, + "learning_rate": 2.069062975660765e-06, + "loss": 0.0012, + "step": 25503 + }, + { + "epoch": 10.37169581130541, + "grad_norm": 0.00758419072781218, + "learning_rate": 2.068445883794702e-06, + "loss": 0.0001, + "step": 25504 + }, + { + "epoch": 10.372102480683205, + "grad_norm": 0.0015728556473810994, + "learning_rate": 2.067828873350033e-06, + "loss": 0.0, + "step": 25505 + }, + { + "epoch": 10.372509150061001, + "grad_norm": 0.00038371568791848847, + "learning_rate": 2.067211944333092e-06, + "loss": 0.0, + "step": 25506 + }, + { + "epoch": 10.372915819438797, + "grad_norm": 0.04164066891247401, + "learning_rate": 2.0665950967502135e-06, + "loss": 0.0003, + "step": 25507 + }, + { + "epoch": 10.373322488816592, + "grad_norm": 2.063027909705208, + "learning_rate": 2.0659783306077306e-06, + "loss": 0.021, + "step": 25508 + }, + { + "epoch": 10.373729158194388, + "grad_norm": 0.11059169712155699, + "learning_rate": 2.065361645911973e-06, + "loss": 0.0013, + "step": 25509 + }, + { + "epoch": 10.374135827572184, + "grad_norm": 0.04052525966859531, + "learning_rate": 2.0647450426692704e-06, + "loss": 0.0004, + "step": 25510 + }, + { + "epoch": 10.37454249694998, + "grad_norm": 0.0011758331725067706, + "learning_rate": 2.0641285208859554e-06, + "loss": 0.0, + "step": 25511 + }, + { + "epoch": 10.374949166327776, + "grad_norm": 0.13247174490334748, + "learning_rate": 2.063512080568352e-06, + "loss": 0.0009, + "step": 25512 + 
}, + { + "epoch": 10.375355835705571, + "grad_norm": 0.0015787146369444377, + "learning_rate": 2.062895721722793e-06, + "loss": 0.0, + "step": 25513 + }, + { + "epoch": 10.375762505083367, + "grad_norm": 0.0029323041849114795, + "learning_rate": 2.062279444355605e-06, + "loss": 0.0, + "step": 25514 + }, + { + "epoch": 10.376169174461163, + "grad_norm": 0.020873073908030323, + "learning_rate": 2.0616632484731134e-06, + "loss": 0.0002, + "step": 25515 + }, + { + "epoch": 10.376575843838959, + "grad_norm": 0.0004882979728058804, + "learning_rate": 2.061047134081644e-06, + "loss": 0.0, + "step": 25516 + }, + { + "epoch": 10.376982513216754, + "grad_norm": 0.02938038271407196, + "learning_rate": 2.060431101187521e-06, + "loss": 0.0004, + "step": 25517 + }, + { + "epoch": 10.37738918259455, + "grad_norm": 0.032099338033650116, + "learning_rate": 2.0598151497970685e-06, + "loss": 0.0002, + "step": 25518 + }, + { + "epoch": 10.377795851972346, + "grad_norm": 0.2618486514746069, + "learning_rate": 2.059199279916607e-06, + "loss": 0.0027, + "step": 25519 + }, + { + "epoch": 10.378202521350142, + "grad_norm": 0.0037001655046651153, + "learning_rate": 2.058583491552465e-06, + "loss": 0.0, + "step": 25520 + }, + { + "epoch": 10.378609190727937, + "grad_norm": 0.9844214165549557, + "learning_rate": 2.0579677847109593e-06, + "loss": 0.0066, + "step": 25521 + }, + { + "epoch": 10.379015860105733, + "grad_norm": 0.01108579375051344, + "learning_rate": 2.0573521593984115e-06, + "loss": 0.0001, + "step": 25522 + }, + { + "epoch": 10.37942252948353, + "grad_norm": 0.013753622942828985, + "learning_rate": 2.056736615621142e-06, + "loss": 0.0001, + "step": 25523 + }, + { + "epoch": 10.379829198861326, + "grad_norm": 0.3068741509780045, + "learning_rate": 2.0561211533854687e-06, + "loss": 0.0011, + "step": 25524 + }, + { + "epoch": 10.380235868239122, + "grad_norm": 0.07375039108584446, + "learning_rate": 2.0555057726977068e-06, + "loss": 0.0007, + "step": 25525 + }, + { + "epoch": 10.380642537616918, + "grad_norm": 0.0003225268706259864, + "learning_rate": 2.0548904735641795e-06, + "loss": 0.0, + "step": 25526 + }, + { + "epoch": 10.381049206994714, + "grad_norm": 0.0034279080856589044, + "learning_rate": 2.0542752559911993e-06, + "loss": 0.0, + "step": 25527 + }, + { + "epoch": 10.38145587637251, + "grad_norm": 1.8660821077972125, + "learning_rate": 2.0536601199850834e-06, + "loss": 0.0088, + "step": 25528 + }, + { + "epoch": 10.381862545750305, + "grad_norm": 0.14457293679849811, + "learning_rate": 2.053045065552146e-06, + "loss": 0.0015, + "step": 25529 + }, + { + "epoch": 10.382269215128101, + "grad_norm": 0.004291863752155265, + "learning_rate": 2.0524300926986996e-06, + "loss": 0.0, + "step": 25530 + }, + { + "epoch": 10.382675884505897, + "grad_norm": 0.015586682805218492, + "learning_rate": 2.051815201431059e-06, + "loss": 0.0002, + "step": 25531 + }, + { + "epoch": 10.383082553883693, + "grad_norm": 0.004606515746703544, + "learning_rate": 2.0512003917555324e-06, + "loss": 0.0001, + "step": 25532 + }, + { + "epoch": 10.383489223261488, + "grad_norm": 0.23969877800036468, + "learning_rate": 2.050585663678437e-06, + "loss": 0.0021, + "step": 25533 + }, + { + "epoch": 10.383895892639284, + "grad_norm": 0.04205855621124887, + "learning_rate": 2.0499710172060815e-06, + "loss": 0.0003, + "step": 25534 + }, + { + "epoch": 10.38430256201708, + "grad_norm": 0.14056481899538423, + "learning_rate": 2.049356452344774e-06, + "loss": 0.0014, + "step": 25535 + }, + { + "epoch": 10.384709231394876, + "grad_norm": 
0.024828509352137637, + "learning_rate": 2.048741969100825e-06, + "loss": 0.0002, + "step": 25536 + }, + { + "epoch": 10.385115900772671, + "grad_norm": 0.01347565079976023, + "learning_rate": 2.04812756748054e-06, + "loss": 0.0001, + "step": 25537 + }, + { + "epoch": 10.385522570150467, + "grad_norm": 0.005426768145915102, + "learning_rate": 2.047513247490228e-06, + "loss": 0.0001, + "step": 25538 + }, + { + "epoch": 10.385929239528263, + "grad_norm": 9.777211999367474, + "learning_rate": 2.0468990091361952e-06, + "loss": 0.254, + "step": 25539 + }, + { + "epoch": 10.386335908906059, + "grad_norm": 0.04045675671548261, + "learning_rate": 2.046284852424747e-06, + "loss": 0.0004, + "step": 25540 + }, + { + "epoch": 10.386742578283854, + "grad_norm": 0.01199863351323496, + "learning_rate": 2.0456707773621874e-06, + "loss": 0.0001, + "step": 25541 + }, + { + "epoch": 10.38714924766165, + "grad_norm": 0.25661582081740314, + "learning_rate": 2.0450567839548206e-06, + "loss": 0.0035, + "step": 25542 + }, + { + "epoch": 10.387555917039446, + "grad_norm": 0.0187091243367246, + "learning_rate": 2.0444428722089494e-06, + "loss": 0.0002, + "step": 25543 + }, + { + "epoch": 10.387962586417244, + "grad_norm": 0.035150732061171905, + "learning_rate": 2.0438290421308737e-06, + "loss": 0.0002, + "step": 25544 + }, + { + "epoch": 10.38836925579504, + "grad_norm": 0.0093880041245596, + "learning_rate": 2.0432152937269e-06, + "loss": 0.0001, + "step": 25545 + }, + { + "epoch": 10.388775925172835, + "grad_norm": 0.02148274627600499, + "learning_rate": 2.0426016270033257e-06, + "loss": 0.0002, + "step": 25546 + }, + { + "epoch": 10.38918259455063, + "grad_norm": 0.0002496115919206374, + "learning_rate": 2.0419880419664504e-06, + "loss": 0.0, + "step": 25547 + }, + { + "epoch": 10.389589263928427, + "grad_norm": 0.04016344681634163, + "learning_rate": 2.0413745386225724e-06, + "loss": 0.0004, + "step": 25548 + }, + { + "epoch": 10.389995933306222, + "grad_norm": 0.006289084206674418, + "learning_rate": 2.040761116977992e-06, + "loss": 0.0001, + "step": 25549 + }, + { + "epoch": 10.390402602684018, + "grad_norm": 0.000300573958237262, + "learning_rate": 2.040147777039001e-06, + "loss": 0.0, + "step": 25550 + }, + { + "epoch": 10.390809272061814, + "grad_norm": 0.7445519201551243, + "learning_rate": 2.0395345188119008e-06, + "loss": 0.0053, + "step": 25551 + }, + { + "epoch": 10.39121594143961, + "grad_norm": 0.013777495156943605, + "learning_rate": 2.0389213423029864e-06, + "loss": 0.0002, + "step": 25552 + }, + { + "epoch": 10.391622610817405, + "grad_norm": 0.025795370061433315, + "learning_rate": 2.038308247518551e-06, + "loss": 0.0001, + "step": 25553 + }, + { + "epoch": 10.392029280195201, + "grad_norm": 0.014959952755980749, + "learning_rate": 2.037695234464888e-06, + "loss": 0.0001, + "step": 25554 + }, + { + "epoch": 10.392435949572997, + "grad_norm": 0.03425566484395524, + "learning_rate": 2.037082303148291e-06, + "loss": 0.0004, + "step": 25555 + }, + { + "epoch": 10.392842618950793, + "grad_norm": 0.03712559555989634, + "learning_rate": 2.036469453575052e-06, + "loss": 0.0003, + "step": 25556 + }, + { + "epoch": 10.393249288328589, + "grad_norm": 0.1713820112238549, + "learning_rate": 2.0358566857514594e-06, + "loss": 0.0019, + "step": 25557 + }, + { + "epoch": 10.393655957706384, + "grad_norm": 0.02041432415098877, + "learning_rate": 2.0352439996838093e-06, + "loss": 0.0002, + "step": 25558 + }, + { + "epoch": 10.39406262708418, + "grad_norm": 0.00014899752169470525, + "learning_rate": 
2.034631395378387e-06, + "loss": 0.0, + "step": 25559 + }, + { + "epoch": 10.394469296461976, + "grad_norm": 0.009180213842359935, + "learning_rate": 2.0340188728414834e-06, + "loss": 0.0001, + "step": 25560 + }, + { + "epoch": 10.394875965839772, + "grad_norm": 0.0332888749678502, + "learning_rate": 2.0334064320793847e-06, + "loss": 0.0004, + "step": 25561 + }, + { + "epoch": 10.395282635217567, + "grad_norm": 0.005723283535134118, + "learning_rate": 2.0327940730983776e-06, + "loss": 0.0, + "step": 25562 + }, + { + "epoch": 10.395689304595363, + "grad_norm": 0.0021756043855422308, + "learning_rate": 2.0321817959047476e-06, + "loss": 0.0, + "step": 25563 + }, + { + "epoch": 10.39609597397316, + "grad_norm": 0.013759009823270808, + "learning_rate": 2.0315696005047838e-06, + "loss": 0.0001, + "step": 25564 + }, + { + "epoch": 10.396502643350956, + "grad_norm": 0.0021941269958244214, + "learning_rate": 2.0309574869047687e-06, + "loss": 0.0, + "step": 25565 + }, + { + "epoch": 10.396909312728752, + "grad_norm": 0.0073392031514001475, + "learning_rate": 2.030345455110986e-06, + "loss": 0.0, + "step": 25566 + }, + { + "epoch": 10.397315982106548, + "grad_norm": 0.060350409293572715, + "learning_rate": 2.0297335051297186e-06, + "loss": 0.0003, + "step": 25567 + }, + { + "epoch": 10.397722651484344, + "grad_norm": 0.010833374559807447, + "learning_rate": 2.029121636967247e-06, + "loss": 0.0001, + "step": 25568 + }, + { + "epoch": 10.39812932086214, + "grad_norm": 0.016155476307013524, + "learning_rate": 2.0285098506298538e-06, + "loss": 0.0001, + "step": 25569 + }, + { + "epoch": 10.398535990239935, + "grad_norm": 0.000590634887249778, + "learning_rate": 2.0278981461238167e-06, + "loss": 0.0, + "step": 25570 + }, + { + "epoch": 10.398942659617731, + "grad_norm": 0.0018569508918811796, + "learning_rate": 2.0272865234554195e-06, + "loss": 0.0, + "step": 25571 + }, + { + "epoch": 10.399349328995527, + "grad_norm": 0.006827418602995968, + "learning_rate": 2.0266749826309397e-06, + "loss": 0.0001, + "step": 25572 + }, + { + "epoch": 10.399755998373323, + "grad_norm": 0.027807434351515357, + "learning_rate": 2.026063523656653e-06, + "loss": 0.0003, + "step": 25573 + }, + { + "epoch": 10.400162667751118, + "grad_norm": 0.0025538604891041723, + "learning_rate": 2.0254521465388377e-06, + "loss": 0.0, + "step": 25574 + }, + { + "epoch": 10.400569337128914, + "grad_norm": 0.06909336410528259, + "learning_rate": 2.0248408512837693e-06, + "loss": 0.0006, + "step": 25575 + }, + { + "epoch": 10.40097600650671, + "grad_norm": 0.11866902096324805, + "learning_rate": 2.024229637897721e-06, + "loss": 0.0011, + "step": 25576 + }, + { + "epoch": 10.401382675884506, + "grad_norm": 0.00023855849957254508, + "learning_rate": 2.023618506386973e-06, + "loss": 0.0, + "step": 25577 + }, + { + "epoch": 10.401789345262301, + "grad_norm": 0.19987400827059199, + "learning_rate": 2.023007456757795e-06, + "loss": 0.0029, + "step": 25578 + }, + { + "epoch": 10.402196014640097, + "grad_norm": 0.09142786076344372, + "learning_rate": 2.0223964890164606e-06, + "loss": 0.001, + "step": 25579 + }, + { + "epoch": 10.402602684017893, + "grad_norm": 0.002076982365237611, + "learning_rate": 2.021785603169241e-06, + "loss": 0.0, + "step": 25580 + }, + { + "epoch": 10.403009353395689, + "grad_norm": 0.2486923608684616, + "learning_rate": 2.0211747992224063e-06, + "loss": 0.0031, + "step": 25581 + }, + { + "epoch": 10.403416022773484, + "grad_norm": 0.00519227643489224, + "learning_rate": 2.0205640771822288e-06, + "loss": 0.0001, + "step": 
25582 + }, + { + "epoch": 10.40382269215128, + "grad_norm": 0.0035103069114009455, + "learning_rate": 2.0199534370549747e-06, + "loss": 0.0, + "step": 25583 + }, + { + "epoch": 10.404229361529076, + "grad_norm": 0.0014405706312137709, + "learning_rate": 2.019342878846917e-06, + "loss": 0.0, + "step": 25584 + }, + { + "epoch": 10.404636030906874, + "grad_norm": 5.647042443323973e-05, + "learning_rate": 2.018732402564322e-06, + "loss": 0.0, + "step": 25585 + }, + { + "epoch": 10.40504270028467, + "grad_norm": 0.005826746171610881, + "learning_rate": 2.0181220082134557e-06, + "loss": 0.0001, + "step": 25586 + }, + { + "epoch": 10.405449369662465, + "grad_norm": 0.0007531934627958373, + "learning_rate": 2.0175116958005837e-06, + "loss": 0.0, + "step": 25587 + }, + { + "epoch": 10.40585603904026, + "grad_norm": 0.01191827080535373, + "learning_rate": 2.016901465331972e-06, + "loss": 0.0002, + "step": 25588 + }, + { + "epoch": 10.406262708418057, + "grad_norm": 0.00019055126641328812, + "learning_rate": 2.016291316813881e-06, + "loss": 0.0, + "step": 25589 + }, + { + "epoch": 10.406669377795852, + "grad_norm": 0.02062061415341809, + "learning_rate": 2.0156812502525814e-06, + "loss": 0.0002, + "step": 25590 + }, + { + "epoch": 10.407076047173648, + "grad_norm": 0.3235997427200434, + "learning_rate": 2.0150712656543315e-06, + "loss": 0.0018, + "step": 25591 + }, + { + "epoch": 10.407482716551444, + "grad_norm": 0.0002599089085928204, + "learning_rate": 2.014461363025394e-06, + "loss": 0.0, + "step": 25592 + }, + { + "epoch": 10.40788938592924, + "grad_norm": 5.6316035255673985e-05, + "learning_rate": 2.0138515423720297e-06, + "loss": 0.0, + "step": 25593 + }, + { + "epoch": 10.408296055307035, + "grad_norm": 0.022167985124885936, + "learning_rate": 2.0132418037004985e-06, + "loss": 0.0002, + "step": 25594 + }, + { + "epoch": 10.408702724684831, + "grad_norm": 0.012072761679259691, + "learning_rate": 2.0126321470170608e-06, + "loss": 0.0001, + "step": 25595 + }, + { + "epoch": 10.409109394062627, + "grad_norm": 0.14271805621920247, + "learning_rate": 2.012022572327971e-06, + "loss": 0.0008, + "step": 25596 + }, + { + "epoch": 10.409516063440423, + "grad_norm": 0.28689101257120925, + "learning_rate": 2.0114130796394917e-06, + "loss": 0.0021, + "step": 25597 + }, + { + "epoch": 10.409922732818218, + "grad_norm": 0.3127828251715684, + "learning_rate": 2.010803668957878e-06, + "loss": 0.0028, + "step": 25598 + }, + { + "epoch": 10.410329402196014, + "grad_norm": 0.17214740387856925, + "learning_rate": 2.0101943402893855e-06, + "loss": 0.0012, + "step": 25599 + }, + { + "epoch": 10.41073607157381, + "grad_norm": 0.00047959095655901695, + "learning_rate": 2.0095850936402695e-06, + "loss": 0.0, + "step": 25600 + }, + { + "epoch": 10.411142740951606, + "grad_norm": 0.002232533585011172, + "learning_rate": 2.0089759290167844e-06, + "loss": 0.0, + "step": 25601 + }, + { + "epoch": 10.411549410329402, + "grad_norm": 0.05039889571280622, + "learning_rate": 2.008366846425179e-06, + "loss": 0.0005, + "step": 25602 + }, + { + "epoch": 10.411956079707197, + "grad_norm": 0.0005560580969909218, + "learning_rate": 2.007757845871714e-06, + "loss": 0.0, + "step": 25603 + }, + { + "epoch": 10.412362749084993, + "grad_norm": 0.0011597067762156585, + "learning_rate": 2.0071489273626376e-06, + "loss": 0.0, + "step": 25604 + }, + { + "epoch": 10.41276941846279, + "grad_norm": 0.11255110938157764, + "learning_rate": 2.006540090904199e-06, + "loss": 0.0012, + "step": 25605 + }, + { + "epoch": 10.413176087840586, + 
"grad_norm": 0.03497476528428609, + "learning_rate": 2.00593133650265e-06, + "loss": 0.0003, + "step": 25606 + }, + { + "epoch": 10.413582757218382, + "grad_norm": 0.20676232078399298, + "learning_rate": 2.0053226641642386e-06, + "loss": 0.0019, + "step": 25607 + }, + { + "epoch": 10.413989426596178, + "grad_norm": 0.005897824241321813, + "learning_rate": 2.004714073895214e-06, + "loss": 0.0001, + "step": 25608 + }, + { + "epoch": 10.414396095973974, + "grad_norm": 0.18254465245503182, + "learning_rate": 2.0041055657018216e-06, + "loss": 0.0012, + "step": 25609 + }, + { + "epoch": 10.41480276535177, + "grad_norm": 0.010004296729083983, + "learning_rate": 2.0034971395903115e-06, + "loss": 0.0001, + "step": 25610 + }, + { + "epoch": 10.415209434729565, + "grad_norm": 0.18606438051184165, + "learning_rate": 2.0028887955669272e-06, + "loss": 0.0018, + "step": 25611 + }, + { + "epoch": 10.415616104107361, + "grad_norm": 0.0341131873404557, + "learning_rate": 2.002280533637916e-06, + "loss": 0.0003, + "step": 25612 + }, + { + "epoch": 10.416022773485157, + "grad_norm": 0.005714183571062009, + "learning_rate": 2.0016723538095185e-06, + "loss": 0.0001, + "step": 25613 + }, + { + "epoch": 10.416429442862952, + "grad_norm": 0.002343914057275098, + "learning_rate": 2.001064256087981e-06, + "loss": 0.0, + "step": 25614 + }, + { + "epoch": 10.416836112240748, + "grad_norm": 0.012699308486905049, + "learning_rate": 2.000456240479541e-06, + "loss": 0.0002, + "step": 25615 + }, + { + "epoch": 10.417242781618544, + "grad_norm": 0.23544111148225094, + "learning_rate": 1.9998483069904475e-06, + "loss": 0.0024, + "step": 25616 + }, + { + "epoch": 10.41764945099634, + "grad_norm": 0.004056849781028039, + "learning_rate": 1.9992404556269375e-06, + "loss": 0.0, + "step": 25617 + }, + { + "epoch": 10.418056120374136, + "grad_norm": 0.00015278657634852207, + "learning_rate": 1.9986326863952497e-06, + "loss": 0.0, + "step": 25618 + }, + { + "epoch": 10.418462789751931, + "grad_norm": 0.0015782771931353674, + "learning_rate": 1.9980249993016254e-06, + "loss": 0.0, + "step": 25619 + }, + { + "epoch": 10.418869459129727, + "grad_norm": 0.11556773350050377, + "learning_rate": 1.997417394352302e-06, + "loss": 0.001, + "step": 25620 + }, + { + "epoch": 10.419276128507523, + "grad_norm": 0.04028472598995001, + "learning_rate": 1.9968098715535166e-06, + "loss": 0.0003, + "step": 25621 + }, + { + "epoch": 10.419682797885319, + "grad_norm": 0.0003629728578977653, + "learning_rate": 1.9962024309115025e-06, + "loss": 0.0, + "step": 25622 + }, + { + "epoch": 10.420089467263114, + "grad_norm": 0.23188195453669114, + "learning_rate": 1.9955950724325013e-06, + "loss": 0.002, + "step": 25623 + }, + { + "epoch": 10.42049613664091, + "grad_norm": 0.025465873318673383, + "learning_rate": 1.994987796122746e-06, + "loss": 0.0003, + "step": 25624 + }, + { + "epoch": 10.420902806018706, + "grad_norm": 1.7923589466231569, + "learning_rate": 1.9943806019884694e-06, + "loss": 0.0097, + "step": 25625 + }, + { + "epoch": 10.421309475396503, + "grad_norm": 0.000770075296067921, + "learning_rate": 1.9937734900359054e-06, + "loss": 0.0, + "step": 25626 + }, + { + "epoch": 10.4217161447743, + "grad_norm": 0.09204999180416955, + "learning_rate": 1.9931664602712853e-06, + "loss": 0.0007, + "step": 25627 + }, + { + "epoch": 10.422122814152095, + "grad_norm": 0.03401837793116494, + "learning_rate": 1.992559512700839e-06, + "loss": 0.0003, + "step": 25628 + }, + { + "epoch": 10.42252948352989, + "grad_norm": 0.02879890244252544, + "learning_rate": 
1.9919526473308015e-06, + "loss": 0.0002, + "step": 25629 + }, + { + "epoch": 10.422936152907686, + "grad_norm": 0.03927574443528712, + "learning_rate": 1.991345864167401e-06, + "loss": 0.0003, + "step": 25630 + }, + { + "epoch": 10.423342822285482, + "grad_norm": 0.01075418585881784, + "learning_rate": 1.990739163216866e-06, + "loss": 0.0001, + "step": 25631 + }, + { + "epoch": 10.423749491663278, + "grad_norm": 0.00026349360090883995, + "learning_rate": 1.9901325444854237e-06, + "loss": 0.0, + "step": 25632 + }, + { + "epoch": 10.424156161041074, + "grad_norm": 0.1909854443947826, + "learning_rate": 1.989526007979302e-06, + "loss": 0.002, + "step": 25633 + }, + { + "epoch": 10.42456283041887, + "grad_norm": 0.0004353582541330548, + "learning_rate": 1.988919553704729e-06, + "loss": 0.0, + "step": 25634 + }, + { + "epoch": 10.424969499796665, + "grad_norm": 0.01684808550998266, + "learning_rate": 1.988313181667925e-06, + "loss": 0.0002, + "step": 25635 + }, + { + "epoch": 10.425376169174461, + "grad_norm": 0.27120968720581984, + "learning_rate": 1.987706891875123e-06, + "loss": 0.0024, + "step": 25636 + }, + { + "epoch": 10.425782838552257, + "grad_norm": 0.011263664612606113, + "learning_rate": 1.9871006843325404e-06, + "loss": 0.0002, + "step": 25637 + }, + { + "epoch": 10.426189507930053, + "grad_norm": 0.029649870013451533, + "learning_rate": 1.9864945590464024e-06, + "loss": 0.0002, + "step": 25638 + }, + { + "epoch": 10.426596177307848, + "grad_norm": 0.18489707365299116, + "learning_rate": 1.98588851602293e-06, + "loss": 0.0018, + "step": 25639 + }, + { + "epoch": 10.427002846685644, + "grad_norm": 0.3026718455531794, + "learning_rate": 1.985282555268344e-06, + "loss": 0.0019, + "step": 25640 + }, + { + "epoch": 10.42740951606344, + "grad_norm": 0.08825193665704498, + "learning_rate": 1.984676676788868e-06, + "loss": 0.0008, + "step": 25641 + }, + { + "epoch": 10.427816185441236, + "grad_norm": 0.0035840076075240542, + "learning_rate": 1.9840708805907195e-06, + "loss": 0.0, + "step": 25642 + }, + { + "epoch": 10.428222854819031, + "grad_norm": 0.01809533351945599, + "learning_rate": 1.9834651666801185e-06, + "loss": 0.0002, + "step": 25643 + }, + { + "epoch": 10.428629524196827, + "grad_norm": 0.0006171013405466885, + "learning_rate": 1.982859535063282e-06, + "loss": 0.0, + "step": 25644 + }, + { + "epoch": 10.429036193574623, + "grad_norm": 0.20699931756463588, + "learning_rate": 1.9822539857464274e-06, + "loss": 0.0017, + "step": 25645 + }, + { + "epoch": 10.42944286295242, + "grad_norm": 0.002617702332499837, + "learning_rate": 1.9816485187357716e-06, + "loss": 0.0, + "step": 25646 + }, + { + "epoch": 10.429849532330216, + "grad_norm": 0.015912149916383816, + "learning_rate": 1.981043134037526e-06, + "loss": 0.0002, + "step": 25647 + }, + { + "epoch": 10.430256201708012, + "grad_norm": 0.016016383144629543, + "learning_rate": 1.980437831657911e-06, + "loss": 0.0002, + "step": 25648 + }, + { + "epoch": 10.430662871085808, + "grad_norm": 0.009325991418107561, + "learning_rate": 1.9798326116031373e-06, + "loss": 0.0001, + "step": 25649 + }, + { + "epoch": 10.431069540463604, + "grad_norm": 0.015429975884163135, + "learning_rate": 1.9792274738794193e-06, + "loss": 0.0002, + "step": 25650 + }, + { + "epoch": 10.4314762098414, + "grad_norm": 0.005325860622381026, + "learning_rate": 1.978622418492967e-06, + "loss": 0.0001, + "step": 25651 + }, + { + "epoch": 10.431882879219195, + "grad_norm": 0.08255960319999602, + "learning_rate": 1.9780174454499935e-06, + "loss": 0.0009, + "step": 
25652 + }, + { + "epoch": 10.43228954859699, + "grad_norm": 0.10146503667001619, + "learning_rate": 1.9774125547567057e-06, + "loss": 0.0008, + "step": 25653 + }, + { + "epoch": 10.432696217974787, + "grad_norm": 0.020408193767971157, + "learning_rate": 1.976807746419318e-06, + "loss": 0.0002, + "step": 25654 + }, + { + "epoch": 10.433102887352582, + "grad_norm": 0.2745866100993185, + "learning_rate": 1.9762030204440365e-06, + "loss": 0.0015, + "step": 25655 + }, + { + "epoch": 10.433509556730378, + "grad_norm": 0.1301871443788803, + "learning_rate": 1.97559837683707e-06, + "loss": 0.0014, + "step": 25656 + }, + { + "epoch": 10.433916226108174, + "grad_norm": 0.0002528753745051491, + "learning_rate": 1.9749938156046242e-06, + "loss": 0.0, + "step": 25657 + }, + { + "epoch": 10.43432289548597, + "grad_norm": 0.005641399073249348, + "learning_rate": 1.9743893367529064e-06, + "loss": 0.0001, + "step": 25658 + }, + { + "epoch": 10.434729564863765, + "grad_norm": 0.046591809316610985, + "learning_rate": 1.9737849402881204e-06, + "loss": 0.0004, + "step": 25659 + }, + { + "epoch": 10.435136234241561, + "grad_norm": 0.20508788391966892, + "learning_rate": 1.9731806262164697e-06, + "loss": 0.0024, + "step": 25660 + }, + { + "epoch": 10.435542903619357, + "grad_norm": 0.0017690968755084673, + "learning_rate": 1.972576394544161e-06, + "loss": 0.0, + "step": 25661 + }, + { + "epoch": 10.435949572997153, + "grad_norm": 0.03723174820485084, + "learning_rate": 1.971972245277397e-06, + "loss": 0.0004, + "step": 25662 + }, + { + "epoch": 10.436356242374949, + "grad_norm": 0.0034354196334591604, + "learning_rate": 1.9713681784223783e-06, + "loss": 0.0, + "step": 25663 + }, + { + "epoch": 10.436762911752744, + "grad_norm": 0.022016752398748934, + "learning_rate": 1.9707641939853063e-06, + "loss": 0.0003, + "step": 25664 + }, + { + "epoch": 10.43716958113054, + "grad_norm": 0.043069361233462305, + "learning_rate": 1.9701602919723793e-06, + "loss": 0.0004, + "step": 25665 + }, + { + "epoch": 10.437576250508336, + "grad_norm": 1.9768473564572842, + "learning_rate": 1.9695564723897963e-06, + "loss": 0.0089, + "step": 25666 + }, + { + "epoch": 10.437982919886133, + "grad_norm": 0.007762957617707522, + "learning_rate": 1.968952735243761e-06, + "loss": 0.0001, + "step": 25667 + }, + { + "epoch": 10.43838958926393, + "grad_norm": 0.004487778718862322, + "learning_rate": 1.968349080540468e-06, + "loss": 0.0, + "step": 25668 + }, + { + "epoch": 10.438796258641725, + "grad_norm": 0.03909088001296271, + "learning_rate": 1.967745508286113e-06, + "loss": 0.0003, + "step": 25669 + }, + { + "epoch": 10.43920292801952, + "grad_norm": 0.002490520260367453, + "learning_rate": 1.9671420184868927e-06, + "loss": 0.0, + "step": 25670 + }, + { + "epoch": 10.439609597397316, + "grad_norm": 0.12360234893697211, + "learning_rate": 1.966538611149004e-06, + "loss": 0.0011, + "step": 25671 + }, + { + "epoch": 10.440016266775112, + "grad_norm": 0.06613708868724119, + "learning_rate": 1.9659352862786387e-06, + "loss": 0.0006, + "step": 25672 + }, + { + "epoch": 10.440422936152908, + "grad_norm": 0.07314965325461392, + "learning_rate": 1.965332043881988e-06, + "loss": 0.0006, + "step": 25673 + }, + { + "epoch": 10.440829605530704, + "grad_norm": 0.7306175030868629, + "learning_rate": 1.9647288839652502e-06, + "loss": 0.0095, + "step": 25674 + }, + { + "epoch": 10.4412362749085, + "grad_norm": 0.36262087470347554, + "learning_rate": 1.9641258065346156e-06, + "loss": 0.0037, + "step": 25675 + }, + { + "epoch": 10.441642944286295, + 
"grad_norm": 0.0042305379704043935, + "learning_rate": 1.9635228115962733e-06, + "loss": 0.0001, + "step": 25676 + }, + { + "epoch": 10.442049613664091, + "grad_norm": 0.006704485177755527, + "learning_rate": 1.9629198991564137e-06, + "loss": 0.0001, + "step": 25677 + }, + { + "epoch": 10.442456283041887, + "grad_norm": 0.019822931944107086, + "learning_rate": 1.9623170692212266e-06, + "loss": 0.0002, + "step": 25678 + }, + { + "epoch": 10.442862952419683, + "grad_norm": 0.00458084440221169, + "learning_rate": 1.9617143217968983e-06, + "loss": 0.0, + "step": 25679 + }, + { + "epoch": 10.443269621797478, + "grad_norm": 0.0035316316483409082, + "learning_rate": 1.9611116568896193e-06, + "loss": 0.0, + "step": 25680 + }, + { + "epoch": 10.443676291175274, + "grad_norm": 0.1654042530154806, + "learning_rate": 1.9605090745055755e-06, + "loss": 0.002, + "step": 25681 + }, + { + "epoch": 10.44408296055307, + "grad_norm": 0.006540877949744923, + "learning_rate": 1.959906574650953e-06, + "loss": 0.0001, + "step": 25682 + }, + { + "epoch": 10.444489629930866, + "grad_norm": 0.001310977974916441, + "learning_rate": 1.959304157331935e-06, + "loss": 0.0, + "step": 25683 + }, + { + "epoch": 10.444896299308661, + "grad_norm": 0.007317232701619358, + "learning_rate": 1.958701822554707e-06, + "loss": 0.0001, + "step": 25684 + }, + { + "epoch": 10.445302968686457, + "grad_norm": 0.03267480983418822, + "learning_rate": 1.9580995703254524e-06, + "loss": 0.0004, + "step": 25685 + }, + { + "epoch": 10.445709638064255, + "grad_norm": 0.007582692055251192, + "learning_rate": 1.95749740065035e-06, + "loss": 0.0001, + "step": 25686 + }, + { + "epoch": 10.44611630744205, + "grad_norm": 0.008406826258326993, + "learning_rate": 1.9568953135355862e-06, + "loss": 0.0001, + "step": 25687 + }, + { + "epoch": 10.446522976819846, + "grad_norm": 0.0130902372299785, + "learning_rate": 1.9562933089873415e-06, + "loss": 0.0001, + "step": 25688 + }, + { + "epoch": 10.446929646197642, + "grad_norm": 0.001996833809078907, + "learning_rate": 1.955691387011793e-06, + "loss": 0.0, + "step": 25689 + }, + { + "epoch": 10.447336315575438, + "grad_norm": 0.012189278661086997, + "learning_rate": 1.955089547615122e-06, + "loss": 0.0001, + "step": 25690 + }, + { + "epoch": 10.447742984953234, + "grad_norm": 0.0026188932426240608, + "learning_rate": 1.9544877908035053e-06, + "loss": 0.0, + "step": 25691 + }, + { + "epoch": 10.44814965433103, + "grad_norm": 0.07249200581276315, + "learning_rate": 1.953886116583119e-06, + "loss": 0.0005, + "step": 25692 + }, + { + "epoch": 10.448556323708825, + "grad_norm": 0.032134408661594616, + "learning_rate": 1.953284524960143e-06, + "loss": 0.0002, + "step": 25693 + }, + { + "epoch": 10.44896299308662, + "grad_norm": 0.19267645340826178, + "learning_rate": 1.9526830159407506e-06, + "loss": 0.0024, + "step": 25694 + }, + { + "epoch": 10.449369662464417, + "grad_norm": 3.1712473873222393, + "learning_rate": 1.9520815895311184e-06, + "loss": 0.0256, + "step": 25695 + }, + { + "epoch": 10.449776331842212, + "grad_norm": 0.00046605692837672094, + "learning_rate": 1.951480245737418e-06, + "loss": 0.0, + "step": 25696 + }, + { + "epoch": 10.450183001220008, + "grad_norm": 0.006407234393786668, + "learning_rate": 1.950878984565825e-06, + "loss": 0.0, + "step": 25697 + }, + { + "epoch": 10.450589670597804, + "grad_norm": 0.04886259817835273, + "learning_rate": 1.950277806022509e-06, + "loss": 0.0004, + "step": 25698 + }, + { + "epoch": 10.4509963399756, + "grad_norm": 0.010032576850245912, + "learning_rate": 
1.9496767101136406e-06, + "loss": 0.0, + "step": 25699 + }, + { + "epoch": 10.451403009353395, + "grad_norm": 0.008422503975135042, + "learning_rate": 1.949075696845395e-06, + "loss": 0.0001, + "step": 25700 + }, + { + "epoch": 10.451809678731191, + "grad_norm": 0.0005937206536010808, + "learning_rate": 1.948474766223939e-06, + "loss": 0.0, + "step": 25701 + }, + { + "epoch": 10.452216348108987, + "grad_norm": 0.007260812777719242, + "learning_rate": 1.947873918255442e-06, + "loss": 0.0001, + "step": 25702 + }, + { + "epoch": 10.452623017486783, + "grad_norm": 0.08011096847086797, + "learning_rate": 1.947273152946071e-06, + "loss": 0.0007, + "step": 25703 + }, + { + "epoch": 10.453029686864578, + "grad_norm": 0.6165815197400363, + "learning_rate": 1.9466724703019946e-06, + "loss": 0.0062, + "step": 25704 + }, + { + "epoch": 10.453436356242374, + "grad_norm": 0.0027878812159119032, + "learning_rate": 1.946071870329377e-06, + "loss": 0.0, + "step": 25705 + }, + { + "epoch": 10.45384302562017, + "grad_norm": 0.026962077534239348, + "learning_rate": 1.945471353034386e-06, + "loss": 0.0002, + "step": 25706 + }, + { + "epoch": 10.454249694997966, + "grad_norm": 0.002919290960972547, + "learning_rate": 1.944870918423186e-06, + "loss": 0.0, + "step": 25707 + }, + { + "epoch": 10.454656364375763, + "grad_norm": 0.0018538699355048667, + "learning_rate": 1.94427056650194e-06, + "loss": 0.0, + "step": 25708 + }, + { + "epoch": 10.455063033753559, + "grad_norm": 0.0013261087685585331, + "learning_rate": 1.9436702972768106e-06, + "loss": 0.0, + "step": 25709 + }, + { + "epoch": 10.455469703131355, + "grad_norm": 0.004020948626171819, + "learning_rate": 1.943070110753961e-06, + "loss": 0.0, + "step": 25710 + }, + { + "epoch": 10.45587637250915, + "grad_norm": 0.013687521143429988, + "learning_rate": 1.942470006939552e-06, + "loss": 0.0002, + "step": 25711 + }, + { + "epoch": 10.456283041886946, + "grad_norm": 0.38357919388717543, + "learning_rate": 1.9418699858397394e-06, + "loss": 0.0022, + "step": 25712 + }, + { + "epoch": 10.456689711264742, + "grad_norm": 3.43675603727113, + "learning_rate": 1.9412700474606917e-06, + "loss": 0.0517, + "step": 25713 + }, + { + "epoch": 10.457096380642538, + "grad_norm": 0.021603427349553728, + "learning_rate": 1.9406701918085614e-06, + "loss": 0.0002, + "step": 25714 + }, + { + "epoch": 10.457503050020334, + "grad_norm": 0.0039363892355300245, + "learning_rate": 1.940070418889508e-06, + "loss": 0.0, + "step": 25715 + }, + { + "epoch": 10.45790971939813, + "grad_norm": 0.011853043422685249, + "learning_rate": 1.939470728709688e-06, + "loss": 0.0001, + "step": 25716 + }, + { + "epoch": 10.458316388775925, + "grad_norm": 0.12008537707918872, + "learning_rate": 1.9388711212752585e-06, + "loss": 0.0013, + "step": 25717 + }, + { + "epoch": 10.458723058153721, + "grad_norm": 0.01902820938825013, + "learning_rate": 1.9382715965923704e-06, + "loss": 0.0001, + "step": 25718 + }, + { + "epoch": 10.459129727531517, + "grad_norm": 0.010905152960708613, + "learning_rate": 1.937672154667185e-06, + "loss": 0.0001, + "step": 25719 + }, + { + "epoch": 10.459536396909312, + "grad_norm": 0.002528061656238934, + "learning_rate": 1.9370727955058533e-06, + "loss": 0.0, + "step": 25720 + }, + { + "epoch": 10.459943066287108, + "grad_norm": 0.022120454757721154, + "learning_rate": 1.936473519114526e-06, + "loss": 0.0002, + "step": 25721 + }, + { + "epoch": 10.460349735664904, + "grad_norm": 0.05841734644568369, + "learning_rate": 1.935874325499357e-06, + "loss": 0.0005, + "step": 25722 + }, + 
{ + "epoch": 10.4607564050427, + "grad_norm": 0.004800352124139479, + "learning_rate": 1.9352752146664968e-06, + "loss": 0.0001, + "step": 25723 + }, + { + "epoch": 10.461163074420496, + "grad_norm": 0.01818415280543436, + "learning_rate": 1.9346761866220954e-06, + "loss": 0.0001, + "step": 25724 + }, + { + "epoch": 10.461569743798291, + "grad_norm": 0.5208934543848284, + "learning_rate": 1.9340772413722987e-06, + "loss": 0.0067, + "step": 25725 + }, + { + "epoch": 10.461976413176087, + "grad_norm": 0.002987143333871304, + "learning_rate": 1.9334783789232616e-06, + "loss": 0.0, + "step": 25726 + }, + { + "epoch": 10.462383082553885, + "grad_norm": 1.5430179165222655, + "learning_rate": 1.932879599281129e-06, + "loss": 0.01, + "step": 25727 + }, + { + "epoch": 10.46278975193168, + "grad_norm": 0.001429256917499282, + "learning_rate": 1.932280902452047e-06, + "loss": 0.0, + "step": 25728 + }, + { + "epoch": 10.463196421309476, + "grad_norm": 0.10759475002253523, + "learning_rate": 1.931682288442163e-06, + "loss": 0.0009, + "step": 25729 + }, + { + "epoch": 10.463603090687272, + "grad_norm": 0.02295663525477934, + "learning_rate": 1.9310837572576203e-06, + "loss": 0.0001, + "step": 25730 + }, + { + "epoch": 10.464009760065068, + "grad_norm": 0.019577784094862844, + "learning_rate": 1.9304853089045605e-06, + "loss": 0.0002, + "step": 25731 + }, + { + "epoch": 10.464416429442863, + "grad_norm": 0.0010827524067410323, + "learning_rate": 1.9298869433891342e-06, + "loss": 0.0, + "step": 25732 + }, + { + "epoch": 10.46482309882066, + "grad_norm": 0.009134376994482024, + "learning_rate": 1.92928866071748e-06, + "loss": 0.0001, + "step": 25733 + }, + { + "epoch": 10.465229768198455, + "grad_norm": 0.01154900381970395, + "learning_rate": 1.928690460895738e-06, + "loss": 0.0001, + "step": 25734 + }, + { + "epoch": 10.46563643757625, + "grad_norm": 0.4126797802707759, + "learning_rate": 1.9280923439300524e-06, + "loss": 0.0053, + "step": 25735 + }, + { + "epoch": 10.466043106954046, + "grad_norm": 0.2188538835872288, + "learning_rate": 1.9274943098265608e-06, + "loss": 0.0028, + "step": 25736 + }, + { + "epoch": 10.466449776331842, + "grad_norm": 0.0052993126200200505, + "learning_rate": 1.9268963585914025e-06, + "loss": 0.0001, + "step": 25737 + }, + { + "epoch": 10.466856445709638, + "grad_norm": 0.11014575995227659, + "learning_rate": 1.926298490230717e-06, + "loss": 0.001, + "step": 25738 + }, + { + "epoch": 10.467263115087434, + "grad_norm": 0.01019515951198495, + "learning_rate": 1.9257007047506394e-06, + "loss": 0.0001, + "step": 25739 + }, + { + "epoch": 10.46766978446523, + "grad_norm": 0.15157172957713086, + "learning_rate": 1.9251030021573094e-06, + "loss": 0.0012, + "step": 25740 + }, + { + "epoch": 10.468076453843025, + "grad_norm": 0.01038790008966732, + "learning_rate": 1.9245053824568606e-06, + "loss": 0.0001, + "step": 25741 + }, + { + "epoch": 10.468483123220821, + "grad_norm": 0.005160949442199216, + "learning_rate": 1.9239078456554285e-06, + "loss": 0.0001, + "step": 25742 + }, + { + "epoch": 10.468889792598617, + "grad_norm": 0.019104865979822798, + "learning_rate": 1.9233103917591435e-06, + "loss": 0.0001, + "step": 25743 + }, + { + "epoch": 10.469296461976413, + "grad_norm": 0.019696797915203103, + "learning_rate": 1.922713020774145e-06, + "loss": 0.0002, + "step": 25744 + }, + { + "epoch": 10.469703131354208, + "grad_norm": 0.020704206578485748, + "learning_rate": 1.9221157327065632e-06, + "loss": 0.0002, + "step": 25745 + }, + { + "epoch": 10.470109800732004, + "grad_norm": 
0.00017866698095245396, + "learning_rate": 1.921518527562529e-06, + "loss": 0.0, + "step": 25746 + }, + { + "epoch": 10.4705164701098, + "grad_norm": 0.04880930326869417, + "learning_rate": 1.920921405348173e-06, + "loss": 0.0005, + "step": 25747 + }, + { + "epoch": 10.470923139487596, + "grad_norm": 0.039925133175796004, + "learning_rate": 1.9203243660696237e-06, + "loss": 0.0003, + "step": 25748 + }, + { + "epoch": 10.471329808865393, + "grad_norm": 0.00043567487435106333, + "learning_rate": 1.919727409733012e-06, + "loss": 0.0, + "step": 25749 + }, + { + "epoch": 10.471736478243189, + "grad_norm": 0.00828689119225389, + "learning_rate": 1.9191305363444635e-06, + "loss": 0.0001, + "step": 25750 + }, + { + "epoch": 10.472143147620985, + "grad_norm": 0.09401613412038368, + "learning_rate": 1.9185337459101084e-06, + "loss": 0.001, + "step": 25751 + }, + { + "epoch": 10.47254981699878, + "grad_norm": 0.05445609807159952, + "learning_rate": 1.917937038436072e-06, + "loss": 0.0006, + "step": 25752 + }, + { + "epoch": 10.472956486376576, + "grad_norm": 0.003307920400019573, + "learning_rate": 1.917340413928479e-06, + "loss": 0.0, + "step": 25753 + }, + { + "epoch": 10.473363155754372, + "grad_norm": 0.0017480616772739452, + "learning_rate": 1.9167438723934562e-06, + "loss": 0.0, + "step": 25754 + }, + { + "epoch": 10.473769825132168, + "grad_norm": 0.1630751551719291, + "learning_rate": 1.916147413837124e-06, + "loss": 0.0013, + "step": 25755 + }, + { + "epoch": 10.474176494509964, + "grad_norm": 0.050633741035325974, + "learning_rate": 1.915551038265605e-06, + "loss": 0.0004, + "step": 25756 + }, + { + "epoch": 10.47458316388776, + "grad_norm": 0.17261955119426906, + "learning_rate": 1.9149547456850272e-06, + "loss": 0.0013, + "step": 25757 + }, + { + "epoch": 10.474989833265555, + "grad_norm": 0.05238073128024766, + "learning_rate": 1.914358536101506e-06, + "loss": 0.0005, + "step": 25758 + }, + { + "epoch": 10.47539650264335, + "grad_norm": 0.03015934144035096, + "learning_rate": 1.913762409521166e-06, + "loss": 0.0003, + "step": 25759 + }, + { + "epoch": 10.475803172021147, + "grad_norm": 0.03221429905977333, + "learning_rate": 1.9131663659501232e-06, + "loss": 0.0003, + "step": 25760 + }, + { + "epoch": 10.476209841398942, + "grad_norm": 0.06760958045665127, + "learning_rate": 1.912570405394498e-06, + "loss": 0.0006, + "step": 25761 + }, + { + "epoch": 10.476616510776738, + "grad_norm": 0.005184525883240135, + "learning_rate": 1.9119745278604073e-06, + "loss": 0.0, + "step": 25762 + }, + { + "epoch": 10.477023180154534, + "grad_norm": 0.016793311634754036, + "learning_rate": 1.911378733353967e-06, + "loss": 0.0001, + "step": 25763 + }, + { + "epoch": 10.47742984953233, + "grad_norm": 0.012773415794570869, + "learning_rate": 1.9107830218812972e-06, + "loss": 0.0001, + "step": 25764 + }, + { + "epoch": 10.477836518910125, + "grad_norm": 0.026390108892936543, + "learning_rate": 1.9101873934485114e-06, + "loss": 0.0002, + "step": 25765 + }, + { + "epoch": 10.478243188287921, + "grad_norm": 0.03645379549004566, + "learning_rate": 1.9095918480617224e-06, + "loss": 0.0002, + "step": 25766 + }, + { + "epoch": 10.478649857665717, + "grad_norm": 0.005763090107474333, + "learning_rate": 1.9089963857270454e-06, + "loss": 0.0001, + "step": 25767 + }, + { + "epoch": 10.479056527043515, + "grad_norm": 0.008271068116751507, + "learning_rate": 1.908401006450593e-06, + "loss": 0.0001, + "step": 25768 + }, + { + "epoch": 10.47946319642131, + "grad_norm": 1.113246718415414, + "learning_rate": 
1.907805710238474e-06, + "loss": 0.0126, + "step": 25769 + }, + { + "epoch": 10.479869865799106, + "grad_norm": 0.018138373263789923, + "learning_rate": 1.9072104970968042e-06, + "loss": 0.0002, + "step": 25770 + }, + { + "epoch": 10.480276535176902, + "grad_norm": 0.7254460702237945, + "learning_rate": 1.9066153670316922e-06, + "loss": 0.007, + "step": 25771 + }, + { + "epoch": 10.480683204554698, + "grad_norm": 0.12170237891927974, + "learning_rate": 1.9060203200492466e-06, + "loss": 0.0015, + "step": 25772 + }, + { + "epoch": 10.481089873932493, + "grad_norm": 0.006811291606012723, + "learning_rate": 1.9054253561555757e-06, + "loss": 0.0001, + "step": 25773 + }, + { + "epoch": 10.48149654331029, + "grad_norm": 0.06830699770525553, + "learning_rate": 1.9048304753567881e-06, + "loss": 0.0007, + "step": 25774 + }, + { + "epoch": 10.481903212688085, + "grad_norm": 1.3986559985008995, + "learning_rate": 1.9042356776589898e-06, + "loss": 0.0143, + "step": 25775 + }, + { + "epoch": 10.48230988206588, + "grad_norm": 0.00046781417652892537, + "learning_rate": 1.903640963068284e-06, + "loss": 0.0, + "step": 25776 + }, + { + "epoch": 10.482716551443676, + "grad_norm": 0.028284134411907278, + "learning_rate": 1.903046331590781e-06, + "loss": 0.0002, + "step": 25777 + }, + { + "epoch": 10.483123220821472, + "grad_norm": 0.0003439133093252801, + "learning_rate": 1.9024517832325829e-06, + "loss": 0.0, + "step": 25778 + }, + { + "epoch": 10.483529890199268, + "grad_norm": 0.05876460402929893, + "learning_rate": 1.9018573179997913e-06, + "loss": 0.0003, + "step": 25779 + }, + { + "epoch": 10.483936559577064, + "grad_norm": 0.25420527365954876, + "learning_rate": 1.9012629358985114e-06, + "loss": 0.002, + "step": 25780 + }, + { + "epoch": 10.48434322895486, + "grad_norm": 5.6252991328571476, + "learning_rate": 1.9006686369348438e-06, + "loss": 0.0631, + "step": 25781 + }, + { + "epoch": 10.484749898332655, + "grad_norm": 0.0022024669665785582, + "learning_rate": 1.9000744211148847e-06, + "loss": 0.0, + "step": 25782 + }, + { + "epoch": 10.485156567710451, + "grad_norm": 0.04535877595169391, + "learning_rate": 1.8994802884447427e-06, + "loss": 0.0005, + "step": 25783 + }, + { + "epoch": 10.485563237088247, + "grad_norm": 0.001218818194052538, + "learning_rate": 1.8988862389305108e-06, + "loss": 0.0, + "step": 25784 + }, + { + "epoch": 10.485969906466043, + "grad_norm": 0.6404964247622784, + "learning_rate": 1.8982922725782904e-06, + "loss": 0.006, + "step": 25785 + }, + { + "epoch": 10.486376575843838, + "grad_norm": 0.005122378809900256, + "learning_rate": 1.897698389394176e-06, + "loss": 0.0001, + "step": 25786 + }, + { + "epoch": 10.486783245221634, + "grad_norm": 0.00217322271144124, + "learning_rate": 1.8971045893842665e-06, + "loss": 0.0, + "step": 25787 + }, + { + "epoch": 10.48718991459943, + "grad_norm": 0.1163551161383611, + "learning_rate": 1.8965108725546554e-06, + "loss": 0.0012, + "step": 25788 + }, + { + "epoch": 10.487596583977226, + "grad_norm": 0.012721130666956121, + "learning_rate": 1.8959172389114367e-06, + "loss": 0.0001, + "step": 25789 + }, + { + "epoch": 10.488003253355023, + "grad_norm": 0.20463610095379423, + "learning_rate": 1.8953236884607085e-06, + "loss": 0.0025, + "step": 25790 + }, + { + "epoch": 10.488409922732819, + "grad_norm": 0.006090648245981703, + "learning_rate": 1.8947302212085627e-06, + "loss": 0.0001, + "step": 25791 + }, + { + "epoch": 10.488816592110615, + "grad_norm": 0.005104623254067196, + "learning_rate": 1.894136837161089e-06, + "loss": 0.0, + "step": 
25792 + }, + { + "epoch": 10.48922326148841, + "grad_norm": 2.0583908409257545, + "learning_rate": 1.8935435363243816e-06, + "loss": 0.0177, + "step": 25793 + }, + { + "epoch": 10.489629930866206, + "grad_norm": 0.05953486846176465, + "learning_rate": 1.8929503187045285e-06, + "loss": 0.0002, + "step": 25794 + }, + { + "epoch": 10.490036600244002, + "grad_norm": 0.07678123672457397, + "learning_rate": 1.8923571843076182e-06, + "loss": 0.0008, + "step": 25795 + }, + { + "epoch": 10.490443269621798, + "grad_norm": 0.020960717715159557, + "learning_rate": 1.8917641331397452e-06, + "loss": 0.0002, + "step": 25796 + }, + { + "epoch": 10.490849938999594, + "grad_norm": 0.006992326964257774, + "learning_rate": 1.8911711652069942e-06, + "loss": 0.0001, + "step": 25797 + }, + { + "epoch": 10.49125660837739, + "grad_norm": 0.0007549889396049899, + "learning_rate": 1.8905782805154515e-06, + "loss": 0.0, + "step": 25798 + }, + { + "epoch": 10.491663277755185, + "grad_norm": 0.0004246587361445646, + "learning_rate": 1.8899854790712046e-06, + "loss": 0.0, + "step": 25799 + }, + { + "epoch": 10.49206994713298, + "grad_norm": 0.0008699793907041717, + "learning_rate": 1.8893927608803375e-06, + "loss": 0.0, + "step": 25800 + }, + { + "epoch": 10.492476616510777, + "grad_norm": 0.0024892252479880787, + "learning_rate": 1.8888001259489375e-06, + "loss": 0.0, + "step": 25801 + }, + { + "epoch": 10.492883285888572, + "grad_norm": 0.010719835272852044, + "learning_rate": 1.888207574283082e-06, + "loss": 0.0, + "step": 25802 + }, + { + "epoch": 10.493289955266368, + "grad_norm": 0.0039373894276698955, + "learning_rate": 1.8876151058888626e-06, + "loss": 0.0, + "step": 25803 + }, + { + "epoch": 10.493696624644164, + "grad_norm": 0.011259142639935434, + "learning_rate": 1.8870227207723557e-06, + "loss": 0.0001, + "step": 25804 + }, + { + "epoch": 10.49410329402196, + "grad_norm": 0.602862044919471, + "learning_rate": 1.8864304189396442e-06, + "loss": 0.0061, + "step": 25805 + }, + { + "epoch": 10.494509963399755, + "grad_norm": 0.02587061665771187, + "learning_rate": 1.885838200396808e-06, + "loss": 0.0003, + "step": 25806 + }, + { + "epoch": 10.494916632777551, + "grad_norm": 0.0016981636694526699, + "learning_rate": 1.885246065149927e-06, + "loss": 0.0, + "step": 25807 + }, + { + "epoch": 10.495323302155347, + "grad_norm": 0.009798073831097761, + "learning_rate": 1.884654013205076e-06, + "loss": 0.0001, + "step": 25808 + }, + { + "epoch": 10.495729971533144, + "grad_norm": 0.6040286777480132, + "learning_rate": 1.8840620445683378e-06, + "loss": 0.0064, + "step": 25809 + }, + { + "epoch": 10.49613664091094, + "grad_norm": 0.054163014867342076, + "learning_rate": 1.8834701592457883e-06, + "loss": 0.0004, + "step": 25810 + }, + { + "epoch": 10.496543310288736, + "grad_norm": 0.3196929861005163, + "learning_rate": 1.8828783572435029e-06, + "loss": 0.0032, + "step": 25811 + }, + { + "epoch": 10.496949979666532, + "grad_norm": 0.05178928666654375, + "learning_rate": 1.8822866385675553e-06, + "loss": 0.0005, + "step": 25812 + }, + { + "epoch": 10.497356649044328, + "grad_norm": 0.0026673620076108134, + "learning_rate": 1.8816950032240221e-06, + "loss": 0.0, + "step": 25813 + }, + { + "epoch": 10.497763318422123, + "grad_norm": 0.005145518789023766, + "learning_rate": 1.8811034512189752e-06, + "loss": 0.0, + "step": 25814 + }, + { + "epoch": 10.498169987799919, + "grad_norm": 0.2937014303660603, + "learning_rate": 1.880511982558485e-06, + "loss": 0.0031, + "step": 25815 + }, + { + "epoch": 10.498576657177715, + 
"grad_norm": 0.2265387004073026, + "learning_rate": 1.879920597248628e-06, + "loss": 0.0015, + "step": 25816 + }, + { + "epoch": 10.49898332655551, + "grad_norm": 0.023432914352050006, + "learning_rate": 1.8793292952954722e-06, + "loss": 0.0003, + "step": 25817 + }, + { + "epoch": 10.499389995933306, + "grad_norm": 0.20694681030859455, + "learning_rate": 1.8787380767050888e-06, + "loss": 0.002, + "step": 25818 + }, + { + "epoch": 10.499796665311102, + "grad_norm": 0.0018039186300850552, + "learning_rate": 1.8781469414835474e-06, + "loss": 0.0, + "step": 25819 + }, + { + "epoch": 10.500203334688898, + "grad_norm": 0.021322847303051142, + "learning_rate": 1.877555889636914e-06, + "loss": 0.0002, + "step": 25820 + }, + { + "epoch": 10.500610004066694, + "grad_norm": 0.002417168178489126, + "learning_rate": 1.876964921171255e-06, + "loss": 0.0, + "step": 25821 + }, + { + "epoch": 10.50101667344449, + "grad_norm": 0.19661200353026181, + "learning_rate": 1.876374036092643e-06, + "loss": 0.0017, + "step": 25822 + }, + { + "epoch": 10.501423342822285, + "grad_norm": 0.02136203028684134, + "learning_rate": 1.8757832344071392e-06, + "loss": 0.0002, + "step": 25823 + }, + { + "epoch": 10.501830012200081, + "grad_norm": 0.004425704226553424, + "learning_rate": 1.8751925161208085e-06, + "loss": 0.0, + "step": 25824 + }, + { + "epoch": 10.502236681577877, + "grad_norm": 0.0010704118104474004, + "learning_rate": 1.874601881239717e-06, + "loss": 0.0, + "step": 25825 + }, + { + "epoch": 10.502643350955672, + "grad_norm": 0.10102664626716389, + "learning_rate": 1.874011329769926e-06, + "loss": 0.0006, + "step": 25826 + }, + { + "epoch": 10.503050020333468, + "grad_norm": 1.091785216151942, + "learning_rate": 1.8734208617174986e-06, + "loss": 0.0072, + "step": 25827 + }, + { + "epoch": 10.503456689711264, + "grad_norm": 0.07839417433036017, + "learning_rate": 1.8728304770884931e-06, + "loss": 0.0009, + "step": 25828 + }, + { + "epoch": 10.50386335908906, + "grad_norm": 0.007692161447840291, + "learning_rate": 1.8722401758889763e-06, + "loss": 0.0001, + "step": 25829 + }, + { + "epoch": 10.504270028466856, + "grad_norm": 0.015215517126167158, + "learning_rate": 1.8716499581250047e-06, + "loss": 0.0002, + "step": 25830 + }, + { + "epoch": 10.504676697844653, + "grad_norm": 0.0008787707530443513, + "learning_rate": 1.8710598238026368e-06, + "loss": 0.0, + "step": 25831 + }, + { + "epoch": 10.505083367222449, + "grad_norm": 0.17733245025974004, + "learning_rate": 1.870469772927931e-06, + "loss": 0.0025, + "step": 25832 + }, + { + "epoch": 10.505490036600245, + "grad_norm": 0.0029787874544478328, + "learning_rate": 1.8698798055069446e-06, + "loss": 0.0, + "step": 25833 + }, + { + "epoch": 10.50589670597804, + "grad_norm": 0.0025336150179260807, + "learning_rate": 1.8692899215457317e-06, + "loss": 0.0, + "step": 25834 + }, + { + "epoch": 10.506303375355836, + "grad_norm": 0.0018567930968940742, + "learning_rate": 1.868700121050352e-06, + "loss": 0.0, + "step": 25835 + }, + { + "epoch": 10.506710044733632, + "grad_norm": 0.006216266487326948, + "learning_rate": 1.8681104040268605e-06, + "loss": 0.0, + "step": 25836 + }, + { + "epoch": 10.507116714111428, + "grad_norm": 0.04655667284867398, + "learning_rate": 1.8675207704813058e-06, + "loss": 0.0004, + "step": 25837 + }, + { + "epoch": 10.507523383489223, + "grad_norm": 0.16872428155108718, + "learning_rate": 1.8669312204197432e-06, + "loss": 0.0013, + "step": 25838 + }, + { + "epoch": 10.50793005286702, + "grad_norm": 0.22390440337851453, + "learning_rate": 
1.866341753848222e-06, + "loss": 0.0026, + "step": 25839 + }, + { + "epoch": 10.508336722244815, + "grad_norm": 0.02476284440017075, + "learning_rate": 1.8657523707727998e-06, + "loss": 0.0002, + "step": 25840 + }, + { + "epoch": 10.50874339162261, + "grad_norm": 0.281347325736387, + "learning_rate": 1.8651630711995228e-06, + "loss": 0.0041, + "step": 25841 + }, + { + "epoch": 10.509150061000406, + "grad_norm": 0.155523075936395, + "learning_rate": 1.8645738551344417e-06, + "loss": 0.0009, + "step": 25842 + }, + { + "epoch": 10.509556730378202, + "grad_norm": 0.008365188542015815, + "learning_rate": 1.863984722583604e-06, + "loss": 0.0001, + "step": 25843 + }, + { + "epoch": 10.509963399755998, + "grad_norm": 0.014603690643216165, + "learning_rate": 1.8633956735530578e-06, + "loss": 0.0002, + "step": 25844 + }, + { + "epoch": 10.510370069133794, + "grad_norm": 0.02799698386905214, + "learning_rate": 1.8628067080488498e-06, + "loss": 0.0003, + "step": 25845 + }, + { + "epoch": 10.51077673851159, + "grad_norm": 9.168942757576223e-05, + "learning_rate": 1.862217826077024e-06, + "loss": 0.0, + "step": 25846 + }, + { + "epoch": 10.511183407889385, + "grad_norm": 0.029476560508835785, + "learning_rate": 1.8616290276436312e-06, + "loss": 0.0002, + "step": 25847 + }, + { + "epoch": 10.511590077267181, + "grad_norm": 0.06866442156488128, + "learning_rate": 1.8610403127547116e-06, + "loss": 0.0002, + "step": 25848 + }, + { + "epoch": 10.511996746644977, + "grad_norm": 0.032458253145456105, + "learning_rate": 1.8604516814163099e-06, + "loss": 0.0002, + "step": 25849 + }, + { + "epoch": 10.512403416022774, + "grad_norm": 0.007966376413023385, + "learning_rate": 1.8598631336344675e-06, + "loss": 0.0001, + "step": 25850 + }, + { + "epoch": 10.51281008540057, + "grad_norm": 0.00011786932395860928, + "learning_rate": 1.8592746694152275e-06, + "loss": 0.0, + "step": 25851 + }, + { + "epoch": 10.513216754778366, + "grad_norm": 0.0695879685483072, + "learning_rate": 1.8586862887646273e-06, + "loss": 0.0005, + "step": 25852 + }, + { + "epoch": 10.513623424156162, + "grad_norm": 0.02552704278086821, + "learning_rate": 1.8580979916887122e-06, + "loss": 0.0001, + "step": 25853 + }, + { + "epoch": 10.514030093533957, + "grad_norm": 0.003325859850468119, + "learning_rate": 1.8575097781935192e-06, + "loss": 0.0, + "step": 25854 + }, + { + "epoch": 10.514436762911753, + "grad_norm": 0.003336209573350961, + "learning_rate": 1.856921648285087e-06, + "loss": 0.0, + "step": 25855 + }, + { + "epoch": 10.514843432289549, + "grad_norm": 0.005551236388078365, + "learning_rate": 1.8563336019694512e-06, + "loss": 0.0001, + "step": 25856 + }, + { + "epoch": 10.515250101667345, + "grad_norm": 0.020922173800753847, + "learning_rate": 1.8557456392526508e-06, + "loss": 0.0001, + "step": 25857 + }, + { + "epoch": 10.51565677104514, + "grad_norm": 0.00473604781990466, + "learning_rate": 1.855157760140719e-06, + "loss": 0.0, + "step": 25858 + }, + { + "epoch": 10.516063440422936, + "grad_norm": 0.005141942112921708, + "learning_rate": 1.8545699646396908e-06, + "loss": 0.0001, + "step": 25859 + }, + { + "epoch": 10.516470109800732, + "grad_norm": 0.027154641224346597, + "learning_rate": 1.8539822527556029e-06, + "loss": 0.0002, + "step": 25860 + }, + { + "epoch": 10.516876779178528, + "grad_norm": 0.007192102067731548, + "learning_rate": 1.8533946244944878e-06, + "loss": 0.0001, + "step": 25861 + }, + { + "epoch": 10.517283448556324, + "grad_norm": 0.012919997361258267, + "learning_rate": 1.8528070798623765e-06, + "loss": 0.0001, + 
"step": 25862 + }, + { + "epoch": 10.51769011793412, + "grad_norm": 0.045966621547033584, + "learning_rate": 1.8522196188653007e-06, + "loss": 0.0004, + "step": 25863 + }, + { + "epoch": 10.518096787311915, + "grad_norm": 0.11051337328954634, + "learning_rate": 1.8516322415092914e-06, + "loss": 0.0007, + "step": 25864 + }, + { + "epoch": 10.51850345668971, + "grad_norm": 0.0911475674726973, + "learning_rate": 1.8510449478003788e-06, + "loss": 0.0008, + "step": 25865 + }, + { + "epoch": 10.518910126067507, + "grad_norm": 5.7533481408804095, + "learning_rate": 1.8504577377445875e-06, + "loss": 0.123, + "step": 25866 + }, + { + "epoch": 10.519316795445302, + "grad_norm": 0.07937702899104342, + "learning_rate": 1.8498706113479525e-06, + "loss": 0.001, + "step": 25867 + }, + { + "epoch": 10.519723464823098, + "grad_norm": 0.0031173635563741487, + "learning_rate": 1.8492835686164968e-06, + "loss": 0.0001, + "step": 25868 + }, + { + "epoch": 10.520130134200894, + "grad_norm": 0.38613233273884096, + "learning_rate": 1.8486966095562474e-06, + "loss": 0.0014, + "step": 25869 + }, + { + "epoch": 10.52053680357869, + "grad_norm": 0.024195143782840885, + "learning_rate": 1.8481097341732312e-06, + "loss": 0.0002, + "step": 25870 + }, + { + "epoch": 10.520943472956485, + "grad_norm": 0.042793247008574495, + "learning_rate": 1.8475229424734697e-06, + "loss": 0.0002, + "step": 25871 + }, + { + "epoch": 10.521350142334283, + "grad_norm": 0.06723023421429893, + "learning_rate": 1.846936234462986e-06, + "loss": 0.0007, + "step": 25872 + }, + { + "epoch": 10.521756811712079, + "grad_norm": 0.548585969707426, + "learning_rate": 1.8463496101478085e-06, + "loss": 0.0045, + "step": 25873 + }, + { + "epoch": 10.522163481089875, + "grad_norm": 0.01950219618449061, + "learning_rate": 1.8457630695339557e-06, + "loss": 0.0001, + "step": 25874 + }, + { + "epoch": 10.52257015046767, + "grad_norm": 0.16394538786850354, + "learning_rate": 1.8451766126274483e-06, + "loss": 0.0014, + "step": 25875 + }, + { + "epoch": 10.522976819845466, + "grad_norm": 0.012268867645026034, + "learning_rate": 1.8445902394343074e-06, + "loss": 0.0001, + "step": 25876 + }, + { + "epoch": 10.523383489223262, + "grad_norm": 0.02729839949743104, + "learning_rate": 1.8440039499605532e-06, + "loss": 0.0003, + "step": 25877 + }, + { + "epoch": 10.523790158601058, + "grad_norm": 0.00946238694232315, + "learning_rate": 1.8434177442122025e-06, + "loss": 0.0001, + "step": 25878 + }, + { + "epoch": 10.524196827978853, + "grad_norm": 0.12297880453810167, + "learning_rate": 1.8428316221952713e-06, + "loss": 0.0012, + "step": 25879 + }, + { + "epoch": 10.52460349735665, + "grad_norm": 0.0023088013740034403, + "learning_rate": 1.8422455839157815e-06, + "loss": 0.0, + "step": 25880 + }, + { + "epoch": 10.525010166734445, + "grad_norm": 0.020471131298147403, + "learning_rate": 1.8416596293797461e-06, + "loss": 0.0003, + "step": 25881 + }, + { + "epoch": 10.52541683611224, + "grad_norm": 0.021685212065307415, + "learning_rate": 1.8410737585931803e-06, + "loss": 0.0003, + "step": 25882 + }, + { + "epoch": 10.525823505490036, + "grad_norm": 0.0004901487317284398, + "learning_rate": 1.8404879715621005e-06, + "loss": 0.0, + "step": 25883 + }, + { + "epoch": 10.526230174867832, + "grad_norm": 0.03285751442909156, + "learning_rate": 1.839902268292516e-06, + "loss": 0.0004, + "step": 25884 + }, + { + "epoch": 10.526636844245628, + "grad_norm": 0.029265362222202426, + "learning_rate": 1.83931664879044e-06, + "loss": 0.0003, + "step": 25885 + }, + { + "epoch": 
10.527043513623424, + "grad_norm": 0.016177909303164997, + "learning_rate": 1.838731113061889e-06, + "loss": 0.0002, + "step": 25886 + }, + { + "epoch": 10.52745018300122, + "grad_norm": 0.004364667921360342, + "learning_rate": 1.83814566111287e-06, + "loss": 0.0, + "step": 25887 + }, + { + "epoch": 10.527856852379015, + "grad_norm": 0.005942726666194734, + "learning_rate": 1.837560292949394e-06, + "loss": 0.0001, + "step": 25888 + }, + { + "epoch": 10.528263521756811, + "grad_norm": 0.014177791377598057, + "learning_rate": 1.8369750085774695e-06, + "loss": 0.0001, + "step": 25889 + }, + { + "epoch": 10.528670191134607, + "grad_norm": 0.14717978204327628, + "learning_rate": 1.836389808003104e-06, + "loss": 0.0007, + "step": 25890 + }, + { + "epoch": 10.529076860512404, + "grad_norm": 0.054833026911697946, + "learning_rate": 1.8358046912323069e-06, + "loss": 0.0006, + "step": 25891 + }, + { + "epoch": 10.5294835298902, + "grad_norm": 0.04456294595835234, + "learning_rate": 1.8352196582710802e-06, + "loss": 0.0004, + "step": 25892 + }, + { + "epoch": 10.529890199267996, + "grad_norm": 0.030540803903832486, + "learning_rate": 1.8346347091254358e-06, + "loss": 0.0003, + "step": 25893 + }, + { + "epoch": 10.530296868645792, + "grad_norm": 0.8092173501036621, + "learning_rate": 1.8340498438013766e-06, + "loss": 0.005, + "step": 25894 + }, + { + "epoch": 10.530703538023587, + "grad_norm": 0.024230585145075544, + "learning_rate": 1.8334650623049043e-06, + "loss": 0.0002, + "step": 25895 + }, + { + "epoch": 10.531110207401383, + "grad_norm": 0.02117563816284148, + "learning_rate": 1.8328803646420234e-06, + "loss": 0.0002, + "step": 25896 + }, + { + "epoch": 10.531516876779179, + "grad_norm": 0.0156334701948484, + "learning_rate": 1.8322957508187366e-06, + "loss": 0.0003, + "step": 25897 + }, + { + "epoch": 10.531923546156975, + "grad_norm": 0.009683399730805767, + "learning_rate": 1.8317112208410415e-06, + "loss": 0.0001, + "step": 25898 + }, + { + "epoch": 10.53233021553477, + "grad_norm": 0.2788647306437986, + "learning_rate": 1.8311267747149452e-06, + "loss": 0.0023, + "step": 25899 + }, + { + "epoch": 10.532736884912566, + "grad_norm": 0.009088844653238644, + "learning_rate": 1.8305424124464433e-06, + "loss": 0.0001, + "step": 25900 + }, + { + "epoch": 10.533143554290362, + "grad_norm": 0.011050612982714123, + "learning_rate": 1.8299581340415351e-06, + "loss": 0.0001, + "step": 25901 + }, + { + "epoch": 10.533550223668158, + "grad_norm": 0.019574012389664282, + "learning_rate": 1.8293739395062183e-06, + "loss": 0.0002, + "step": 25902 + }, + { + "epoch": 10.533956893045954, + "grad_norm": 0.006216940469298109, + "learning_rate": 1.8287898288464901e-06, + "loss": 0.0001, + "step": 25903 + }, + { + "epoch": 10.53436356242375, + "grad_norm": 0.00013454322096612597, + "learning_rate": 1.828205802068348e-06, + "loss": 0.0, + "step": 25904 + }, + { + "epoch": 10.534770231801545, + "grad_norm": 0.003460885443103119, + "learning_rate": 1.8276218591777828e-06, + "loss": 0.0, + "step": 25905 + }, + { + "epoch": 10.53517690117934, + "grad_norm": 0.007790895774553427, + "learning_rate": 1.8270380001807941e-06, + "loss": 0.0, + "step": 25906 + }, + { + "epoch": 10.535583570557137, + "grad_norm": 0.0007681734540085646, + "learning_rate": 1.826454225083375e-06, + "loss": 0.0, + "step": 25907 + }, + { + "epoch": 10.535990239934932, + "grad_norm": 0.007474617763370565, + "learning_rate": 1.825870533891516e-06, + "loss": 0.0001, + "step": 25908 + }, + { + "epoch": 10.536396909312728, + "grad_norm": 
0.0037996196102778124, + "learning_rate": 1.8252869266112094e-06, + "loss": 0.0, + "step": 25909 + }, + { + "epoch": 10.536803578690524, + "grad_norm": 0.02457521537383142, + "learning_rate": 1.824703403248448e-06, + "loss": 0.0002, + "step": 25910 + }, + { + "epoch": 10.53721024806832, + "grad_norm": 0.11362946348930737, + "learning_rate": 1.8241199638092177e-06, + "loss": 0.0007, + "step": 25911 + }, + { + "epoch": 10.537616917446115, + "grad_norm": 0.03426965053187618, + "learning_rate": 1.8235366082995132e-06, + "loss": 0.0003, + "step": 25912 + }, + { + "epoch": 10.538023586823913, + "grad_norm": 0.4516032710853321, + "learning_rate": 1.8229533367253205e-06, + "loss": 0.004, + "step": 25913 + }, + { + "epoch": 10.538430256201709, + "grad_norm": 0.044089895719362525, + "learning_rate": 1.8223701490926283e-06, + "loss": 0.0002, + "step": 25914 + }, + { + "epoch": 10.538836925579504, + "grad_norm": 0.019021275475133718, + "learning_rate": 1.8217870454074205e-06, + "loss": 0.0002, + "step": 25915 + }, + { + "epoch": 10.5392435949573, + "grad_norm": 0.018690061573410962, + "learning_rate": 1.8212040256756857e-06, + "loss": 0.0002, + "step": 25916 + }, + { + "epoch": 10.539650264335096, + "grad_norm": 0.047677988225747564, + "learning_rate": 1.8206210899034082e-06, + "loss": 0.0005, + "step": 25917 + }, + { + "epoch": 10.540056933712892, + "grad_norm": 0.011670406041053363, + "learning_rate": 1.8200382380965687e-06, + "loss": 0.0001, + "step": 25918 + }, + { + "epoch": 10.540463603090688, + "grad_norm": 0.5315900464139475, + "learning_rate": 1.8194554702611567e-06, + "loss": 0.0042, + "step": 25919 + }, + { + "epoch": 10.540870272468483, + "grad_norm": 0.020392376730349517, + "learning_rate": 1.8188727864031509e-06, + "loss": 0.0002, + "step": 25920 + }, + { + "epoch": 10.541276941846279, + "grad_norm": 0.02829796309632684, + "learning_rate": 1.8182901865285329e-06, + "loss": 0.0002, + "step": 25921 + }, + { + "epoch": 10.541683611224075, + "grad_norm": 0.007246139118758666, + "learning_rate": 1.817707670643285e-06, + "loss": 0.0, + "step": 25922 + }, + { + "epoch": 10.54209028060187, + "grad_norm": 0.012349502506884878, + "learning_rate": 1.8171252387533856e-06, + "loss": 0.0001, + "step": 25923 + }, + { + "epoch": 10.542496949979666, + "grad_norm": 0.005041131423443517, + "learning_rate": 1.816542890864811e-06, + "loss": 0.0, + "step": 25924 + }, + { + "epoch": 10.542903619357462, + "grad_norm": 0.5823539391504856, + "learning_rate": 1.815960626983545e-06, + "loss": 0.0039, + "step": 25925 + }, + { + "epoch": 10.543310288735258, + "grad_norm": 0.00015320486377430633, + "learning_rate": 1.8153784471155623e-06, + "loss": 0.0, + "step": 25926 + }, + { + "epoch": 10.543716958113054, + "grad_norm": 0.08999414633780986, + "learning_rate": 1.8147963512668386e-06, + "loss": 0.0011, + "step": 25927 + }, + { + "epoch": 10.54412362749085, + "grad_norm": 0.002753137778526969, + "learning_rate": 1.814214339443351e-06, + "loss": 0.0, + "step": 25928 + }, + { + "epoch": 10.544530296868645, + "grad_norm": 0.020341101524258953, + "learning_rate": 1.8136324116510717e-06, + "loss": 0.0002, + "step": 25929 + }, + { + "epoch": 10.544936966246441, + "grad_norm": 0.003157714354112501, + "learning_rate": 1.8130505678959764e-06, + "loss": 0.0, + "step": 25930 + }, + { + "epoch": 10.545343635624237, + "grad_norm": 0.02958795871319789, + "learning_rate": 1.8124688081840348e-06, + "loss": 0.0002, + "step": 25931 + }, + { + "epoch": 10.545750305002034, + "grad_norm": 0.1557938986817069, + "learning_rate": 
1.811887132521224e-06, + "loss": 0.0012, + "step": 25932 + }, + { + "epoch": 10.54615697437983, + "grad_norm": 0.003509092876587297, + "learning_rate": 1.8113055409135127e-06, + "loss": 0.0, + "step": 25933 + }, + { + "epoch": 10.546563643757626, + "grad_norm": 0.004729163483505319, + "learning_rate": 1.810724033366872e-06, + "loss": 0.0, + "step": 25934 + }, + { + "epoch": 10.546970313135422, + "grad_norm": 0.011135638614391874, + "learning_rate": 1.8101426098872699e-06, + "loss": 0.0001, + "step": 25935 + }, + { + "epoch": 10.547376982513217, + "grad_norm": 0.00540387587981877, + "learning_rate": 1.8095612704806765e-06, + "loss": 0.0001, + "step": 25936 + }, + { + "epoch": 10.547783651891013, + "grad_norm": 0.07466793800269678, + "learning_rate": 1.808980015153059e-06, + "loss": 0.0006, + "step": 25937 + }, + { + "epoch": 10.548190321268809, + "grad_norm": 0.0029445123669357962, + "learning_rate": 1.8083988439103828e-06, + "loss": 0.0, + "step": 25938 + }, + { + "epoch": 10.548596990646605, + "grad_norm": 0.02217425209879879, + "learning_rate": 1.8078177567586164e-06, + "loss": 0.0002, + "step": 25939 + }, + { + "epoch": 10.5490036600244, + "grad_norm": 0.11153894475872787, + "learning_rate": 1.8072367537037227e-06, + "loss": 0.0012, + "step": 25940 + }, + { + "epoch": 10.549410329402196, + "grad_norm": 0.010323914360231407, + "learning_rate": 1.8066558347516683e-06, + "loss": 0.0001, + "step": 25941 + }, + { + "epoch": 10.549816998779992, + "grad_norm": 0.006040523266658092, + "learning_rate": 1.8060749999084126e-06, + "loss": 0.0, + "step": 25942 + }, + { + "epoch": 10.550223668157788, + "grad_norm": 0.003816934620556254, + "learning_rate": 1.8054942491799222e-06, + "loss": 0.0, + "step": 25943 + }, + { + "epoch": 10.550630337535583, + "grad_norm": 0.0015853116999524632, + "learning_rate": 1.8049135825721587e-06, + "loss": 0.0, + "step": 25944 + }, + { + "epoch": 10.55103700691338, + "grad_norm": 0.01067344841186434, + "learning_rate": 1.804333000091082e-06, + "loss": 0.0001, + "step": 25945 + }, + { + "epoch": 10.551443676291175, + "grad_norm": 0.03676963965163854, + "learning_rate": 1.8037525017426505e-06, + "loss": 0.0002, + "step": 25946 + }, + { + "epoch": 10.55185034566897, + "grad_norm": 3.9383873457656104, + "learning_rate": 1.8031720875328263e-06, + "loss": 0.0259, + "step": 25947 + }, + { + "epoch": 10.552257015046766, + "grad_norm": 0.01226227936432269, + "learning_rate": 1.8025917574675655e-06, + "loss": 0.0002, + "step": 25948 + }, + { + "epoch": 10.552663684424562, + "grad_norm": 0.0028106743675886756, + "learning_rate": 1.8020115115528224e-06, + "loss": 0.0, + "step": 25949 + }, + { + "epoch": 10.553070353802358, + "grad_norm": 0.0012660744870767436, + "learning_rate": 1.8014313497945602e-06, + "loss": 0.0, + "step": 25950 + }, + { + "epoch": 10.553477023180154, + "grad_norm": 0.05687724953115015, + "learning_rate": 1.8008512721987326e-06, + "loss": 0.0005, + "step": 25951 + }, + { + "epoch": 10.55388369255795, + "grad_norm": 0.20955689855159568, + "learning_rate": 1.8002712787712916e-06, + "loss": 0.0022, + "step": 25952 + }, + { + "epoch": 10.554290361935745, + "grad_norm": 0.42133126423701595, + "learning_rate": 1.7996913695181938e-06, + "loss": 0.0034, + "step": 25953 + }, + { + "epoch": 10.554697031313543, + "grad_norm": 0.05529815823198775, + "learning_rate": 1.7991115444453911e-06, + "loss": 0.0005, + "step": 25954 + }, + { + "epoch": 10.555103700691339, + "grad_norm": 0.024305956994222316, + "learning_rate": 1.7985318035588328e-06, + "loss": 0.0001, + "step": 
25955 + }, + { + "epoch": 10.555510370069134, + "grad_norm": 0.004799225710047556, + "learning_rate": 1.7979521468644757e-06, + "loss": 0.0, + "step": 25956 + }, + { + "epoch": 10.55591703944693, + "grad_norm": 0.676466931636305, + "learning_rate": 1.7973725743682679e-06, + "loss": 0.008, + "step": 25957 + }, + { + "epoch": 10.556323708824726, + "grad_norm": 0.03613294137310462, + "learning_rate": 1.7967930860761595e-06, + "loss": 0.0004, + "step": 25958 + }, + { + "epoch": 10.556730378202522, + "grad_norm": 0.004388370825853388, + "learning_rate": 1.7962136819940989e-06, + "loss": 0.0001, + "step": 25959 + }, + { + "epoch": 10.557137047580317, + "grad_norm": 7.26560930434962, + "learning_rate": 1.7956343621280326e-06, + "loss": 0.0656, + "step": 25960 + }, + { + "epoch": 10.557543716958113, + "grad_norm": 0.01703344162711066, + "learning_rate": 1.795055126483909e-06, + "loss": 0.0002, + "step": 25961 + }, + { + "epoch": 10.557950386335909, + "grad_norm": 0.010971379410970485, + "learning_rate": 1.7944759750676721e-06, + "loss": 0.0001, + "step": 25962 + }, + { + "epoch": 10.558357055713705, + "grad_norm": 0.027560055401749377, + "learning_rate": 1.7938969078852708e-06, + "loss": 0.0002, + "step": 25963 + }, + { + "epoch": 10.5587637250915, + "grad_norm": 0.28292946116596024, + "learning_rate": 1.793317924942649e-06, + "loss": 0.0039, + "step": 25964 + }, + { + "epoch": 10.559170394469296, + "grad_norm": 0.37949538882641054, + "learning_rate": 1.7927390262457479e-06, + "loss": 0.0038, + "step": 25965 + }, + { + "epoch": 10.559577063847092, + "grad_norm": 0.2666712777739115, + "learning_rate": 1.7921602118005122e-06, + "loss": 0.0024, + "step": 25966 + }, + { + "epoch": 10.559983733224888, + "grad_norm": 0.09418824526796551, + "learning_rate": 1.7915814816128819e-06, + "loss": 0.0009, + "step": 25967 + }, + { + "epoch": 10.560390402602684, + "grad_norm": 0.0018228175598500249, + "learning_rate": 1.7910028356887999e-06, + "loss": 0.0, + "step": 25968 + }, + { + "epoch": 10.56079707198048, + "grad_norm": 0.011710653074327445, + "learning_rate": 1.7904242740342015e-06, + "loss": 0.0001, + "step": 25969 + }, + { + "epoch": 10.561203741358275, + "grad_norm": 0.09214218415093053, + "learning_rate": 1.7898457966550331e-06, + "loss": 0.0008, + "step": 25970 + }, + { + "epoch": 10.56161041073607, + "grad_norm": 0.017145509669355144, + "learning_rate": 1.78926740355723e-06, + "loss": 0.0002, + "step": 25971 + }, + { + "epoch": 10.562017080113867, + "grad_norm": 0.0012299882550255337, + "learning_rate": 1.7886890947467283e-06, + "loss": 0.0, + "step": 25972 + }, + { + "epoch": 10.562423749491664, + "grad_norm": 0.019974538140333772, + "learning_rate": 1.7881108702294669e-06, + "loss": 0.0001, + "step": 25973 + }, + { + "epoch": 10.56283041886946, + "grad_norm": 0.06469682827444073, + "learning_rate": 1.7875327300113799e-06, + "loss": 0.0005, + "step": 25974 + }, + { + "epoch": 10.563237088247256, + "grad_norm": 0.017418252575732616, + "learning_rate": 1.7869546740984001e-06, + "loss": 0.0001, + "step": 25975 + }, + { + "epoch": 10.563643757625051, + "grad_norm": 1.109299665828515, + "learning_rate": 1.7863767024964663e-06, + "loss": 0.0111, + "step": 25976 + }, + { + "epoch": 10.564050427002847, + "grad_norm": 0.00031590475995125507, + "learning_rate": 1.7857988152115102e-06, + "loss": 0.0, + "step": 25977 + }, + { + "epoch": 10.564457096380643, + "grad_norm": 0.012674254516817227, + "learning_rate": 1.785221012249463e-06, + "loss": 0.0001, + "step": 25978 + }, + { + "epoch": 10.564863765758439, + 
"grad_norm": 0.05475799747504703, + "learning_rate": 1.7846432936162561e-06, + "loss": 0.0004, + "step": 25979 + }, + { + "epoch": 10.565270435136235, + "grad_norm": 0.0010031758653068442, + "learning_rate": 1.7840656593178208e-06, + "loss": 0.0, + "step": 25980 + }, + { + "epoch": 10.56567710451403, + "grad_norm": 0.0025346010330455112, + "learning_rate": 1.7834881093600865e-06, + "loss": 0.0, + "step": 25981 + }, + { + "epoch": 10.566083773891826, + "grad_norm": 0.005470059356654123, + "learning_rate": 1.7829106437489796e-06, + "loss": 0.0001, + "step": 25982 + }, + { + "epoch": 10.566490443269622, + "grad_norm": 0.0021936883340148983, + "learning_rate": 1.7823332624904332e-06, + "loss": 0.0, + "step": 25983 + }, + { + "epoch": 10.566897112647418, + "grad_norm": 0.9464806331665054, + "learning_rate": 1.7817559655903717e-06, + "loss": 0.0079, + "step": 25984 + }, + { + "epoch": 10.567303782025213, + "grad_norm": 0.000394706868578689, + "learning_rate": 1.781178753054721e-06, + "loss": 0.0, + "step": 25985 + }, + { + "epoch": 10.56771045140301, + "grad_norm": 0.007685629671455509, + "learning_rate": 1.7806016248894076e-06, + "loss": 0.0001, + "step": 25986 + }, + { + "epoch": 10.568117120780805, + "grad_norm": 0.002753989316707632, + "learning_rate": 1.7800245811003548e-06, + "loss": 0.0, + "step": 25987 + }, + { + "epoch": 10.5685237901586, + "grad_norm": 0.02584653789283097, + "learning_rate": 1.7794476216934831e-06, + "loss": 0.0003, + "step": 25988 + }, + { + "epoch": 10.568930459536396, + "grad_norm": 0.0015642530868679335, + "learning_rate": 1.7788707466747223e-06, + "loss": 0.0, + "step": 25989 + }, + { + "epoch": 10.569337128914192, + "grad_norm": 0.01972709902377774, + "learning_rate": 1.7782939560499911e-06, + "loss": 0.0002, + "step": 25990 + }, + { + "epoch": 10.569743798291988, + "grad_norm": 0.029429995567737705, + "learning_rate": 1.7777172498252103e-06, + "loss": 0.0003, + "step": 25991 + }, + { + "epoch": 10.570150467669784, + "grad_norm": 0.009579323227233772, + "learning_rate": 1.7771406280063008e-06, + "loss": 0.0001, + "step": 25992 + }, + { + "epoch": 10.57055713704758, + "grad_norm": 0.26336266322817237, + "learning_rate": 1.77656409059918e-06, + "loss": 0.0013, + "step": 25993 + }, + { + "epoch": 10.570963806425375, + "grad_norm": 0.006063060373518517, + "learning_rate": 1.7759876376097684e-06, + "loss": 0.0, + "step": 25994 + }, + { + "epoch": 10.571370475803173, + "grad_norm": 0.02163602860679141, + "learning_rate": 1.7754112690439795e-06, + "loss": 0.0003, + "step": 25995 + }, + { + "epoch": 10.571777145180969, + "grad_norm": 0.1143368572783406, + "learning_rate": 1.7748349849077362e-06, + "loss": 0.0008, + "step": 25996 + }, + { + "epoch": 10.572183814558764, + "grad_norm": 0.06669780710884547, + "learning_rate": 1.7742587852069515e-06, + "loss": 0.0009, + "step": 25997 + }, + { + "epoch": 10.57259048393656, + "grad_norm": 0.0035344397358217285, + "learning_rate": 1.7736826699475406e-06, + "loss": 0.0, + "step": 25998 + }, + { + "epoch": 10.572997153314356, + "grad_norm": 0.0010467394966472206, + "learning_rate": 1.7731066391354167e-06, + "loss": 0.0, + "step": 25999 + }, + { + "epoch": 10.573403822692152, + "grad_norm": 0.0025875984092978074, + "learning_rate": 1.772530692776494e-06, + "loss": 0.0, + "step": 26000 + }, + { + "epoch": 10.573810492069947, + "grad_norm": 0.002010430108150641, + "learning_rate": 1.771954830876682e-06, + "loss": 0.0, + "step": 26001 + }, + { + "epoch": 10.574217161447743, + "grad_norm": 0.08512666100349886, + "learning_rate": 
1.7713790534418973e-06, + "loss": 0.0006, + "step": 26002 + }, + { + "epoch": 10.574623830825539, + "grad_norm": 0.016178668241865927, + "learning_rate": 1.7708033604780471e-06, + "loss": 0.0002, + "step": 26003 + }, + { + "epoch": 10.575030500203335, + "grad_norm": 0.003198668226837863, + "learning_rate": 1.7702277519910438e-06, + "loss": 0.0, + "step": 26004 + }, + { + "epoch": 10.57543716958113, + "grad_norm": 0.05866482580716833, + "learning_rate": 1.7696522279867934e-06, + "loss": 0.0006, + "step": 26005 + }, + { + "epoch": 10.575843838958926, + "grad_norm": 0.21573724775874997, + "learning_rate": 1.7690767884712057e-06, + "loss": 0.0016, + "step": 26006 + }, + { + "epoch": 10.576250508336722, + "grad_norm": 0.09037620577470631, + "learning_rate": 1.768501433450186e-06, + "loss": 0.0009, + "step": 26007 + }, + { + "epoch": 10.576657177714518, + "grad_norm": 0.45743175934879426, + "learning_rate": 1.7679261629296408e-06, + "loss": 0.0056, + "step": 26008 + }, + { + "epoch": 10.577063847092314, + "grad_norm": 0.02990579250957297, + "learning_rate": 1.7673509769154783e-06, + "loss": 0.0004, + "step": 26009 + }, + { + "epoch": 10.57747051647011, + "grad_norm": 0.0013376379139327884, + "learning_rate": 1.7667758754136022e-06, + "loss": 0.0, + "step": 26010 + }, + { + "epoch": 10.577877185847905, + "grad_norm": 0.04967379367231315, + "learning_rate": 1.766200858429915e-06, + "loss": 0.0007, + "step": 26011 + }, + { + "epoch": 10.5782838552257, + "grad_norm": 0.10169589023588947, + "learning_rate": 1.76562592597032e-06, + "loss": 0.0013, + "step": 26012 + }, + { + "epoch": 10.578690524603497, + "grad_norm": 0.006905749736375922, + "learning_rate": 1.7650510780407192e-06, + "loss": 0.0001, + "step": 26013 + }, + { + "epoch": 10.579097193981294, + "grad_norm": 0.18166409871048392, + "learning_rate": 1.7644763146470113e-06, + "loss": 0.0018, + "step": 26014 + }, + { + "epoch": 10.57950386335909, + "grad_norm": 0.0012560704126111095, + "learning_rate": 1.763901635795101e-06, + "loss": 0.0, + "step": 26015 + }, + { + "epoch": 10.579910532736886, + "grad_norm": 0.2470028728579928, + "learning_rate": 1.7633270414908866e-06, + "loss": 0.002, + "step": 26016 + }, + { + "epoch": 10.580317202114681, + "grad_norm": 0.004430966764155668, + "learning_rate": 1.762752531740265e-06, + "loss": 0.0001, + "step": 26017 + }, + { + "epoch": 10.580723871492477, + "grad_norm": 0.0008895111070821127, + "learning_rate": 1.7621781065491338e-06, + "loss": 0.0, + "step": 26018 + }, + { + "epoch": 10.581130540870273, + "grad_norm": 0.03204855615413669, + "learning_rate": 1.7616037659233908e-06, + "loss": 0.0004, + "step": 26019 + }, + { + "epoch": 10.581537210248069, + "grad_norm": 0.2960152262285336, + "learning_rate": 1.7610295098689312e-06, + "loss": 0.0021, + "step": 26020 + }, + { + "epoch": 10.581943879625864, + "grad_norm": 0.03388442095786109, + "learning_rate": 1.7604553383916478e-06, + "loss": 0.0003, + "step": 26021 + }, + { + "epoch": 10.58235054900366, + "grad_norm": 0.00023565539389968986, + "learning_rate": 1.7598812514974395e-06, + "loss": 0.0, + "step": 26022 + }, + { + "epoch": 10.582757218381456, + "grad_norm": 0.00169367384614038, + "learning_rate": 1.759307249192198e-06, + "loss": 0.0, + "step": 26023 + }, + { + "epoch": 10.583163887759252, + "grad_norm": 0.0567258291237801, + "learning_rate": 1.7587333314818145e-06, + "loss": 0.0008, + "step": 26024 + }, + { + "epoch": 10.583570557137048, + "grad_norm": 0.0073669130965760546, + "learning_rate": 1.7581594983721816e-06, + "loss": 0.0001, + "step": 
26025 + }, + { + "epoch": 10.583977226514843, + "grad_norm": 0.09648653897895716, + "learning_rate": 1.7575857498691884e-06, + "loss": 0.001, + "step": 26026 + }, + { + "epoch": 10.584383895892639, + "grad_norm": 2.1926920201231987, + "learning_rate": 1.7570120859787243e-06, + "loss": 0.0099, + "step": 26027 + }, + { + "epoch": 10.584790565270435, + "grad_norm": 0.026405635884701434, + "learning_rate": 1.7564385067066825e-06, + "loss": 0.0001, + "step": 26028 + }, + { + "epoch": 10.58519723464823, + "grad_norm": 0.00613940197076114, + "learning_rate": 1.755865012058947e-06, + "loss": 0.0001, + "step": 26029 + }, + { + "epoch": 10.585603904026026, + "grad_norm": 0.0011091126051546026, + "learning_rate": 1.7552916020414078e-06, + "loss": 0.0, + "step": 26030 + }, + { + "epoch": 10.586010573403822, + "grad_norm": 5.195875997745459, + "learning_rate": 1.7547182766599491e-06, + "loss": 0.0414, + "step": 26031 + }, + { + "epoch": 10.586417242781618, + "grad_norm": 0.04896386539155329, + "learning_rate": 1.754145035920457e-06, + "loss": 0.0003, + "step": 26032 + }, + { + "epoch": 10.586823912159414, + "grad_norm": 8.888826563146582e-05, + "learning_rate": 1.753571879828816e-06, + "loss": 0.0, + "step": 26033 + }, + { + "epoch": 10.58723058153721, + "grad_norm": 0.017203356587424678, + "learning_rate": 1.7529988083909077e-06, + "loss": 0.0002, + "step": 26034 + }, + { + "epoch": 10.587637250915005, + "grad_norm": 0.000500355138493237, + "learning_rate": 1.7524258216126222e-06, + "loss": 0.0, + "step": 26035 + }, + { + "epoch": 10.588043920292803, + "grad_norm": 0.01974952803827645, + "learning_rate": 1.7518529194998347e-06, + "loss": 0.0002, + "step": 26036 + }, + { + "epoch": 10.588450589670598, + "grad_norm": 1.8657733437195267, + "learning_rate": 1.751280102058427e-06, + "loss": 0.0207, + "step": 26037 + }, + { + "epoch": 10.588857259048394, + "grad_norm": 0.10356176057961637, + "learning_rate": 1.750707369294281e-06, + "loss": 0.001, + "step": 26038 + }, + { + "epoch": 10.58926392842619, + "grad_norm": 0.0013669920762112424, + "learning_rate": 1.7501347212132736e-06, + "loss": 0.0, + "step": 26039 + }, + { + "epoch": 10.589670597803986, + "grad_norm": 0.018604497848856694, + "learning_rate": 1.7495621578212874e-06, + "loss": 0.0001, + "step": 26040 + }, + { + "epoch": 10.590077267181782, + "grad_norm": 0.02675411294806839, + "learning_rate": 1.748989679124199e-06, + "loss": 0.0002, + "step": 26041 + }, + { + "epoch": 10.590483936559577, + "grad_norm": 0.004450391461166998, + "learning_rate": 1.7484172851278835e-06, + "loss": 0.0, + "step": 26042 + }, + { + "epoch": 10.590890605937373, + "grad_norm": 0.0008943857895151975, + "learning_rate": 1.7478449758382166e-06, + "loss": 0.0, + "step": 26043 + }, + { + "epoch": 10.591297275315169, + "grad_norm": 0.007614799402380232, + "learning_rate": 1.7472727512610754e-06, + "loss": 0.0001, + "step": 26044 + }, + { + "epoch": 10.591703944692965, + "grad_norm": 0.045761613201679804, + "learning_rate": 1.74670061140233e-06, + "loss": 0.0005, + "step": 26045 + }, + { + "epoch": 10.59211061407076, + "grad_norm": 0.0012174284842035067, + "learning_rate": 1.7461285562678586e-06, + "loss": 0.0, + "step": 26046 + }, + { + "epoch": 10.592517283448556, + "grad_norm": 0.02207037115455857, + "learning_rate": 1.7455565858635325e-06, + "loss": 0.0002, + "step": 26047 + }, + { + "epoch": 10.592923952826352, + "grad_norm": 0.09071097502538317, + "learning_rate": 1.7449847001952225e-06, + "loss": 0.0006, + "step": 26048 + }, + { + "epoch": 10.593330622204148, + 
"grad_norm": 1.1041826853573282, + "learning_rate": 1.7444128992687992e-06, + "loss": 0.0074, + "step": 26049 + }, + { + "epoch": 10.593737291581943, + "grad_norm": 0.029051681422377417, + "learning_rate": 1.7438411830901314e-06, + "loss": 0.0003, + "step": 26050 + }, + { + "epoch": 10.59414396095974, + "grad_norm": 0.022191821055280364, + "learning_rate": 1.74326955166509e-06, + "loss": 0.0003, + "step": 26051 + }, + { + "epoch": 10.594550630337535, + "grad_norm": 0.024215276455606775, + "learning_rate": 1.7426980049995402e-06, + "loss": 0.0003, + "step": 26052 + }, + { + "epoch": 10.59495729971533, + "grad_norm": 0.18442195528728944, + "learning_rate": 1.7421265430993528e-06, + "loss": 0.0015, + "step": 26053 + }, + { + "epoch": 10.595363969093126, + "grad_norm": 0.008804998788062929, + "learning_rate": 1.7415551659703921e-06, + "loss": 0.0, + "step": 26054 + }, + { + "epoch": 10.595770638470924, + "grad_norm": 0.017028197111919247, + "learning_rate": 1.7409838736185246e-06, + "loss": 0.0002, + "step": 26055 + }, + { + "epoch": 10.59617730784872, + "grad_norm": 0.0022431902813991682, + "learning_rate": 1.7404126660496146e-06, + "loss": 0.0, + "step": 26056 + }, + { + "epoch": 10.596583977226516, + "grad_norm": 0.018600529892225416, + "learning_rate": 1.739841543269525e-06, + "loss": 0.0002, + "step": 26057 + }, + { + "epoch": 10.596990646604311, + "grad_norm": 0.013943896922853019, + "learning_rate": 1.7392705052841174e-06, + "loss": 0.0001, + "step": 26058 + }, + { + "epoch": 10.597397315982107, + "grad_norm": 0.007266266348520623, + "learning_rate": 1.7386995520992577e-06, + "loss": 0.0001, + "step": 26059 + }, + { + "epoch": 10.597803985359903, + "grad_norm": 0.04291504107076797, + "learning_rate": 1.7381286837208044e-06, + "loss": 0.0006, + "step": 26060 + }, + { + "epoch": 10.598210654737699, + "grad_norm": 8.186356164987087e-05, + "learning_rate": 1.7375579001546194e-06, + "loss": 0.0, + "step": 26061 + }, + { + "epoch": 10.598617324115494, + "grad_norm": 0.016379156248530195, + "learning_rate": 1.7369872014065603e-06, + "loss": 0.0002, + "step": 26062 + }, + { + "epoch": 10.59902399349329, + "grad_norm": 0.011582234799141222, + "learning_rate": 1.7364165874824856e-06, + "loss": 0.0001, + "step": 26063 + }, + { + "epoch": 10.599430662871086, + "grad_norm": 0.02677570042239006, + "learning_rate": 1.7358460583882553e-06, + "loss": 0.0002, + "step": 26064 + }, + { + "epoch": 10.599837332248882, + "grad_norm": 0.05095821327496811, + "learning_rate": 1.7352756141297201e-06, + "loss": 0.0005, + "step": 26065 + }, + { + "epoch": 10.600244001626677, + "grad_norm": 0.010561167594194806, + "learning_rate": 1.734705254712744e-06, + "loss": 0.0001, + "step": 26066 + }, + { + "epoch": 10.600650671004473, + "grad_norm": 0.016480251369623798, + "learning_rate": 1.7341349801431783e-06, + "loss": 0.0001, + "step": 26067 + }, + { + "epoch": 10.601057340382269, + "grad_norm": 0.06394290115463809, + "learning_rate": 1.733564790426876e-06, + "loss": 0.0005, + "step": 26068 + }, + { + "epoch": 10.601464009760065, + "grad_norm": 0.02066774956001703, + "learning_rate": 1.7329946855696932e-06, + "loss": 0.0002, + "step": 26069 + }, + { + "epoch": 10.60187067913786, + "grad_norm": 0.0020213010635987115, + "learning_rate": 1.732424665577479e-06, + "loss": 0.0, + "step": 26070 + }, + { + "epoch": 10.602277348515656, + "grad_norm": 0.004677133595418748, + "learning_rate": 1.731854730456085e-06, + "loss": 0.0, + "step": 26071 + }, + { + "epoch": 10.602684017893452, + "grad_norm": 0.06415547404369043, + 
"learning_rate": 1.7312848802113657e-06, + "loss": 0.0004, + "step": 26072 + }, + { + "epoch": 10.603090687271248, + "grad_norm": 0.007996995996615646, + "learning_rate": 1.7307151148491696e-06, + "loss": 0.0001, + "step": 26073 + }, + { + "epoch": 10.603497356649044, + "grad_norm": 0.0008081973570285604, + "learning_rate": 1.7301454343753432e-06, + "loss": 0.0, + "step": 26074 + }, + { + "epoch": 10.60390402602684, + "grad_norm": 0.010076783503133603, + "learning_rate": 1.7295758387957372e-06, + "loss": 0.0001, + "step": 26075 + }, + { + "epoch": 10.604310695404635, + "grad_norm": 0.009573930660536474, + "learning_rate": 1.7290063281161972e-06, + "loss": 0.0001, + "step": 26076 + }, + { + "epoch": 10.604717364782433, + "grad_norm": 0.24243748182107316, + "learning_rate": 1.7284369023425696e-06, + "loss": 0.0023, + "step": 26077 + }, + { + "epoch": 10.605124034160228, + "grad_norm": 0.19042943665406933, + "learning_rate": 1.7278675614806984e-06, + "loss": 0.0013, + "step": 26078 + }, + { + "epoch": 10.605530703538024, + "grad_norm": 0.005963282540194068, + "learning_rate": 1.7272983055364324e-06, + "loss": 0.0, + "step": 26079 + }, + { + "epoch": 10.60593737291582, + "grad_norm": 0.04891604906479031, + "learning_rate": 1.7267291345156124e-06, + "loss": 0.0005, + "step": 26080 + }, + { + "epoch": 10.606344042293616, + "grad_norm": 0.2009111715314867, + "learning_rate": 1.7261600484240815e-06, + "loss": 0.0021, + "step": 26081 + }, + { + "epoch": 10.606750711671411, + "grad_norm": 0.028885510270610495, + "learning_rate": 1.7255910472676818e-06, + "loss": 0.0003, + "step": 26082 + }, + { + "epoch": 10.607157381049207, + "grad_norm": 0.042902404592078336, + "learning_rate": 1.7250221310522553e-06, + "loss": 0.0002, + "step": 26083 + }, + { + "epoch": 10.607564050427003, + "grad_norm": 0.2120685784879025, + "learning_rate": 1.7244532997836404e-06, + "loss": 0.0019, + "step": 26084 + }, + { + "epoch": 10.607970719804799, + "grad_norm": 0.6659732998809775, + "learning_rate": 1.7238845534676752e-06, + "loss": 0.0065, + "step": 26085 + }, + { + "epoch": 10.608377389182595, + "grad_norm": 0.000542709911115942, + "learning_rate": 1.7233158921102034e-06, + "loss": 0.0, + "step": 26086 + }, + { + "epoch": 10.60878405856039, + "grad_norm": 0.03791577800560992, + "learning_rate": 1.7227473157170594e-06, + "loss": 0.0004, + "step": 26087 + }, + { + "epoch": 10.609190727938186, + "grad_norm": 0.04913701689082575, + "learning_rate": 1.7221788242940795e-06, + "loss": 0.0006, + "step": 26088 + }, + { + "epoch": 10.609597397315982, + "grad_norm": 3.045830234176631e-05, + "learning_rate": 1.7216104178471004e-06, + "loss": 0.0, + "step": 26089 + }, + { + "epoch": 10.610004066693778, + "grad_norm": 0.00010172634806940786, + "learning_rate": 1.7210420963819563e-06, + "loss": 0.0, + "step": 26090 + }, + { + "epoch": 10.610410736071573, + "grad_norm": 0.4602137521000961, + "learning_rate": 1.7204738599044802e-06, + "loss": 0.004, + "step": 26091 + }, + { + "epoch": 10.61081740544937, + "grad_norm": 0.027745573720251656, + "learning_rate": 1.7199057084205084e-06, + "loss": 0.0002, + "step": 26092 + }, + { + "epoch": 10.611224074827165, + "grad_norm": 0.00012510698908135639, + "learning_rate": 1.7193376419358722e-06, + "loss": 0.0, + "step": 26093 + }, + { + "epoch": 10.61163074420496, + "grad_norm": 0.00608593919647657, + "learning_rate": 1.718769660456403e-06, + "loss": 0.0001, + "step": 26094 + }, + { + "epoch": 10.612037413582756, + "grad_norm": 0.0034751285311532196, + "learning_rate": 1.7182017639879312e-06, + 
"loss": 0.0, + "step": 26095 + }, + { + "epoch": 10.612444082960554, + "grad_norm": 0.02199951666091692, + "learning_rate": 1.7176339525362862e-06, + "loss": 0.0002, + "step": 26096 + }, + { + "epoch": 10.61285075233835, + "grad_norm": 0.10992735305713593, + "learning_rate": 1.7170662261072968e-06, + "loss": 0.0009, + "step": 26097 + }, + { + "epoch": 10.613257421716146, + "grad_norm": 0.0028233366304306583, + "learning_rate": 1.7164985847067884e-06, + "loss": 0.0, + "step": 26098 + }, + { + "epoch": 10.613664091093941, + "grad_norm": 0.0022689588002638896, + "learning_rate": 1.715931028340595e-06, + "loss": 0.0, + "step": 26099 + }, + { + "epoch": 10.614070760471737, + "grad_norm": 0.0006848901767835486, + "learning_rate": 1.7153635570145378e-06, + "loss": 0.0, + "step": 26100 + }, + { + "epoch": 10.614477429849533, + "grad_norm": 0.05294019926974242, + "learning_rate": 1.714796170734443e-06, + "loss": 0.0005, + "step": 26101 + }, + { + "epoch": 10.614884099227329, + "grad_norm": 0.019288587956482858, + "learning_rate": 1.714228869506136e-06, + "loss": 0.0002, + "step": 26102 + }, + { + "epoch": 10.615290768605124, + "grad_norm": 0.008652602636192238, + "learning_rate": 1.71366165333544e-06, + "loss": 0.0001, + "step": 26103 + }, + { + "epoch": 10.61569743798292, + "grad_norm": 0.05251553918534718, + "learning_rate": 1.7130945222281748e-06, + "loss": 0.0004, + "step": 26104 + }, + { + "epoch": 10.616104107360716, + "grad_norm": 0.022459540557161498, + "learning_rate": 1.7125274761901668e-06, + "loss": 0.0003, + "step": 26105 + }, + { + "epoch": 10.616510776738512, + "grad_norm": 0.02281911540338548, + "learning_rate": 1.7119605152272368e-06, + "loss": 0.0003, + "step": 26106 + }, + { + "epoch": 10.616917446116307, + "grad_norm": 0.0026953239947647544, + "learning_rate": 1.7113936393452025e-06, + "loss": 0.0, + "step": 26107 + }, + { + "epoch": 10.617324115494103, + "grad_norm": 0.018464491708169713, + "learning_rate": 1.7108268485498836e-06, + "loss": 0.0001, + "step": 26108 + }, + { + "epoch": 10.617730784871899, + "grad_norm": 0.18958812480489604, + "learning_rate": 1.7102601428470988e-06, + "loss": 0.0014, + "step": 26109 + }, + { + "epoch": 10.618137454249695, + "grad_norm": 0.0010186004683692892, + "learning_rate": 1.709693522242667e-06, + "loss": 0.0, + "step": 26110 + }, + { + "epoch": 10.61854412362749, + "grad_norm": 0.0024616688204861237, + "learning_rate": 1.709126986742401e-06, + "loss": 0.0, + "step": 26111 + }, + { + "epoch": 10.618950793005286, + "grad_norm": 0.01870710590586082, + "learning_rate": 1.7085605363521207e-06, + "loss": 0.0002, + "step": 26112 + }, + { + "epoch": 10.619357462383082, + "grad_norm": 0.019832137442670157, + "learning_rate": 1.7079941710776404e-06, + "loss": 0.0001, + "step": 26113 + }, + { + "epoch": 10.619764131760878, + "grad_norm": 0.000737674349709946, + "learning_rate": 1.707427890924772e-06, + "loss": 0.0, + "step": 26114 + }, + { + "epoch": 10.620170801138674, + "grad_norm": 0.04534944282632765, + "learning_rate": 1.706861695899331e-06, + "loss": 0.0003, + "step": 26115 + }, + { + "epoch": 10.62057747051647, + "grad_norm": 0.008215825946318304, + "learning_rate": 1.7062955860071284e-06, + "loss": 0.0001, + "step": 26116 + }, + { + "epoch": 10.620984139894265, + "grad_norm": 0.28991150712797553, + "learning_rate": 1.7057295612539727e-06, + "loss": 0.0041, + "step": 26117 + }, + { + "epoch": 10.621390809272063, + "grad_norm": 0.0078037378968731745, + "learning_rate": 1.7051636216456801e-06, + "loss": 0.0, + "step": 26118 + }, + { + "epoch": 
10.621797478649858, + "grad_norm": 0.0004665266654128128, + "learning_rate": 1.7045977671880587e-06, + "loss": 0.0, + "step": 26119 + }, + { + "epoch": 10.622204148027654, + "grad_norm": 0.00037240501889673533, + "learning_rate": 1.7040319978869147e-06, + "loss": 0.0, + "step": 26120 + }, + { + "epoch": 10.62261081740545, + "grad_norm": 0.007098456824200323, + "learning_rate": 1.703466313748059e-06, + "loss": 0.0001, + "step": 26121 + }, + { + "epoch": 10.623017486783246, + "grad_norm": 0.12849178504769154, + "learning_rate": 1.7029007147772957e-06, + "loss": 0.0002, + "step": 26122 + }, + { + "epoch": 10.623424156161041, + "grad_norm": 0.0012476274522905503, + "learning_rate": 1.7023352009804328e-06, + "loss": 0.0, + "step": 26123 + }, + { + "epoch": 10.623830825538837, + "grad_norm": 0.2594897152715626, + "learning_rate": 1.7017697723632732e-06, + "loss": 0.0032, + "step": 26124 + }, + { + "epoch": 10.624237494916633, + "grad_norm": 0.10089515413291911, + "learning_rate": 1.7012044289316254e-06, + "loss": 0.0005, + "step": 26125 + }, + { + "epoch": 10.624644164294429, + "grad_norm": 0.0017846676276980639, + "learning_rate": 1.7006391706912906e-06, + "loss": 0.0, + "step": 26126 + }, + { + "epoch": 10.625050833672224, + "grad_norm": 0.04713284676371154, + "learning_rate": 1.700073997648073e-06, + "loss": 0.0006, + "step": 26127 + }, + { + "epoch": 10.62545750305002, + "grad_norm": 0.06250803383434524, + "learning_rate": 1.699508909807771e-06, + "loss": 0.0006, + "step": 26128 + }, + { + "epoch": 10.625864172427816, + "grad_norm": 0.0059810362201180795, + "learning_rate": 1.6989439071761893e-06, + "loss": 0.0, + "step": 26129 + }, + { + "epoch": 10.626270841805612, + "grad_norm": 0.38720864963862234, + "learning_rate": 1.698378989759123e-06, + "loss": 0.0029, + "step": 26130 + }, + { + "epoch": 10.626677511183408, + "grad_norm": 0.0728089024383944, + "learning_rate": 1.6978141575623774e-06, + "loss": 0.0007, + "step": 26131 + }, + { + "epoch": 10.627084180561203, + "grad_norm": 0.6281049602869219, + "learning_rate": 1.697249410591747e-06, + "loss": 0.0076, + "step": 26132 + }, + { + "epoch": 10.627490849938999, + "grad_norm": 1.0705247792730017, + "learning_rate": 1.6966847488530314e-06, + "loss": 0.0033, + "step": 26133 + }, + { + "epoch": 10.627897519316795, + "grad_norm": 0.0750351684009484, + "learning_rate": 1.6961201723520248e-06, + "loss": 0.0006, + "step": 26134 + }, + { + "epoch": 10.62830418869459, + "grad_norm": 0.005514218381253961, + "learning_rate": 1.695555681094524e-06, + "loss": 0.0001, + "step": 26135 + }, + { + "epoch": 10.628710858072386, + "grad_norm": 0.3627067677197222, + "learning_rate": 1.694991275086324e-06, + "loss": 0.0031, + "step": 26136 + }, + { + "epoch": 10.629117527450184, + "grad_norm": 0.5016765063480908, + "learning_rate": 1.694426954333218e-06, + "loss": 0.0041, + "step": 26137 + }, + { + "epoch": 10.62952419682798, + "grad_norm": 1.0224345987107766, + "learning_rate": 1.6938627188410007e-06, + "loss": 0.0063, + "step": 26138 + }, + { + "epoch": 10.629930866205775, + "grad_norm": 0.00016261899076272537, + "learning_rate": 1.6932985686154612e-06, + "loss": 0.0, + "step": 26139 + }, + { + "epoch": 10.630337535583571, + "grad_norm": 0.0016940838124549025, + "learning_rate": 1.6927345036623931e-06, + "loss": 0.0, + "step": 26140 + }, + { + "epoch": 10.630744204961367, + "grad_norm": 0.007231802906120188, + "learning_rate": 1.6921705239875875e-06, + "loss": 0.0001, + "step": 26141 + }, + { + "epoch": 10.631150874339163, + "grad_norm": 0.004922785925462041, 
+ "learning_rate": 1.6916066295968281e-06, + "loss": 0.0, + "step": 26142 + }, + { + "epoch": 10.631557543716958, + "grad_norm": 0.11221667000941667, + "learning_rate": 1.691042820495913e-06, + "loss": 0.0011, + "step": 26143 + }, + { + "epoch": 10.631964213094754, + "grad_norm": 0.006380933862671509, + "learning_rate": 1.6904790966906238e-06, + "loss": 0.0001, + "step": 26144 + }, + { + "epoch": 10.63237088247255, + "grad_norm": 0.0012863631550055874, + "learning_rate": 1.6899154581867484e-06, + "loss": 0.0, + "step": 26145 + }, + { + "epoch": 10.632777551850346, + "grad_norm": 0.008956310263519713, + "learning_rate": 1.6893519049900743e-06, + "loss": 0.0001, + "step": 26146 + }, + { + "epoch": 10.633184221228142, + "grad_norm": 0.0059430624000086614, + "learning_rate": 1.6887884371063845e-06, + "loss": 0.0001, + "step": 26147 + }, + { + "epoch": 10.633590890605937, + "grad_norm": 0.924211044516964, + "learning_rate": 1.6882250545414625e-06, + "loss": 0.0084, + "step": 26148 + }, + { + "epoch": 10.633997559983733, + "grad_norm": 0.025902060084383463, + "learning_rate": 1.6876617573010957e-06, + "loss": 0.0003, + "step": 26149 + }, + { + "epoch": 10.634404229361529, + "grad_norm": 0.011291669315400318, + "learning_rate": 1.687098545391065e-06, + "loss": 0.0001, + "step": 26150 + }, + { + "epoch": 10.634810898739325, + "grad_norm": 0.0003309925585812409, + "learning_rate": 1.6865354188171511e-06, + "loss": 0.0, + "step": 26151 + }, + { + "epoch": 10.63521756811712, + "grad_norm": 0.02673745081562795, + "learning_rate": 1.6859723775851345e-06, + "loss": 0.0004, + "step": 26152 + }, + { + "epoch": 10.635624237494916, + "grad_norm": 0.001850245308057091, + "learning_rate": 1.6854094217007965e-06, + "loss": 0.0, + "step": 26153 + }, + { + "epoch": 10.636030906872712, + "grad_norm": 0.19494138777150297, + "learning_rate": 1.6848465511699164e-06, + "loss": 0.0016, + "step": 26154 + }, + { + "epoch": 10.636437576250508, + "grad_norm": 0.1618505550879261, + "learning_rate": 1.6842837659982681e-06, + "loss": 0.0008, + "step": 26155 + }, + { + "epoch": 10.636844245628303, + "grad_norm": 0.09663474028605229, + "learning_rate": 1.6837210661916336e-06, + "loss": 0.0007, + "step": 26156 + }, + { + "epoch": 10.6372509150061, + "grad_norm": 0.0006354187509873546, + "learning_rate": 1.6831584517557897e-06, + "loss": 0.0, + "step": 26157 + }, + { + "epoch": 10.637657584383895, + "grad_norm": 0.05689094836755478, + "learning_rate": 1.6825959226965082e-06, + "loss": 0.0005, + "step": 26158 + }, + { + "epoch": 10.638064253761693, + "grad_norm": 0.006317194875901926, + "learning_rate": 1.6820334790195668e-06, + "loss": 0.0001, + "step": 26159 + }, + { + "epoch": 10.638470923139488, + "grad_norm": 0.4421506930238414, + "learning_rate": 1.6814711207307376e-06, + "loss": 0.004, + "step": 26160 + }, + { + "epoch": 10.638877592517284, + "grad_norm": 0.0013546755625456245, + "learning_rate": 1.6809088478357927e-06, + "loss": 0.0, + "step": 26161 + }, + { + "epoch": 10.63928426189508, + "grad_norm": 0.10720029729685607, + "learning_rate": 1.680346660340506e-06, + "loss": 0.0009, + "step": 26162 + }, + { + "epoch": 10.639690931272876, + "grad_norm": 0.00023074154907097566, + "learning_rate": 1.67978455825065e-06, + "loss": 0.0, + "step": 26163 + }, + { + "epoch": 10.640097600650671, + "grad_norm": 0.10468592778042043, + "learning_rate": 1.6792225415719908e-06, + "loss": 0.0014, + "step": 26164 + }, + { + "epoch": 10.640504270028467, + "grad_norm": 0.017843362369274293, + "learning_rate": 1.678660610310302e-06, + "loss": 
0.0001, + "step": 26165 + }, + { + "epoch": 10.640910939406263, + "grad_norm": 0.0008403274021112311, + "learning_rate": 1.6780987644713487e-06, + "loss": 0.0, + "step": 26166 + }, + { + "epoch": 10.641317608784059, + "grad_norm": 0.0013012726857288635, + "learning_rate": 1.6775370040609006e-06, + "loss": 0.0, + "step": 26167 + }, + { + "epoch": 10.641724278161854, + "grad_norm": 0.13738713182800916, + "learning_rate": 1.6769753290847213e-06, + "loss": 0.0018, + "step": 26168 + }, + { + "epoch": 10.64213094753965, + "grad_norm": 0.007586104645602609, + "learning_rate": 1.6764137395485812e-06, + "loss": 0.0001, + "step": 26169 + }, + { + "epoch": 10.642537616917446, + "grad_norm": 0.0001921394243841041, + "learning_rate": 1.675852235458244e-06, + "loss": 0.0, + "step": 26170 + }, + { + "epoch": 10.642944286295242, + "grad_norm": 0.846639904830492, + "learning_rate": 1.6752908168194716e-06, + "loss": 0.0037, + "step": 26171 + }, + { + "epoch": 10.643350955673037, + "grad_norm": 0.0534242217104818, + "learning_rate": 1.67472948363803e-06, + "loss": 0.0005, + "step": 26172 + }, + { + "epoch": 10.643757625050833, + "grad_norm": 0.018444740047830227, + "learning_rate": 1.6741682359196799e-06, + "loss": 0.0002, + "step": 26173 + }, + { + "epoch": 10.644164294428629, + "grad_norm": 0.027335817116350542, + "learning_rate": 1.6736070736701804e-06, + "loss": 0.0002, + "step": 26174 + }, + { + "epoch": 10.644570963806425, + "grad_norm": 0.011410280805477329, + "learning_rate": 1.6730459968952983e-06, + "loss": 0.0001, + "step": 26175 + }, + { + "epoch": 10.64497763318422, + "grad_norm": 0.0033537112887478226, + "learning_rate": 1.672485005600789e-06, + "loss": 0.0, + "step": 26176 + }, + { + "epoch": 10.645384302562016, + "grad_norm": 0.016744431880399188, + "learning_rate": 1.6719240997924123e-06, + "loss": 0.0002, + "step": 26177 + }, + { + "epoch": 10.645790971939814, + "grad_norm": 0.010055492178495534, + "learning_rate": 1.671363279475926e-06, + "loss": 0.0001, + "step": 26178 + }, + { + "epoch": 10.64619764131761, + "grad_norm": 0.0025129838408714184, + "learning_rate": 1.6708025446570874e-06, + "loss": 0.0, + "step": 26179 + }, + { + "epoch": 10.646604310695405, + "grad_norm": 0.002662087893685648, + "learning_rate": 1.6702418953416532e-06, + "loss": 0.0, + "step": 26180 + }, + { + "epoch": 10.647010980073201, + "grad_norm": 0.004943938053897129, + "learning_rate": 1.6696813315353754e-06, + "loss": 0.0001, + "step": 26181 + }, + { + "epoch": 10.647417649450997, + "grad_norm": 0.00436195779777115, + "learning_rate": 1.6691208532440139e-06, + "loss": 0.0, + "step": 26182 + }, + { + "epoch": 10.647824318828793, + "grad_norm": 0.0031945721596404143, + "learning_rate": 1.6685604604733196e-06, + "loss": 0.0, + "step": 26183 + }, + { + "epoch": 10.648230988206588, + "grad_norm": 0.00012756630506694744, + "learning_rate": 1.6680001532290457e-06, + "loss": 0.0, + "step": 26184 + }, + { + "epoch": 10.648637657584384, + "grad_norm": 0.73262934567564, + "learning_rate": 1.6674399315169432e-06, + "loss": 0.0038, + "step": 26185 + }, + { + "epoch": 10.64904432696218, + "grad_norm": 0.05254965283531483, + "learning_rate": 1.666879795342764e-06, + "loss": 0.0008, + "step": 26186 + }, + { + "epoch": 10.649450996339976, + "grad_norm": 0.023351750802220635, + "learning_rate": 1.6663197447122548e-06, + "loss": 0.0002, + "step": 26187 + }, + { + "epoch": 10.649857665717771, + "grad_norm": 0.0012545693915583997, + "learning_rate": 1.6657597796311697e-06, + "loss": 0.0, + "step": 26188 + }, + { + "epoch": 
10.650264335095567, + "grad_norm": 0.014021780335726807, + "learning_rate": 1.6651999001052555e-06, + "loss": 0.0001, + "step": 26189 + }, + { + "epoch": 10.650671004473363, + "grad_norm": 0.017098567999058466, + "learning_rate": 1.6646401061402595e-06, + "loss": 0.0002, + "step": 26190 + }, + { + "epoch": 10.651077673851159, + "grad_norm": 0.0006646481127679573, + "learning_rate": 1.6640803977419274e-06, + "loss": 0.0, + "step": 26191 + }, + { + "epoch": 10.651484343228955, + "grad_norm": 0.0006975735258631973, + "learning_rate": 1.6635207749160054e-06, + "loss": 0.0, + "step": 26192 + }, + { + "epoch": 10.65189101260675, + "grad_norm": 1.2779489873837888, + "learning_rate": 1.662961237668238e-06, + "loss": 0.0157, + "step": 26193 + }, + { + "epoch": 10.652297681984546, + "grad_norm": 0.04069640238951271, + "learning_rate": 1.6624017860043674e-06, + "loss": 0.0003, + "step": 26194 + }, + { + "epoch": 10.652704351362342, + "grad_norm": 0.00021532270801778386, + "learning_rate": 1.6618424199301419e-06, + "loss": 0.0, + "step": 26195 + }, + { + "epoch": 10.653111020740138, + "grad_norm": 0.009826853358231137, + "learning_rate": 1.6612831394512996e-06, + "loss": 0.0001, + "step": 26196 + }, + { + "epoch": 10.653517690117933, + "grad_norm": 0.16363047366438788, + "learning_rate": 1.6607239445735824e-06, + "loss": 0.0015, + "step": 26197 + }, + { + "epoch": 10.65392435949573, + "grad_norm": 0.01624080424339499, + "learning_rate": 1.660164835302731e-06, + "loss": 0.0002, + "step": 26198 + }, + { + "epoch": 10.654331028873525, + "grad_norm": 0.03142993562125582, + "learning_rate": 1.6596058116444857e-06, + "loss": 0.0004, + "step": 26199 + }, + { + "epoch": 10.654737698251322, + "grad_norm": 0.011629964871542619, + "learning_rate": 1.659046873604584e-06, + "loss": 0.0001, + "step": 26200 + }, + { + "epoch": 10.655144367629118, + "grad_norm": 0.019830900474993068, + "learning_rate": 1.658488021188761e-06, + "loss": 0.0001, + "step": 26201 + }, + { + "epoch": 10.655551037006914, + "grad_norm": 0.0008379664425923295, + "learning_rate": 1.6579292544027602e-06, + "loss": 0.0, + "step": 26202 + }, + { + "epoch": 10.65595770638471, + "grad_norm": 0.22192657319616726, + "learning_rate": 1.6573705732523138e-06, + "loss": 0.0032, + "step": 26203 + }, + { + "epoch": 10.656364375762506, + "grad_norm": 0.09668680759400546, + "learning_rate": 1.6568119777431567e-06, + "loss": 0.0011, + "step": 26204 + }, + { + "epoch": 10.656771045140301, + "grad_norm": 0.005051965009308354, + "learning_rate": 1.6562534678810237e-06, + "loss": 0.0, + "step": 26205 + }, + { + "epoch": 10.657177714518097, + "grad_norm": 0.006553874835111451, + "learning_rate": 1.655695043671648e-06, + "loss": 0.0001, + "step": 26206 + }, + { + "epoch": 10.657584383895893, + "grad_norm": 0.003277326347009743, + "learning_rate": 1.6551367051207612e-06, + "loss": 0.0, + "step": 26207 + }, + { + "epoch": 10.657991053273689, + "grad_norm": 0.003956766147579472, + "learning_rate": 1.6545784522340968e-06, + "loss": 0.0, + "step": 26208 + }, + { + "epoch": 10.658397722651484, + "grad_norm": 1.6561760019217167, + "learning_rate": 1.654020285017386e-06, + "loss": 0.0193, + "step": 26209 + }, + { + "epoch": 10.65880439202928, + "grad_norm": 0.01835718134724095, + "learning_rate": 1.6534622034763558e-06, + "loss": 0.0002, + "step": 26210 + }, + { + "epoch": 10.659211061407076, + "grad_norm": 0.00514743501913611, + "learning_rate": 1.6529042076167389e-06, + "loss": 0.0, + "step": 26211 + }, + { + "epoch": 10.659617730784872, + "grad_norm": 
0.010190192991720678, + "learning_rate": 1.6523462974442594e-06, + "loss": 0.0001, + "step": 26212 + }, + { + "epoch": 10.660024400162667, + "grad_norm": 0.0036686764106572405, + "learning_rate": 1.6517884729646482e-06, + "loss": 0.0, + "step": 26213 + }, + { + "epoch": 10.660431069540463, + "grad_norm": 1.2042554369789587e-05, + "learning_rate": 1.6512307341836264e-06, + "loss": 0.0, + "step": 26214 + }, + { + "epoch": 10.660837738918259, + "grad_norm": 0.6297873689800535, + "learning_rate": 1.650673081106925e-06, + "loss": 0.0064, + "step": 26215 + }, + { + "epoch": 10.661244408296055, + "grad_norm": 0.05465730473861567, + "learning_rate": 1.650115513740267e-06, + "loss": 0.0004, + "step": 26216 + }, + { + "epoch": 10.66165107767385, + "grad_norm": 0.015975103454640652, + "learning_rate": 1.6495580320893767e-06, + "loss": 0.0001, + "step": 26217 + }, + { + "epoch": 10.662057747051646, + "grad_norm": 4.910949925726386e-05, + "learning_rate": 1.6490006361599742e-06, + "loss": 0.0, + "step": 26218 + }, + { + "epoch": 10.662464416429444, + "grad_norm": 0.00439241714774259, + "learning_rate": 1.6484433259577837e-06, + "loss": 0.0, + "step": 26219 + }, + { + "epoch": 10.66287108580724, + "grad_norm": 0.026681434007805566, + "learning_rate": 1.647886101488524e-06, + "loss": 0.0002, + "step": 26220 + }, + { + "epoch": 10.663277755185035, + "grad_norm": 0.008477053907090387, + "learning_rate": 1.6473289627579193e-06, + "loss": 0.0001, + "step": 26221 + }, + { + "epoch": 10.663684424562831, + "grad_norm": 0.13182870811380962, + "learning_rate": 1.646771909771686e-06, + "loss": 0.001, + "step": 26222 + }, + { + "epoch": 10.664091093940627, + "grad_norm": 0.0003166242002917364, + "learning_rate": 1.6462149425355434e-06, + "loss": 0.0, + "step": 26223 + }, + { + "epoch": 10.664497763318423, + "grad_norm": 2.4117480144402723, + "learning_rate": 1.6456580610552087e-06, + "loss": 0.0215, + "step": 26224 + }, + { + "epoch": 10.664904432696218, + "grad_norm": 0.7361241685616033, + "learning_rate": 1.6451012653363985e-06, + "loss": 0.0089, + "step": 26225 + }, + { + "epoch": 10.665311102074014, + "grad_norm": 0.06350650435794639, + "learning_rate": 1.6445445553848293e-06, + "loss": 0.0005, + "step": 26226 + }, + { + "epoch": 10.66571777145181, + "grad_norm": 0.0031453945144066654, + "learning_rate": 1.6439879312062123e-06, + "loss": 0.0, + "step": 26227 + }, + { + "epoch": 10.666124440829606, + "grad_norm": 0.011838529781520094, + "learning_rate": 1.6434313928062673e-06, + "loss": 0.0001, + "step": 26228 + }, + { + "epoch": 10.666531110207401, + "grad_norm": 0.23020400819049183, + "learning_rate": 1.6428749401907051e-06, + "loss": 0.0021, + "step": 26229 + }, + { + "epoch": 10.666937779585197, + "grad_norm": 0.004847142701572247, + "learning_rate": 1.6423185733652381e-06, + "loss": 0.0, + "step": 26230 + }, + { + "epoch": 10.667344448962993, + "grad_norm": 0.002611932773170461, + "learning_rate": 1.641762292335577e-06, + "loss": 0.0, + "step": 26231 + }, + { + "epoch": 10.667751118340789, + "grad_norm": 0.00973249632053308, + "learning_rate": 1.6412060971074318e-06, + "loss": 0.0001, + "step": 26232 + }, + { + "epoch": 10.668157787718584, + "grad_norm": 0.0511483445260503, + "learning_rate": 1.6406499876865113e-06, + "loss": 0.0005, + "step": 26233 + }, + { + "epoch": 10.66856445709638, + "grad_norm": 0.03158746640382009, + "learning_rate": 1.6400939640785275e-06, + "loss": 0.0002, + "step": 26234 + }, + { + "epoch": 10.668971126474176, + "grad_norm": 0.004660706413765946, + "learning_rate": 
1.6395380262891902e-06, + "loss": 0.0, + "step": 26235 + }, + { + "epoch": 10.669377795851972, + "grad_norm": 0.9925294952343822, + "learning_rate": 1.6389821743241996e-06, + "loss": 0.0096, + "step": 26236 + }, + { + "epoch": 10.669784465229768, + "grad_norm": 0.0032328465401992644, + "learning_rate": 1.6384264081892654e-06, + "loss": 0.0, + "step": 26237 + }, + { + "epoch": 10.670191134607563, + "grad_norm": 0.029442638757887846, + "learning_rate": 1.6378707278900896e-06, + "loss": 0.0003, + "step": 26238 + }, + { + "epoch": 10.670597803985359, + "grad_norm": 3.8202136133955973, + "learning_rate": 1.6373151334323812e-06, + "loss": 0.036, + "step": 26239 + }, + { + "epoch": 10.671004473363155, + "grad_norm": 0.006803453172513671, + "learning_rate": 1.636759624821843e-06, + "loss": 0.0001, + "step": 26240 + }, + { + "epoch": 10.671411142740952, + "grad_norm": 0.005139642525924103, + "learning_rate": 1.6362042020641756e-06, + "loss": 0.0, + "step": 26241 + }, + { + "epoch": 10.671817812118748, + "grad_norm": 0.0014546539617651926, + "learning_rate": 1.6356488651650814e-06, + "loss": 0.0, + "step": 26242 + }, + { + "epoch": 10.672224481496544, + "grad_norm": 0.0022847625258510533, + "learning_rate": 1.635093614130262e-06, + "loss": 0.0, + "step": 26243 + }, + { + "epoch": 10.67263115087434, + "grad_norm": 0.004417315321078912, + "learning_rate": 1.6345384489654159e-06, + "loss": 0.0, + "step": 26244 + }, + { + "epoch": 10.673037820252135, + "grad_norm": 0.03611127518073129, + "learning_rate": 1.6339833696762409e-06, + "loss": 0.0003, + "step": 26245 + }, + { + "epoch": 10.673444489629931, + "grad_norm": 0.008222737525531976, + "learning_rate": 1.63342837626844e-06, + "loss": 0.0001, + "step": 26246 + }, + { + "epoch": 10.673851159007727, + "grad_norm": 0.0006545061135724389, + "learning_rate": 1.6328734687477067e-06, + "loss": 0.0, + "step": 26247 + }, + { + "epoch": 10.674257828385523, + "grad_norm": 0.23106103722485674, + "learning_rate": 1.6323186471197393e-06, + "loss": 0.0024, + "step": 26248 + }, + { + "epoch": 10.674664497763318, + "grad_norm": 0.00014858529290917992, + "learning_rate": 1.6317639113902317e-06, + "loss": 0.0, + "step": 26249 + }, + { + "epoch": 10.675071167141114, + "grad_norm": 0.026619097425222164, + "learning_rate": 1.6312092615648788e-06, + "loss": 0.0002, + "step": 26250 + }, + { + "epoch": 10.67547783651891, + "grad_norm": 0.0027355406692297123, + "learning_rate": 1.6306546976493731e-06, + "loss": 0.0, + "step": 26251 + }, + { + "epoch": 10.675884505896706, + "grad_norm": 0.003642777601336028, + "learning_rate": 1.630100219649411e-06, + "loss": 0.0, + "step": 26252 + }, + { + "epoch": 10.676291175274502, + "grad_norm": 0.00023508364554723752, + "learning_rate": 1.6295458275706833e-06, + "loss": 0.0, + "step": 26253 + }, + { + "epoch": 10.676697844652297, + "grad_norm": 0.03490526086403263, + "learning_rate": 1.62899152141888e-06, + "loss": 0.0003, + "step": 26254 + }, + { + "epoch": 10.677104514030093, + "grad_norm": 0.02038314324426308, + "learning_rate": 1.6284373011996912e-06, + "loss": 0.0001, + "step": 26255 + }, + { + "epoch": 10.677511183407889, + "grad_norm": 0.0008312109314493234, + "learning_rate": 1.6278831669188078e-06, + "loss": 0.0, + "step": 26256 + }, + { + "epoch": 10.677917852785685, + "grad_norm": 0.19662440817378654, + "learning_rate": 1.6273291185819174e-06, + "loss": 0.0018, + "step": 26257 + }, + { + "epoch": 10.67832452216348, + "grad_norm": 0.03987272989895261, + "learning_rate": 1.6267751561947042e-06, + "loss": 0.0003, + "step": 26258 + 
}, + { + "epoch": 10.678731191541276, + "grad_norm": 0.09338422975599796, + "learning_rate": 1.6262212797628607e-06, + "loss": 0.0011, + "step": 26259 + }, + { + "epoch": 10.679137860919074, + "grad_norm": 0.43886873591807707, + "learning_rate": 1.625667489292071e-06, + "loss": 0.0044, + "step": 26260 + }, + { + "epoch": 10.67954453029687, + "grad_norm": 0.002270691395608233, + "learning_rate": 1.6251137847880182e-06, + "loss": 0.0, + "step": 26261 + }, + { + "epoch": 10.679951199674665, + "grad_norm": 0.0005355368266965107, + "learning_rate": 1.6245601662563882e-06, + "loss": 0.0, + "step": 26262 + }, + { + "epoch": 10.680357869052461, + "grad_norm": 0.00016028053011217505, + "learning_rate": 1.6240066337028625e-06, + "loss": 0.0, + "step": 26263 + }, + { + "epoch": 10.680764538430257, + "grad_norm": 0.09436480680813397, + "learning_rate": 1.6234531871331228e-06, + "loss": 0.0009, + "step": 26264 + }, + { + "epoch": 10.681171207808053, + "grad_norm": 0.021077499713821654, + "learning_rate": 1.622899826552854e-06, + "loss": 0.0001, + "step": 26265 + }, + { + "epoch": 10.681577877185848, + "grad_norm": 0.013685715927292574, + "learning_rate": 1.622346551967734e-06, + "loss": 0.0001, + "step": 26266 + }, + { + "epoch": 10.681984546563644, + "grad_norm": 0.0023352627062900215, + "learning_rate": 1.6217933633834438e-06, + "loss": 0.0, + "step": 26267 + }, + { + "epoch": 10.68239121594144, + "grad_norm": 0.006790963628878747, + "learning_rate": 1.621240260805661e-06, + "loss": 0.0001, + "step": 26268 + }, + { + "epoch": 10.682797885319236, + "grad_norm": 0.00014221508988698244, + "learning_rate": 1.6206872442400645e-06, + "loss": 0.0, + "step": 26269 + }, + { + "epoch": 10.683204554697031, + "grad_norm": 0.006002207233833851, + "learning_rate": 1.6201343136923298e-06, + "loss": 0.0, + "step": 26270 + }, + { + "epoch": 10.683611224074827, + "grad_norm": 0.12610150085446878, + "learning_rate": 1.6195814691681322e-06, + "loss": 0.0012, + "step": 26271 + }, + { + "epoch": 10.684017893452623, + "grad_norm": 0.023650994396628912, + "learning_rate": 1.6190287106731507e-06, + "loss": 0.0002, + "step": 26272 + }, + { + "epoch": 10.684424562830419, + "grad_norm": 0.003412558546147727, + "learning_rate": 1.6184760382130571e-06, + "loss": 0.0, + "step": 26273 + }, + { + "epoch": 10.684831232208214, + "grad_norm": 0.009333529500973367, + "learning_rate": 1.6179234517935261e-06, + "loss": 0.0001, + "step": 26274 + }, + { + "epoch": 10.68523790158601, + "grad_norm": 0.004062680095480981, + "learning_rate": 1.6173709514202284e-06, + "loss": 0.0, + "step": 26275 + }, + { + "epoch": 10.685644570963806, + "grad_norm": 0.00207514672170187, + "learning_rate": 1.6168185370988377e-06, + "loss": 0.0, + "step": 26276 + }, + { + "epoch": 10.686051240341602, + "grad_norm": 0.017924483179847732, + "learning_rate": 1.616266208835021e-06, + "loss": 0.0002, + "step": 26277 + }, + { + "epoch": 10.686457909719397, + "grad_norm": 4.592438411131667e-06, + "learning_rate": 1.6157139666344547e-06, + "loss": 0.0, + "step": 26278 + }, + { + "epoch": 10.686864579097193, + "grad_norm": 0.000634701631662367, + "learning_rate": 1.6151618105028034e-06, + "loss": 0.0, + "step": 26279 + }, + { + "epoch": 10.687271248474989, + "grad_norm": 0.0069494623614675495, + "learning_rate": 1.6146097404457351e-06, + "loss": 0.0001, + "step": 26280 + }, + { + "epoch": 10.687677917852785, + "grad_norm": 0.07132903452835664, + "learning_rate": 1.6140577564689198e-06, + "loss": 0.0006, + "step": 26281 + }, + { + "epoch": 10.688084587230582, + 
"grad_norm": 0.03015321864518, + "learning_rate": 1.6135058585780205e-06, + "loss": 0.0003, + "step": 26282 + }, + { + "epoch": 10.688491256608378, + "grad_norm": 0.041047046597799654, + "learning_rate": 1.6129540467787054e-06, + "loss": 0.0005, + "step": 26283 + }, + { + "epoch": 10.688897925986174, + "grad_norm": 0.009669936083688673, + "learning_rate": 1.612402321076636e-06, + "loss": 0.0001, + "step": 26284 + }, + { + "epoch": 10.68930459536397, + "grad_norm": 0.004482621445178983, + "learning_rate": 1.6118506814774804e-06, + "loss": 0.0, + "step": 26285 + }, + { + "epoch": 10.689711264741765, + "grad_norm": 0.0014352748340969504, + "learning_rate": 1.6112991279868985e-06, + "loss": 0.0, + "step": 26286 + }, + { + "epoch": 10.690117934119561, + "grad_norm": 0.43231253978713696, + "learning_rate": 1.6107476606105532e-06, + "loss": 0.0045, + "step": 26287 + }, + { + "epoch": 10.690524603497357, + "grad_norm": 0.1336697942092245, + "learning_rate": 1.610196279354106e-06, + "loss": 0.0017, + "step": 26288 + }, + { + "epoch": 10.690931272875153, + "grad_norm": 0.00032582366537672923, + "learning_rate": 1.6096449842232164e-06, + "loss": 0.0, + "step": 26289 + }, + { + "epoch": 10.691337942252948, + "grad_norm": 0.004676395222740295, + "learning_rate": 1.6090937752235415e-06, + "loss": 0.0, + "step": 26290 + }, + { + "epoch": 10.691744611630744, + "grad_norm": 0.006286853722861669, + "learning_rate": 1.608542652360744e-06, + "loss": 0.0001, + "step": 26291 + }, + { + "epoch": 10.69215128100854, + "grad_norm": 0.019262289319805153, + "learning_rate": 1.6079916156404796e-06, + "loss": 0.0002, + "step": 26292 + }, + { + "epoch": 10.692557950386336, + "grad_norm": 0.0017383630638998309, + "learning_rate": 1.607440665068405e-06, + "loss": 0.0, + "step": 26293 + }, + { + "epoch": 10.692964619764131, + "grad_norm": 0.15795411576011054, + "learning_rate": 1.6068898006501754e-06, + "loss": 0.0014, + "step": 26294 + }, + { + "epoch": 10.693371289141927, + "grad_norm": 0.004086022242537639, + "learning_rate": 1.6063390223914466e-06, + "loss": 0.0, + "step": 26295 + }, + { + "epoch": 10.693777958519723, + "grad_norm": 0.0038044148722301855, + "learning_rate": 1.6057883302978726e-06, + "loss": 0.0, + "step": 26296 + }, + { + "epoch": 10.694184627897519, + "grad_norm": 0.06185062415390529, + "learning_rate": 1.6052377243751028e-06, + "loss": 0.0007, + "step": 26297 + }, + { + "epoch": 10.694591297275315, + "grad_norm": 0.009834554982261264, + "learning_rate": 1.6046872046287954e-06, + "loss": 0.0001, + "step": 26298 + }, + { + "epoch": 10.69499796665311, + "grad_norm": 0.0018492808389987585, + "learning_rate": 1.6041367710645995e-06, + "loss": 0.0, + "step": 26299 + }, + { + "epoch": 10.695404636030906, + "grad_norm": 0.03859442811022401, + "learning_rate": 1.603586423688165e-06, + "loss": 0.0003, + "step": 26300 + }, + { + "epoch": 10.695811305408704, + "grad_norm": 0.0013203137128641425, + "learning_rate": 1.6030361625051417e-06, + "loss": 0.0, + "step": 26301 + }, + { + "epoch": 10.6962179747865, + "grad_norm": 0.0007299781594304643, + "learning_rate": 1.6024859875211785e-06, + "loss": 0.0, + "step": 26302 + }, + { + "epoch": 10.696624644164295, + "grad_norm": 0.29069964663663345, + "learning_rate": 1.6019358987419232e-06, + "loss": 0.0012, + "step": 26303 + }, + { + "epoch": 10.697031313542091, + "grad_norm": 0.0609512663496286, + "learning_rate": 1.6013858961730189e-06, + "loss": 0.0005, + "step": 26304 + }, + { + "epoch": 10.697437982919887, + "grad_norm": 7.243074993450403e-05, + "learning_rate": 
1.600835979820119e-06, + "loss": 0.0, + "step": 26305 + }, + { + "epoch": 10.697844652297682, + "grad_norm": 0.2075046101545712, + "learning_rate": 1.6002861496888644e-06, + "loss": 0.0014, + "step": 26306 + }, + { + "epoch": 10.698251321675478, + "grad_norm": 0.020655721346698203, + "learning_rate": 1.5997364057848996e-06, + "loss": 0.0002, + "step": 26307 + }, + { + "epoch": 10.698657991053274, + "grad_norm": 0.015167129916183535, + "learning_rate": 1.599186748113869e-06, + "loss": 0.0002, + "step": 26308 + }, + { + "epoch": 10.69906466043107, + "grad_norm": 0.00011417480604230839, + "learning_rate": 1.5986371766814145e-06, + "loss": 0.0, + "step": 26309 + }, + { + "epoch": 10.699471329808866, + "grad_norm": 0.013954984891034894, + "learning_rate": 1.5980876914931753e-06, + "loss": 0.0001, + "step": 26310 + }, + { + "epoch": 10.699877999186661, + "grad_norm": 0.0063519803284049145, + "learning_rate": 1.5975382925547966e-06, + "loss": 0.0001, + "step": 26311 + }, + { + "epoch": 10.700284668564457, + "grad_norm": 0.040450828943974816, + "learning_rate": 1.5969889798719163e-06, + "loss": 0.0004, + "step": 26312 + }, + { + "epoch": 10.700691337942253, + "grad_norm": 0.0004979267033085364, + "learning_rate": 1.596439753450174e-06, + "loss": 0.0, + "step": 26313 + }, + { + "epoch": 10.701098007320049, + "grad_norm": 0.14623168788019686, + "learning_rate": 1.5958906132952056e-06, + "loss": 0.0011, + "step": 26314 + }, + { + "epoch": 10.701504676697844, + "grad_norm": 0.4942986661795429, + "learning_rate": 1.5953415594126508e-06, + "loss": 0.0037, + "step": 26315 + }, + { + "epoch": 10.70191134607564, + "grad_norm": 0.0023209083688096894, + "learning_rate": 1.5947925918081453e-06, + "loss": 0.0, + "step": 26316 + }, + { + "epoch": 10.702318015453436, + "grad_norm": 0.02009621493624112, + "learning_rate": 1.594243710487321e-06, + "loss": 0.0002, + "step": 26317 + }, + { + "epoch": 10.702724684831232, + "grad_norm": 0.024825660010788383, + "learning_rate": 1.5936949154558178e-06, + "loss": 0.0002, + "step": 26318 + }, + { + "epoch": 10.703131354209027, + "grad_norm": 0.010871077304145313, + "learning_rate": 1.593146206719267e-06, + "loss": 0.0001, + "step": 26319 + }, + { + "epoch": 10.703538023586823, + "grad_norm": 0.007310490517799151, + "learning_rate": 1.592597584283302e-06, + "loss": 0.0001, + "step": 26320 + }, + { + "epoch": 10.703944692964619, + "grad_norm": 0.04036046336665444, + "learning_rate": 1.5920490481535533e-06, + "loss": 0.0005, + "step": 26321 + }, + { + "epoch": 10.704351362342415, + "grad_norm": 0.02762905145056767, + "learning_rate": 1.5915005983356535e-06, + "loss": 0.0003, + "step": 26322 + }, + { + "epoch": 10.704758031720212, + "grad_norm": 0.0014337763977915151, + "learning_rate": 1.5909522348352292e-06, + "loss": 0.0, + "step": 26323 + }, + { + "epoch": 10.705164701098008, + "grad_norm": 0.00012454679539031325, + "learning_rate": 1.5904039576579156e-06, + "loss": 0.0, + "step": 26324 + }, + { + "epoch": 10.705571370475804, + "grad_norm": 0.028378051709305012, + "learning_rate": 1.5898557668093373e-06, + "loss": 0.0003, + "step": 26325 + }, + { + "epoch": 10.7059780398536, + "grad_norm": 0.00016118552678698176, + "learning_rate": 1.589307662295122e-06, + "loss": 0.0, + "step": 26326 + }, + { + "epoch": 10.706384709231395, + "grad_norm": 0.08776354171288737, + "learning_rate": 1.5887596441208962e-06, + "loss": 0.0008, + "step": 26327 + }, + { + "epoch": 10.706791378609191, + "grad_norm": 0.004173324811253948, + "learning_rate": 1.588211712292287e-06, + "loss": 0.0, + 
"step": 26328 + }, + { + "epoch": 10.707198047986987, + "grad_norm": 0.018320407703530144, + "learning_rate": 1.587663866814918e-06, + "loss": 0.0002, + "step": 26329 + }, + { + "epoch": 10.707604717364783, + "grad_norm": 0.011447761622754456, + "learning_rate": 1.5871161076944098e-06, + "loss": 0.0001, + "step": 26330 + }, + { + "epoch": 10.708011386742578, + "grad_norm": 0.1499623955840716, + "learning_rate": 1.586568434936392e-06, + "loss": 0.0014, + "step": 26331 + }, + { + "epoch": 10.708418056120374, + "grad_norm": 0.031724016910610206, + "learning_rate": 1.5860208485464834e-06, + "loss": 0.0002, + "step": 26332 + }, + { + "epoch": 10.70882472549817, + "grad_norm": 0.007199174815302858, + "learning_rate": 1.5854733485303064e-06, + "loss": 0.0001, + "step": 26333 + }, + { + "epoch": 10.709231394875966, + "grad_norm": 0.0034243401735304774, + "learning_rate": 1.584925934893482e-06, + "loss": 0.0, + "step": 26334 + }, + { + "epoch": 10.709638064253761, + "grad_norm": 0.06620840459178745, + "learning_rate": 1.5843786076416235e-06, + "loss": 0.0004, + "step": 26335 + }, + { + "epoch": 10.710044733631557, + "grad_norm": 0.018387395263447855, + "learning_rate": 1.5838313667803563e-06, + "loss": 0.0002, + "step": 26336 + }, + { + "epoch": 10.710451403009353, + "grad_norm": 0.3611672204659246, + "learning_rate": 1.5832842123152958e-06, + "loss": 0.0035, + "step": 26337 + }, + { + "epoch": 10.710858072387149, + "grad_norm": 0.02557886262364184, + "learning_rate": 1.582737144252059e-06, + "loss": 0.0002, + "step": 26338 + }, + { + "epoch": 10.711264741764944, + "grad_norm": 0.0940604759742233, + "learning_rate": 1.582190162596262e-06, + "loss": 0.0011, + "step": 26339 + }, + { + "epoch": 10.71167141114274, + "grad_norm": 0.0791222013104907, + "learning_rate": 1.5816432673535187e-06, + "loss": 0.0008, + "step": 26340 + }, + { + "epoch": 10.712078080520536, + "grad_norm": 0.12097383659255857, + "learning_rate": 1.581096458529443e-06, + "loss": 0.0012, + "step": 26341 + }, + { + "epoch": 10.712484749898334, + "grad_norm": 0.00028648000574006303, + "learning_rate": 1.5805497361296506e-06, + "loss": 0.0, + "step": 26342 + }, + { + "epoch": 10.71289141927613, + "grad_norm": 0.00029559454833651377, + "learning_rate": 1.5800031001597537e-06, + "loss": 0.0, + "step": 26343 + }, + { + "epoch": 10.713298088653925, + "grad_norm": 0.00797613117293837, + "learning_rate": 1.5794565506253613e-06, + "loss": 0.0001, + "step": 26344 + }, + { + "epoch": 10.71370475803172, + "grad_norm": 0.012275600608710503, + "learning_rate": 1.5789100875320863e-06, + "loss": 0.0001, + "step": 26345 + }, + { + "epoch": 10.714111427409517, + "grad_norm": 0.38645313559328254, + "learning_rate": 1.5783637108855377e-06, + "loss": 0.0048, + "step": 26346 + }, + { + "epoch": 10.714518096787312, + "grad_norm": 0.08322416513036945, + "learning_rate": 1.5778174206913233e-06, + "loss": 0.0008, + "step": 26347 + }, + { + "epoch": 10.714924766165108, + "grad_norm": 0.0026155028725789457, + "learning_rate": 1.5772712169550497e-06, + "loss": 0.0, + "step": 26348 + }, + { + "epoch": 10.715331435542904, + "grad_norm": 0.0007406117472461318, + "learning_rate": 1.576725099682329e-06, + "loss": 0.0, + "step": 26349 + }, + { + "epoch": 10.7157381049207, + "grad_norm": 0.00469702663715735, + "learning_rate": 1.5761790688787648e-06, + "loss": 0.0, + "step": 26350 + }, + { + "epoch": 10.716144774298495, + "grad_norm": 0.525941364694347, + "learning_rate": 1.5756331245499612e-06, + "loss": 0.0022, + "step": 26351 + }, + { + "epoch": 10.716551443676291, 
+ "grad_norm": 0.09038181398218888, + "learning_rate": 1.5750872667015238e-06, + "loss": 0.0008, + "step": 26352 + }, + { + "epoch": 10.716958113054087, + "grad_norm": 0.014058313358680949, + "learning_rate": 1.5745414953390558e-06, + "loss": 0.0001, + "step": 26353 + }, + { + "epoch": 10.717364782431883, + "grad_norm": 0.005052909558394565, + "learning_rate": 1.5739958104681574e-06, + "loss": 0.0, + "step": 26354 + }, + { + "epoch": 10.717771451809678, + "grad_norm": 0.008464207083827405, + "learning_rate": 1.5734502120944351e-06, + "loss": 0.0001, + "step": 26355 + }, + { + "epoch": 10.718178121187474, + "grad_norm": 0.09336753909621663, + "learning_rate": 1.5729047002234876e-06, + "loss": 0.0004, + "step": 26356 + }, + { + "epoch": 10.71858479056527, + "grad_norm": 0.0014759884956227268, + "learning_rate": 1.572359274860914e-06, + "loss": 0.0, + "step": 26357 + }, + { + "epoch": 10.718991459943066, + "grad_norm": 0.011898882975587975, + "learning_rate": 1.571813936012314e-06, + "loss": 0.0001, + "step": 26358 + }, + { + "epoch": 10.719398129320862, + "grad_norm": 0.00737357614369939, + "learning_rate": 1.571268683683287e-06, + "loss": 0.0001, + "step": 26359 + }, + { + "epoch": 10.719804798698657, + "grad_norm": 0.005159122774228463, + "learning_rate": 1.5707235178794279e-06, + "loss": 0.0, + "step": 26360 + }, + { + "epoch": 10.720211468076453, + "grad_norm": 0.20847670037393254, + "learning_rate": 1.5701784386063324e-06, + "loss": 0.002, + "step": 26361 + }, + { + "epoch": 10.720618137454249, + "grad_norm": 0.03182263527505551, + "learning_rate": 1.5696334458695995e-06, + "loss": 0.0003, + "step": 26362 + }, + { + "epoch": 10.721024806832045, + "grad_norm": 5.053559067243401e-05, + "learning_rate": 1.5690885396748222e-06, + "loss": 0.0, + "step": 26363 + }, + { + "epoch": 10.721431476209842, + "grad_norm": 15.804409230894349, + "learning_rate": 1.5685437200275956e-06, + "loss": 0.3896, + "step": 26364 + }, + { + "epoch": 10.721838145587638, + "grad_norm": 0.12261538947856397, + "learning_rate": 1.56799898693351e-06, + "loss": 0.0009, + "step": 26365 + }, + { + "epoch": 10.722244814965434, + "grad_norm": 0.0007452871820813744, + "learning_rate": 1.567454340398159e-06, + "loss": 0.0, + "step": 26366 + }, + { + "epoch": 10.72265148434323, + "grad_norm": 0.011546232174951694, + "learning_rate": 1.5669097804271304e-06, + "loss": 0.0001, + "step": 26367 + }, + { + "epoch": 10.723058153721025, + "grad_norm": 0.0708353200975422, + "learning_rate": 1.5663653070260198e-06, + "loss": 0.0009, + "step": 26368 + }, + { + "epoch": 10.723464823098821, + "grad_norm": 0.011792184291968514, + "learning_rate": 1.5658209202004137e-06, + "loss": 0.0001, + "step": 26369 + }, + { + "epoch": 10.723871492476617, + "grad_norm": 0.9593570330770013, + "learning_rate": 1.565276619955901e-06, + "loss": 0.01, + "step": 26370 + }, + { + "epoch": 10.724278161854413, + "grad_norm": 0.0015200751372281404, + "learning_rate": 1.5647324062980684e-06, + "loss": 0.0, + "step": 26371 + }, + { + "epoch": 10.724684831232208, + "grad_norm": 0.08980573987018164, + "learning_rate": 1.5641882792325037e-06, + "loss": 0.0012, + "step": 26372 + }, + { + "epoch": 10.725091500610004, + "grad_norm": 0.06712112308662335, + "learning_rate": 1.563644238764792e-06, + "loss": 0.0008, + "step": 26373 + }, + { + "epoch": 10.7254981699878, + "grad_norm": 0.037519533647072, + "learning_rate": 1.5631002849005151e-06, + "loss": 0.0004, + "step": 26374 + }, + { + "epoch": 10.725904839365596, + "grad_norm": 0.07324882370468462, + "learning_rate": 
1.5625564176452634e-06, + "loss": 0.0004, + "step": 26375 + }, + { + "epoch": 10.726311508743391, + "grad_norm": 0.0018768555156469023, + "learning_rate": 1.562012637004615e-06, + "loss": 0.0, + "step": 26376 + }, + { + "epoch": 10.726718178121187, + "grad_norm": 0.01780115707714564, + "learning_rate": 1.5614689429841544e-06, + "loss": 0.0002, + "step": 26377 + }, + { + "epoch": 10.727124847498983, + "grad_norm": 0.02649234599846412, + "learning_rate": 1.5609253355894628e-06, + "loss": 0.0003, + "step": 26378 + }, + { + "epoch": 10.727531516876779, + "grad_norm": 0.4008828383222697, + "learning_rate": 1.5603818148261195e-06, + "loss": 0.0008, + "step": 26379 + }, + { + "epoch": 10.727938186254574, + "grad_norm": 0.03434923619468647, + "learning_rate": 1.559838380699702e-06, + "loss": 0.0003, + "step": 26380 + }, + { + "epoch": 10.72834485563237, + "grad_norm": 0.007542611001454316, + "learning_rate": 1.559295033215793e-06, + "loss": 0.0001, + "step": 26381 + }, + { + "epoch": 10.728751525010166, + "grad_norm": 0.009046490482728372, + "learning_rate": 1.5587517723799694e-06, + "loss": 0.0001, + "step": 26382 + }, + { + "epoch": 10.729158194387963, + "grad_norm": 0.06952804923950716, + "learning_rate": 1.5582085981978078e-06, + "loss": 0.0003, + "step": 26383 + }, + { + "epoch": 10.72956486376576, + "grad_norm": 0.04644692421158321, + "learning_rate": 1.5576655106748828e-06, + "loss": 0.0005, + "step": 26384 + }, + { + "epoch": 10.729971533143555, + "grad_norm": 0.023162596117523707, + "learning_rate": 1.5571225098167696e-06, + "loss": 0.0001, + "step": 26385 + }, + { + "epoch": 10.73037820252135, + "grad_norm": 0.009287292500825777, + "learning_rate": 1.5565795956290442e-06, + "loss": 0.0001, + "step": 26386 + }, + { + "epoch": 10.730784871899147, + "grad_norm": 0.03984804810344894, + "learning_rate": 1.5560367681172761e-06, + "loss": 0.0004, + "step": 26387 + }, + { + "epoch": 10.731191541276942, + "grad_norm": 0.00935442779874943, + "learning_rate": 1.5554940272870423e-06, + "loss": 0.0001, + "step": 26388 + }, + { + "epoch": 10.731598210654738, + "grad_norm": 0.047104461343093144, + "learning_rate": 1.5549513731439126e-06, + "loss": 0.0003, + "step": 26389 + }, + { + "epoch": 10.732004880032534, + "grad_norm": 12.344076422709108, + "learning_rate": 1.554408805693458e-06, + "loss": 0.1359, + "step": 26390 + }, + { + "epoch": 10.73241154941033, + "grad_norm": 0.0411669551841364, + "learning_rate": 1.5538663249412466e-06, + "loss": 0.0004, + "step": 26391 + }, + { + "epoch": 10.732818218788125, + "grad_norm": 0.0038584911164393576, + "learning_rate": 1.5533239308928483e-06, + "loss": 0.0, + "step": 26392 + }, + { + "epoch": 10.733224888165921, + "grad_norm": 0.028443310264757576, + "learning_rate": 1.5527816235538297e-06, + "loss": 0.0002, + "step": 26393 + }, + { + "epoch": 10.733631557543717, + "grad_norm": 0.005389131833880727, + "learning_rate": 1.5522394029297605e-06, + "loss": 0.0001, + "step": 26394 + }, + { + "epoch": 10.734038226921513, + "grad_norm": 0.032621772051514786, + "learning_rate": 1.5516972690262056e-06, + "loss": 0.0002, + "step": 26395 + }, + { + "epoch": 10.734444896299308, + "grad_norm": 0.13135247557241514, + "learning_rate": 1.5511552218487301e-06, + "loss": 0.0004, + "step": 26396 + }, + { + "epoch": 10.734851565677104, + "grad_norm": 0.04454887350178779, + "learning_rate": 1.550613261402899e-06, + "loss": 0.0004, + "step": 26397 + }, + { + "epoch": 10.7352582350549, + "grad_norm": 0.012620444481015368, + "learning_rate": 1.5500713876942742e-06, + "loss": 0.0001, 
+ "step": 26398 + }, + { + "epoch": 10.735664904432696, + "grad_norm": 0.006219826311865326, + "learning_rate": 1.54952960072842e-06, + "loss": 0.0001, + "step": 26399 + }, + { + "epoch": 10.736071573810491, + "grad_norm": 0.0445686267535528, + "learning_rate": 1.5489879005108954e-06, + "loss": 0.0004, + "step": 26400 + }, + { + "epoch": 10.736478243188287, + "grad_norm": 0.06380162213969798, + "learning_rate": 1.548446287047266e-06, + "loss": 0.0009, + "step": 26401 + }, + { + "epoch": 10.736884912566083, + "grad_norm": 0.03674521672984678, + "learning_rate": 1.5479047603430885e-06, + "loss": 0.0004, + "step": 26402 + }, + { + "epoch": 10.737291581943879, + "grad_norm": 0.0012908925859217615, + "learning_rate": 1.5473633204039218e-06, + "loss": 0.0, + "step": 26403 + }, + { + "epoch": 10.737698251321675, + "grad_norm": 0.27602264681123295, + "learning_rate": 1.5468219672353258e-06, + "loss": 0.0017, + "step": 26404 + }, + { + "epoch": 10.738104920699472, + "grad_norm": 0.005647630341868414, + "learning_rate": 1.546280700842856e-06, + "loss": 0.0001, + "step": 26405 + }, + { + "epoch": 10.738511590077268, + "grad_norm": 0.04671376637912969, + "learning_rate": 1.545739521232068e-06, + "loss": 0.0005, + "step": 26406 + }, + { + "epoch": 10.738918259455064, + "grad_norm": 2.391972658951022, + "learning_rate": 1.5451984284085209e-06, + "loss": 0.0224, + "step": 26407 + }, + { + "epoch": 10.73932492883286, + "grad_norm": 0.04065790371945286, + "learning_rate": 1.5446574223777667e-06, + "loss": 0.0003, + "step": 26408 + }, + { + "epoch": 10.739731598210655, + "grad_norm": 0.0020227119144029023, + "learning_rate": 1.5441165031453587e-06, + "loss": 0.0, + "step": 26409 + }, + { + "epoch": 10.740138267588451, + "grad_norm": 0.4421260061508905, + "learning_rate": 1.5435756707168514e-06, + "loss": 0.0036, + "step": 26410 + }, + { + "epoch": 10.740544936966247, + "grad_norm": 0.004419964417179671, + "learning_rate": 1.5430349250977961e-06, + "loss": 0.0, + "step": 26411 + }, + { + "epoch": 10.740951606344042, + "grad_norm": 0.0005079961183528428, + "learning_rate": 1.5424942662937436e-06, + "loss": 0.0, + "step": 26412 + }, + { + "epoch": 10.741358275721838, + "grad_norm": 0.07295764816704332, + "learning_rate": 1.541953694310242e-06, + "loss": 0.0008, + "step": 26413 + }, + { + "epoch": 10.741764945099634, + "grad_norm": 0.009780099851739162, + "learning_rate": 1.5414132091528445e-06, + "loss": 0.0001, + "step": 26414 + }, + { + "epoch": 10.74217161447743, + "grad_norm": 0.009811018090045344, + "learning_rate": 1.5408728108270976e-06, + "loss": 0.0001, + "step": 26415 + }, + { + "epoch": 10.742578283855226, + "grad_norm": 0.02342292472802783, + "learning_rate": 1.5403324993385482e-06, + "loss": 0.0002, + "step": 26416 + }, + { + "epoch": 10.742984953233021, + "grad_norm": 0.01683109678947976, + "learning_rate": 1.539792274692744e-06, + "loss": 0.0001, + "step": 26417 + }, + { + "epoch": 10.743391622610817, + "grad_norm": 0.05698084679155478, + "learning_rate": 1.5392521368952296e-06, + "loss": 0.0004, + "step": 26418 + }, + { + "epoch": 10.743798291988613, + "grad_norm": 0.060456136806332536, + "learning_rate": 1.5387120859515503e-06, + "loss": 0.0007, + "step": 26419 + }, + { + "epoch": 10.744204961366409, + "grad_norm": 0.0001558298086578143, + "learning_rate": 1.5381721218672473e-06, + "loss": 0.0, + "step": 26420 + }, + { + "epoch": 10.744611630744204, + "grad_norm": 0.006029496992039616, + "learning_rate": 1.5376322446478699e-06, + "loss": 0.0001, + "step": 26421 + }, + { + "epoch": 
10.745018300122, + "grad_norm": 0.27984798252984355, + "learning_rate": 1.5370924542989552e-06, + "loss": 0.0021, + "step": 26422 + }, + { + "epoch": 10.745424969499796, + "grad_norm": 0.005988835051879287, + "learning_rate": 1.5365527508260457e-06, + "loss": 0.0001, + "step": 26423 + }, + { + "epoch": 10.745831638877593, + "grad_norm": 0.022959413610716913, + "learning_rate": 1.5360131342346817e-06, + "loss": 0.0002, + "step": 26424 + }, + { + "epoch": 10.74623830825539, + "grad_norm": 0.005084265270427893, + "learning_rate": 1.5354736045304041e-06, + "loss": 0.0, + "step": 26425 + }, + { + "epoch": 10.746644977633185, + "grad_norm": 0.015514263812312891, + "learning_rate": 1.5349341617187463e-06, + "loss": 0.0001, + "step": 26426 + }, + { + "epoch": 10.74705164701098, + "grad_norm": 0.8619484847497608, + "learning_rate": 1.5343948058052526e-06, + "loss": 0.0071, + "step": 26427 + }, + { + "epoch": 10.747458316388776, + "grad_norm": 0.012002626052941138, + "learning_rate": 1.5338555367954566e-06, + "loss": 0.0001, + "step": 26428 + }, + { + "epoch": 10.747864985766572, + "grad_norm": 0.0004290566376865986, + "learning_rate": 1.5333163546948948e-06, + "loss": 0.0, + "step": 26429 + }, + { + "epoch": 10.748271655144368, + "grad_norm": 0.5287591433034855, + "learning_rate": 1.5327772595091017e-06, + "loss": 0.0037, + "step": 26430 + }, + { + "epoch": 10.748678324522164, + "grad_norm": 0.00016106473684261913, + "learning_rate": 1.5322382512436118e-06, + "loss": 0.0, + "step": 26431 + }, + { + "epoch": 10.74908499389996, + "grad_norm": 0.06196244334386665, + "learning_rate": 1.5316993299039573e-06, + "loss": 0.0007, + "step": 26432 + }, + { + "epoch": 10.749491663277755, + "grad_norm": 0.01895049401609203, + "learning_rate": 1.5311604954956693e-06, + "loss": 0.0002, + "step": 26433 + }, + { + "epoch": 10.749898332655551, + "grad_norm": 0.006001522560715606, + "learning_rate": 1.5306217480242858e-06, + "loss": 0.0001, + "step": 26434 + }, + { + "epoch": 10.750305002033347, + "grad_norm": 0.027539256810894044, + "learning_rate": 1.5300830874953298e-06, + "loss": 0.0002, + "step": 26435 + }, + { + "epoch": 10.750711671411143, + "grad_norm": 0.040931051784527536, + "learning_rate": 1.5295445139143339e-06, + "loss": 0.0006, + "step": 26436 + }, + { + "epoch": 10.751118340788938, + "grad_norm": 0.003160599747220913, + "learning_rate": 1.5290060272868278e-06, + "loss": 0.0, + "step": 26437 + }, + { + "epoch": 10.751525010166734, + "grad_norm": 0.0007509419430798103, + "learning_rate": 1.528467627618334e-06, + "loss": 0.0, + "step": 26438 + }, + { + "epoch": 10.75193167954453, + "grad_norm": 0.004301020872388075, + "learning_rate": 1.527929314914387e-06, + "loss": 0.0, + "step": 26439 + }, + { + "epoch": 10.752338348922326, + "grad_norm": 0.062160685523727534, + "learning_rate": 1.5273910891805099e-06, + "loss": 0.0004, + "step": 26440 + }, + { + "epoch": 10.752745018300121, + "grad_norm": 0.0017627583781813583, + "learning_rate": 1.5268529504222262e-06, + "loss": 0.0, + "step": 26441 + }, + { + "epoch": 10.753151687677917, + "grad_norm": 0.04106235206798551, + "learning_rate": 1.5263148986450628e-06, + "loss": 0.0004, + "step": 26442 + }, + { + "epoch": 10.753558357055713, + "grad_norm": 0.04435015278712586, + "learning_rate": 1.5257769338545403e-06, + "loss": 0.0003, + "step": 26443 + }, + { + "epoch": 10.753965026433509, + "grad_norm": 0.007041945781685827, + "learning_rate": 1.5252390560561814e-06, + "loss": 0.0001, + "step": 26444 + }, + { + "epoch": 10.754371695811304, + "grad_norm": 
0.0001956638282511206, + "learning_rate": 1.5247012652555105e-06, + "loss": 0.0, + "step": 26445 + }, + { + "epoch": 10.754778365189102, + "grad_norm": 0.14235724335975916, + "learning_rate": 1.5241635614580464e-06, + "loss": 0.0018, + "step": 26446 + }, + { + "epoch": 10.755185034566898, + "grad_norm": 0.22609908156339611, + "learning_rate": 1.5236259446693102e-06, + "loss": 0.0015, + "step": 26447 + }, + { + "epoch": 10.755591703944694, + "grad_norm": 0.1252777542522728, + "learning_rate": 1.523088414894819e-06, + "loss": 0.0017, + "step": 26448 + }, + { + "epoch": 10.75599837332249, + "grad_norm": 0.00837391285921828, + "learning_rate": 1.522550972140091e-06, + "loss": 0.0001, + "step": 26449 + }, + { + "epoch": 10.756405042700285, + "grad_norm": 0.006747925048036968, + "learning_rate": 1.5220136164106448e-06, + "loss": 0.0001, + "step": 26450 + }, + { + "epoch": 10.75681171207808, + "grad_norm": 0.0015261819739567158, + "learning_rate": 1.521476347711993e-06, + "loss": 0.0, + "step": 26451 + }, + { + "epoch": 10.757218381455877, + "grad_norm": 0.021926454863122967, + "learning_rate": 1.5209391660496564e-06, + "loss": 0.0002, + "step": 26452 + }, + { + "epoch": 10.757625050833672, + "grad_norm": 0.04557789745439981, + "learning_rate": 1.5204020714291468e-06, + "loss": 0.0004, + "step": 26453 + }, + { + "epoch": 10.758031720211468, + "grad_norm": 0.2563571527331403, + "learning_rate": 1.5198650638559764e-06, + "loss": 0.0019, + "step": 26454 + }, + { + "epoch": 10.758438389589264, + "grad_norm": 0.03248939509597202, + "learning_rate": 1.51932814333566e-06, + "loss": 0.0003, + "step": 26455 + }, + { + "epoch": 10.75884505896706, + "grad_norm": 0.006278747614829143, + "learning_rate": 1.5187913098737084e-06, + "loss": 0.0001, + "step": 26456 + }, + { + "epoch": 10.759251728344855, + "grad_norm": 8.689972071243352, + "learning_rate": 1.5182545634756308e-06, + "loss": 0.1778, + "step": 26457 + }, + { + "epoch": 10.759658397722651, + "grad_norm": 0.046639700268416066, + "learning_rate": 1.5177179041469403e-06, + "loss": 0.0003, + "step": 26458 + }, + { + "epoch": 10.760065067100447, + "grad_norm": 1.1170719487530794, + "learning_rate": 1.517181331893145e-06, + "loss": 0.0127, + "step": 26459 + }, + { + "epoch": 10.760471736478243, + "grad_norm": 0.09792730939976985, + "learning_rate": 1.5166448467197525e-06, + "loss": 0.001, + "step": 26460 + }, + { + "epoch": 10.760878405856038, + "grad_norm": 0.018831347104066337, + "learning_rate": 1.5161084486322698e-06, + "loss": 0.0001, + "step": 26461 + }, + { + "epoch": 10.761285075233834, + "grad_norm": 0.0012281268945025425, + "learning_rate": 1.5155721376362043e-06, + "loss": 0.0, + "step": 26462 + }, + { + "epoch": 10.76169174461163, + "grad_norm": 0.005507251796380864, + "learning_rate": 1.5150359137370608e-06, + "loss": 0.0001, + "step": 26463 + }, + { + "epoch": 10.762098413989426, + "grad_norm": 0.0352778620984147, + "learning_rate": 1.5144997769403414e-06, + "loss": 0.0003, + "step": 26464 + }, + { + "epoch": 10.762505083367223, + "grad_norm": 0.030411687958626695, + "learning_rate": 1.513963727251555e-06, + "loss": 0.0004, + "step": 26465 + }, + { + "epoch": 10.762911752745019, + "grad_norm": 0.003183390103020654, + "learning_rate": 1.513427764676202e-06, + "loss": 0.0, + "step": 26466 + }, + { + "epoch": 10.763318422122815, + "grad_norm": 0.08613555369012829, + "learning_rate": 1.5128918892197842e-06, + "loss": 0.0005, + "step": 26467 + }, + { + "epoch": 10.76372509150061, + "grad_norm": 0.0351063347419065, + "learning_rate": 
1.5123561008878018e-06, + "loss": 0.0003, + "step": 26468 + }, + { + "epoch": 10.764131760878406, + "grad_norm": 0.40210600822831505, + "learning_rate": 1.5118203996857573e-06, + "loss": 0.0023, + "step": 26469 + }, + { + "epoch": 10.764538430256202, + "grad_norm": 0.004283457940290664, + "learning_rate": 1.5112847856191449e-06, + "loss": 0.0, + "step": 26470 + }, + { + "epoch": 10.764945099633998, + "grad_norm": 0.006888291972551992, + "learning_rate": 1.5107492586934692e-06, + "loss": 0.0, + "step": 26471 + }, + { + "epoch": 10.765351769011794, + "grad_norm": 1.8887886210497027, + "learning_rate": 1.5102138189142246e-06, + "loss": 0.0134, + "step": 26472 + }, + { + "epoch": 10.76575843838959, + "grad_norm": 0.01634110848127037, + "learning_rate": 1.509678466286908e-06, + "loss": 0.0002, + "step": 26473 + }, + { + "epoch": 10.766165107767385, + "grad_norm": 3.053178792711925, + "learning_rate": 1.5091432008170159e-06, + "loss": 0.0265, + "step": 26474 + }, + { + "epoch": 10.766571777145181, + "grad_norm": 0.25951952994647803, + "learning_rate": 1.5086080225100408e-06, + "loss": 0.0015, + "step": 26475 + }, + { + "epoch": 10.766978446522977, + "grad_norm": 0.003851638689811994, + "learning_rate": 1.5080729313714792e-06, + "loss": 0.0, + "step": 26476 + }, + { + "epoch": 10.767385115900773, + "grad_norm": 0.05823546423837558, + "learning_rate": 1.5075379274068202e-06, + "loss": 0.0007, + "step": 26477 + }, + { + "epoch": 10.767791785278568, + "grad_norm": 0.01265470694052512, + "learning_rate": 1.5070030106215604e-06, + "loss": 0.0002, + "step": 26478 + }, + { + "epoch": 10.768198454656364, + "grad_norm": 0.040467326806235326, + "learning_rate": 1.50646818102119e-06, + "loss": 0.0003, + "step": 26479 + }, + { + "epoch": 10.76860512403416, + "grad_norm": 0.3675421082381354, + "learning_rate": 1.5059334386111978e-06, + "loss": 0.0038, + "step": 26480 + }, + { + "epoch": 10.769011793411956, + "grad_norm": 2.1379352068632915, + "learning_rate": 1.505398783397074e-06, + "loss": 0.0292, + "step": 26481 + }, + { + "epoch": 10.769418462789751, + "grad_norm": 1.1483100879533186, + "learning_rate": 1.5048642153843063e-06, + "loss": 0.0112, + "step": 26482 + }, + { + "epoch": 10.769825132167547, + "grad_norm": 0.01893328965259543, + "learning_rate": 1.5043297345783814e-06, + "loss": 0.0002, + "step": 26483 + }, + { + "epoch": 10.770231801545343, + "grad_norm": 0.001282207303924738, + "learning_rate": 1.5037953409847895e-06, + "loss": 0.0, + "step": 26484 + }, + { + "epoch": 10.770638470923139, + "grad_norm": 0.04679260218631033, + "learning_rate": 1.5032610346090148e-06, + "loss": 0.0005, + "step": 26485 + }, + { + "epoch": 10.771045140300934, + "grad_norm": 0.0026995654333363773, + "learning_rate": 1.502726815456541e-06, + "loss": 0.0, + "step": 26486 + }, + { + "epoch": 10.771451809678732, + "grad_norm": 0.18530710854059024, + "learning_rate": 1.502192683532855e-06, + "loss": 0.0014, + "step": 26487 + }, + { + "epoch": 10.771858479056528, + "grad_norm": 0.004629479895804148, + "learning_rate": 1.5016586388434363e-06, + "loss": 0.0, + "step": 26488 + }, + { + "epoch": 10.772265148434323, + "grad_norm": 0.0001937271232224479, + "learning_rate": 1.5011246813937686e-06, + "loss": 0.0, + "step": 26489 + }, + { + "epoch": 10.77267181781212, + "grad_norm": 0.0025670992664229113, + "learning_rate": 1.5005908111893319e-06, + "loss": 0.0, + "step": 26490 + }, + { + "epoch": 10.773078487189915, + "grad_norm": 0.050230501894041414, + "learning_rate": 1.500057028235611e-06, + "loss": 0.0003, + "step": 26491 + }, 
+ { + "epoch": 10.77348515656771, + "grad_norm": 0.1198221667371805, + "learning_rate": 1.499523332538082e-06, + "loss": 0.0011, + "step": 26492 + }, + { + "epoch": 10.773891825945507, + "grad_norm": 0.00023583888263590614, + "learning_rate": 1.4989897241022233e-06, + "loss": 0.0, + "step": 26493 + }, + { + "epoch": 10.774298495323302, + "grad_norm": 0.0772473187131672, + "learning_rate": 1.4984562029335149e-06, + "loss": 0.001, + "step": 26494 + }, + { + "epoch": 10.774705164701098, + "grad_norm": 0.11293444967646404, + "learning_rate": 1.497922769037432e-06, + "loss": 0.0015, + "step": 26495 + }, + { + "epoch": 10.775111834078894, + "grad_norm": 0.03103740899614112, + "learning_rate": 1.4973894224194486e-06, + "loss": 0.0004, + "step": 26496 + }, + { + "epoch": 10.77551850345669, + "grad_norm": 0.028126530741972904, + "learning_rate": 1.496856163085043e-06, + "loss": 0.0002, + "step": 26497 + }, + { + "epoch": 10.775925172834485, + "grad_norm": 0.1487732045833877, + "learning_rate": 1.4963229910396903e-06, + "loss": 0.0011, + "step": 26498 + }, + { + "epoch": 10.776331842212281, + "grad_norm": 0.010770623109959195, + "learning_rate": 1.4957899062888615e-06, + "loss": 0.0001, + "step": 26499 + }, + { + "epoch": 10.776738511590077, + "grad_norm": 0.07435092237025984, + "learning_rate": 1.4952569088380286e-06, + "loss": 0.0009, + "step": 26500 + }, + { + "epoch": 10.777145180967873, + "grad_norm": 0.00015586292862876378, + "learning_rate": 1.4947239986926642e-06, + "loss": 0.0, + "step": 26501 + }, + { + "epoch": 10.777551850345668, + "grad_norm": 0.006378029884525401, + "learning_rate": 1.4941911758582395e-06, + "loss": 0.0001, + "step": 26502 + }, + { + "epoch": 10.777958519723464, + "grad_norm": 0.0027493401627731286, + "learning_rate": 1.4936584403402199e-06, + "loss": 0.0, + "step": 26503 + }, + { + "epoch": 10.77836518910126, + "grad_norm": 0.0012306852358422561, + "learning_rate": 1.49312579214408e-06, + "loss": 0.0, + "step": 26504 + }, + { + "epoch": 10.778771858479056, + "grad_norm": 0.3445050217280165, + "learning_rate": 1.4925932312752855e-06, + "loss": 0.0033, + "step": 26505 + }, + { + "epoch": 10.779178527856853, + "grad_norm": 0.0016554813594388059, + "learning_rate": 1.4920607577393032e-06, + "loss": 0.0, + "step": 26506 + }, + { + "epoch": 10.779585197234649, + "grad_norm": 0.0030016335710401115, + "learning_rate": 1.4915283715415985e-06, + "loss": 0.0, + "step": 26507 + }, + { + "epoch": 10.779991866612445, + "grad_norm": 0.013835360006144423, + "learning_rate": 1.4909960726876382e-06, + "loss": 0.0001, + "step": 26508 + }, + { + "epoch": 10.78039853599024, + "grad_norm": 0.1552566514010371, + "learning_rate": 1.4904638611828836e-06, + "loss": 0.0013, + "step": 26509 + }, + { + "epoch": 10.780805205368036, + "grad_norm": 0.003918415748862892, + "learning_rate": 1.4899317370328015e-06, + "loss": 0.0, + "step": 26510 + }, + { + "epoch": 10.781211874745832, + "grad_norm": 0.014111317046224802, + "learning_rate": 1.489399700242854e-06, + "loss": 0.0001, + "step": 26511 + }, + { + "epoch": 10.781618544123628, + "grad_norm": 0.025680956379246646, + "learning_rate": 1.4888677508185012e-06, + "loss": 0.0002, + "step": 26512 + }, + { + "epoch": 10.782025213501424, + "grad_norm": 0.0009602907705496756, + "learning_rate": 1.4883358887652044e-06, + "loss": 0.0, + "step": 26513 + }, + { + "epoch": 10.78243188287922, + "grad_norm": 0.0016634789314276916, + "learning_rate": 1.4878041140884248e-06, + "loss": 0.0, + "step": 26514 + }, + { + "epoch": 10.782838552257015, + "grad_norm": 
0.2693256798423683, + "learning_rate": 1.487272426793619e-06, + "loss": 0.002, + "step": 26515 + }, + { + "epoch": 10.783245221634811, + "grad_norm": 0.006897627780722702, + "learning_rate": 1.4867408268862438e-06, + "loss": 0.0, + "step": 26516 + }, + { + "epoch": 10.783651891012607, + "grad_norm": 1.148384430353608, + "learning_rate": 1.4862093143717615e-06, + "loss": 0.011, + "step": 26517 + }, + { + "epoch": 10.784058560390402, + "grad_norm": 0.003091720357941278, + "learning_rate": 1.4856778892556246e-06, + "loss": 0.0, + "step": 26518 + }, + { + "epoch": 10.784465229768198, + "grad_norm": 0.08328771828285154, + "learning_rate": 1.4851465515432905e-06, + "loss": 0.0008, + "step": 26519 + }, + { + "epoch": 10.784871899145994, + "grad_norm": 0.013691819272749366, + "learning_rate": 1.484615301240212e-06, + "loss": 0.0001, + "step": 26520 + }, + { + "epoch": 10.78527856852379, + "grad_norm": 0.013220439685380152, + "learning_rate": 1.4840841383518422e-06, + "loss": 0.0002, + "step": 26521 + }, + { + "epoch": 10.785685237901586, + "grad_norm": 0.0017764244890369626, + "learning_rate": 1.4835530628836358e-06, + "loss": 0.0, + "step": 26522 + }, + { + "epoch": 10.786091907279381, + "grad_norm": 0.8247640357409167, + "learning_rate": 1.4830220748410407e-06, + "loss": 0.0053, + "step": 26523 + }, + { + "epoch": 10.786498576657177, + "grad_norm": 0.41942465887347924, + "learning_rate": 1.4824911742295135e-06, + "loss": 0.0036, + "step": 26524 + }, + { + "epoch": 10.786905246034973, + "grad_norm": 0.007153240535565347, + "learning_rate": 1.481960361054501e-06, + "loss": 0.0001, + "step": 26525 + }, + { + "epoch": 10.787311915412769, + "grad_norm": 0.0011795880660723983, + "learning_rate": 1.481429635321452e-06, + "loss": 0.0, + "step": 26526 + }, + { + "epoch": 10.787718584790564, + "grad_norm": 0.018572690444839977, + "learning_rate": 1.4808989970358146e-06, + "loss": 0.0002, + "step": 26527 + }, + { + "epoch": 10.788125254168362, + "grad_norm": 0.004139113745467582, + "learning_rate": 1.4803684462030378e-06, + "loss": 0.0, + "step": 26528 + }, + { + "epoch": 10.788531923546158, + "grad_norm": 0.0015468318623610396, + "learning_rate": 1.4798379828285635e-06, + "loss": 0.0, + "step": 26529 + }, + { + "epoch": 10.788938592923953, + "grad_norm": 0.0041420561093184465, + "learning_rate": 1.4793076069178424e-06, + "loss": 0.0, + "step": 26530 + }, + { + "epoch": 10.78934526230175, + "grad_norm": 0.00019153977941121803, + "learning_rate": 1.4787773184763176e-06, + "loss": 0.0, + "step": 26531 + }, + { + "epoch": 10.789751931679545, + "grad_norm": 0.0549210442958145, + "learning_rate": 1.4782471175094325e-06, + "loss": 0.0003, + "step": 26532 + }, + { + "epoch": 10.79015860105734, + "grad_norm": 0.06546670128997156, + "learning_rate": 1.4777170040226297e-06, + "loss": 0.0002, + "step": 26533 + }, + { + "epoch": 10.790565270435136, + "grad_norm": 0.003552300660267643, + "learning_rate": 1.477186978021351e-06, + "loss": 0.0, + "step": 26534 + }, + { + "epoch": 10.790971939812932, + "grad_norm": 15.0196800829752, + "learning_rate": 1.4766570395110369e-06, + "loss": 0.255, + "step": 26535 + }, + { + "epoch": 10.791378609190728, + "grad_norm": 0.011424698424517394, + "learning_rate": 1.4761271884971284e-06, + "loss": 0.0001, + "step": 26536 + }, + { + "epoch": 10.791785278568524, + "grad_norm": 0.11683657595300453, + "learning_rate": 1.4755974249850647e-06, + "loss": 0.0009, + "step": 26537 + }, + { + "epoch": 10.79219194794632, + "grad_norm": 0.008639359710356408, + "learning_rate": 
1.4750677489802822e-06, + "loss": 0.0001, + "step": 26538 + }, + { + "epoch": 10.792598617324115, + "grad_norm": 0.15735565491761438, + "learning_rate": 1.4745381604882215e-06, + "loss": 0.0016, + "step": 26539 + }, + { + "epoch": 10.793005286701911, + "grad_norm": 2.1789203377099358, + "learning_rate": 1.4740086595143167e-06, + "loss": 0.0136, + "step": 26540 + }, + { + "epoch": 10.793411956079707, + "grad_norm": 0.040164569968978416, + "learning_rate": 1.4734792460640012e-06, + "loss": 0.0002, + "step": 26541 + }, + { + "epoch": 10.793818625457503, + "grad_norm": 0.01289609228421439, + "learning_rate": 1.4729499201427155e-06, + "loss": 0.0002, + "step": 26542 + }, + { + "epoch": 10.794225294835298, + "grad_norm": 0.0022685607249111873, + "learning_rate": 1.4724206817558905e-06, + "loss": 0.0, + "step": 26543 + }, + { + "epoch": 10.794631964213094, + "grad_norm": 0.017892122012133346, + "learning_rate": 1.4718915309089587e-06, + "loss": 0.0001, + "step": 26544 + }, + { + "epoch": 10.79503863359089, + "grad_norm": 0.37983975650654656, + "learning_rate": 1.4713624676073524e-06, + "loss": 0.0037, + "step": 26545 + }, + { + "epoch": 10.795445302968686, + "grad_norm": 0.13552768947263683, + "learning_rate": 1.4708334918565026e-06, + "loss": 0.0015, + "step": 26546 + }, + { + "epoch": 10.795851972346483, + "grad_norm": 0.010826033482693566, + "learning_rate": 1.4703046036618385e-06, + "loss": 0.0001, + "step": 26547 + }, + { + "epoch": 10.796258641724279, + "grad_norm": 0.1950629633362615, + "learning_rate": 1.4697758030287912e-06, + "loss": 0.0017, + "step": 26548 + }, + { + "epoch": 10.796665311102075, + "grad_norm": 1.2408676234183658, + "learning_rate": 1.4692470899627898e-06, + "loss": 0.0123, + "step": 26549 + }, + { + "epoch": 10.79707198047987, + "grad_norm": 0.007780640556599241, + "learning_rate": 1.4687184644692597e-06, + "loss": 0.0001, + "step": 26550 + }, + { + "epoch": 10.797478649857666, + "grad_norm": 0.0035862068187460976, + "learning_rate": 1.4681899265536282e-06, + "loss": 0.0, + "step": 26551 + }, + { + "epoch": 10.797885319235462, + "grad_norm": 0.09121727113582657, + "learning_rate": 1.4676614762213214e-06, + "loss": 0.001, + "step": 26552 + }, + { + "epoch": 10.798291988613258, + "grad_norm": 0.05075480975296711, + "learning_rate": 1.4671331134777645e-06, + "loss": 0.0003, + "step": 26553 + }, + { + "epoch": 10.798698657991054, + "grad_norm": 0.004290385232212851, + "learning_rate": 1.4666048383283771e-06, + "loss": 0.0, + "step": 26554 + }, + { + "epoch": 10.79910532736885, + "grad_norm": 0.01253823063428429, + "learning_rate": 1.4660766507785885e-06, + "loss": 0.0001, + "step": 26555 + }, + { + "epoch": 10.799511996746645, + "grad_norm": 0.026295725001183077, + "learning_rate": 1.4655485508338185e-06, + "loss": 0.0003, + "step": 26556 + }, + { + "epoch": 10.79991866612444, + "grad_norm": 0.00014055683859752195, + "learning_rate": 1.4650205384994876e-06, + "loss": 0.0, + "step": 26557 + }, + { + "epoch": 10.800325335502237, + "grad_norm": 0.0028133482806302233, + "learning_rate": 1.4644926137810168e-06, + "loss": 0.0, + "step": 26558 + }, + { + "epoch": 10.800732004880032, + "grad_norm": 0.19323701822352427, + "learning_rate": 1.463964776683825e-06, + "loss": 0.0014, + "step": 26559 + }, + { + "epoch": 10.801138674257828, + "grad_norm": 0.010566748037790183, + "learning_rate": 1.463437027213328e-06, + "loss": 0.0001, + "step": 26560 + }, + { + "epoch": 10.801545343635624, + "grad_norm": 0.7757422842766393, + "learning_rate": 1.4629093653749493e-06, + "loss": 0.0061, + 
"step": 26561 + }, + { + "epoch": 10.80195201301342, + "grad_norm": 0.0003054018510638829, + "learning_rate": 1.4623817911741022e-06, + "loss": 0.0, + "step": 26562 + }, + { + "epoch": 10.802358682391215, + "grad_norm": 0.026300380355031655, + "learning_rate": 1.4618543046162027e-06, + "loss": 0.0002, + "step": 26563 + }, + { + "epoch": 10.802765351769011, + "grad_norm": 0.30194645871605036, + "learning_rate": 1.461326905706666e-06, + "loss": 0.0016, + "step": 26564 + }, + { + "epoch": 10.803172021146807, + "grad_norm": 0.0009237842701727188, + "learning_rate": 1.4607995944509068e-06, + "loss": 0.0, + "step": 26565 + }, + { + "epoch": 10.803578690524603, + "grad_norm": 0.0007274102923446358, + "learning_rate": 1.4602723708543364e-06, + "loss": 0.0, + "step": 26566 + }, + { + "epoch": 10.803985359902398, + "grad_norm": 0.06686828870879134, + "learning_rate": 1.459745234922366e-06, + "loss": 0.0006, + "step": 26567 + }, + { + "epoch": 10.804392029280194, + "grad_norm": 0.008456907151164334, + "learning_rate": 1.4592181866604115e-06, + "loss": 0.0001, + "step": 26568 + }, + { + "epoch": 10.804798698657992, + "grad_norm": 1.5901106283394537, + "learning_rate": 1.4586912260738816e-06, + "loss": 0.0184, + "step": 26569 + }, + { + "epoch": 10.805205368035788, + "grad_norm": 0.0003264945140297559, + "learning_rate": 1.4581643531681844e-06, + "loss": 0.0, + "step": 26570 + }, + { + "epoch": 10.805612037413583, + "grad_norm": 0.07082193703804598, + "learning_rate": 1.4576375679487286e-06, + "loss": 0.0007, + "step": 26571 + }, + { + "epoch": 10.806018706791379, + "grad_norm": 0.3357698570053438, + "learning_rate": 1.4571108704209225e-06, + "loss": 0.0043, + "step": 26572 + }, + { + "epoch": 10.806425376169175, + "grad_norm": 2.388780462503901e-05, + "learning_rate": 1.4565842605901715e-06, + "loss": 0.0, + "step": 26573 + }, + { + "epoch": 10.80683204554697, + "grad_norm": 0.0007770841608590362, + "learning_rate": 1.4560577384618847e-06, + "loss": 0.0, + "step": 26574 + }, + { + "epoch": 10.807238714924766, + "grad_norm": 0.03810338531835329, + "learning_rate": 1.4555313040414654e-06, + "loss": 0.0002, + "step": 26575 + }, + { + "epoch": 10.807645384302562, + "grad_norm": 0.06894281348037737, + "learning_rate": 1.4550049573343173e-06, + "loss": 0.0004, + "step": 26576 + }, + { + "epoch": 10.808052053680358, + "grad_norm": 9.96926796847023, + "learning_rate": 1.4544786983458436e-06, + "loss": 0.3507, + "step": 26577 + }, + { + "epoch": 10.808458723058154, + "grad_norm": 0.014506620576634174, + "learning_rate": 1.453952527081447e-06, + "loss": 0.0001, + "step": 26578 + }, + { + "epoch": 10.80886539243595, + "grad_norm": 0.003751258225000279, + "learning_rate": 1.4534264435465296e-06, + "loss": 0.0, + "step": 26579 + }, + { + "epoch": 10.809272061813745, + "grad_norm": 0.05590021239190339, + "learning_rate": 1.4529004477464882e-06, + "loss": 0.0005, + "step": 26580 + }, + { + "epoch": 10.809678731191541, + "grad_norm": 0.00436068182162009, + "learning_rate": 1.4523745396867283e-06, + "loss": 0.0, + "step": 26581 + }, + { + "epoch": 10.810085400569337, + "grad_norm": 0.028839038466824393, + "learning_rate": 1.451848719372645e-06, + "loss": 0.0004, + "step": 26582 + }, + { + "epoch": 10.810492069947133, + "grad_norm": 0.0002282360214084492, + "learning_rate": 1.4513229868096368e-06, + "loss": 0.0, + "step": 26583 + }, + { + "epoch": 10.810898739324928, + "grad_norm": 0.0004302562760860981, + "learning_rate": 1.4507973420031008e-06, + "loss": 0.0, + "step": 26584 + }, + { + "epoch": 10.811305408702724, + 
"grad_norm": 0.0012351922252221018, + "learning_rate": 1.4502717849584335e-06, + "loss": 0.0, + "step": 26585 + }, + { + "epoch": 10.81171207808052, + "grad_norm": 0.001831233492673716, + "learning_rate": 1.4497463156810266e-06, + "loss": 0.0, + "step": 26586 + }, + { + "epoch": 10.812118747458316, + "grad_norm": 0.00726126892688003, + "learning_rate": 1.4492209341762787e-06, + "loss": 0.0001, + "step": 26587 + }, + { + "epoch": 10.812525416836113, + "grad_norm": 0.7057052201860007, + "learning_rate": 1.4486956404495823e-06, + "loss": 0.0067, + "step": 26588 + }, + { + "epoch": 10.812932086213909, + "grad_norm": 0.18915801230289647, + "learning_rate": 1.4481704345063286e-06, + "loss": 0.0019, + "step": 26589 + }, + { + "epoch": 10.813338755591705, + "grad_norm": 0.007680238293107061, + "learning_rate": 1.447645316351909e-06, + "loss": 0.0001, + "step": 26590 + }, + { + "epoch": 10.8137454249695, + "grad_norm": 0.05737968202596133, + "learning_rate": 1.4471202859917143e-06, + "loss": 0.0004, + "step": 26591 + }, + { + "epoch": 10.814152094347296, + "grad_norm": 0.7481632386103684, + "learning_rate": 1.4465953434311353e-06, + "loss": 0.0047, + "step": 26592 + }, + { + "epoch": 10.814558763725092, + "grad_norm": 0.0001607360769060063, + "learning_rate": 1.4460704886755572e-06, + "loss": 0.0, + "step": 26593 + }, + { + "epoch": 10.814965433102888, + "grad_norm": 0.006996488103354584, + "learning_rate": 1.4455457217303738e-06, + "loss": 0.0, + "step": 26594 + }, + { + "epoch": 10.815372102480683, + "grad_norm": 0.020194165614841202, + "learning_rate": 1.445021042600968e-06, + "loss": 0.0001, + "step": 26595 + }, + { + "epoch": 10.81577877185848, + "grad_norm": 0.0442031564256147, + "learning_rate": 1.4444964512927262e-06, + "loss": 0.0003, + "step": 26596 + }, + { + "epoch": 10.816185441236275, + "grad_norm": 0.0020796970803924762, + "learning_rate": 1.4439719478110358e-06, + "loss": 0.0, + "step": 26597 + }, + { + "epoch": 10.81659211061407, + "grad_norm": 0.0009693367484686338, + "learning_rate": 1.4434475321612783e-06, + "loss": 0.0, + "step": 26598 + }, + { + "epoch": 10.816998779991867, + "grad_norm": 0.06408878092374132, + "learning_rate": 1.4429232043488373e-06, + "loss": 0.0005, + "step": 26599 + }, + { + "epoch": 10.817405449369662, + "grad_norm": 0.005231903881514054, + "learning_rate": 1.442398964379097e-06, + "loss": 0.0, + "step": 26600 + }, + { + "epoch": 10.817812118747458, + "grad_norm": 0.025068587569463015, + "learning_rate": 1.441874812257439e-06, + "loss": 0.0002, + "step": 26601 + }, + { + "epoch": 10.818218788125254, + "grad_norm": 0.02493065338713808, + "learning_rate": 1.441350747989244e-06, + "loss": 0.0003, + "step": 26602 + }, + { + "epoch": 10.81862545750305, + "grad_norm": 1.9383211345001654, + "learning_rate": 1.4408267715798896e-06, + "loss": 0.0045, + "step": 26603 + }, + { + "epoch": 10.819032126880845, + "grad_norm": 0.11555756584101848, + "learning_rate": 1.4403028830347575e-06, + "loss": 0.0011, + "step": 26604 + }, + { + "epoch": 10.819438796258641, + "grad_norm": 0.0434639761168851, + "learning_rate": 1.439779082359224e-06, + "loss": 0.0003, + "step": 26605 + }, + { + "epoch": 10.819845465636437, + "grad_norm": 0.0064108337675129745, + "learning_rate": 1.439255369558663e-06, + "loss": 0.0, + "step": 26606 + }, + { + "epoch": 10.820252135014233, + "grad_norm": 0.9452451928147961, + "learning_rate": 1.4387317446384575e-06, + "loss": 0.0068, + "step": 26607 + }, + { + "epoch": 10.820658804392028, + "grad_norm": 0.0008794917996912239, + "learning_rate": 
1.4382082076039795e-06, + "loss": 0.0, + "step": 26608 + }, + { + "epoch": 10.821065473769824, + "grad_norm": 0.047585614107698904, + "learning_rate": 1.4376847584606036e-06, + "loss": 0.0004, + "step": 26609 + }, + { + "epoch": 10.821472143147622, + "grad_norm": 0.06650841065338725, + "learning_rate": 1.4371613972137022e-06, + "loss": 0.0005, + "step": 26610 + }, + { + "epoch": 10.821878812525418, + "grad_norm": 0.005988532563626691, + "learning_rate": 1.4366381238686489e-06, + "loss": 0.0001, + "step": 26611 + }, + { + "epoch": 10.822285481903213, + "grad_norm": 0.030590945303241232, + "learning_rate": 1.4361149384308127e-06, + "loss": 0.0004, + "step": 26612 + }, + { + "epoch": 10.822692151281009, + "grad_norm": 0.2964071498085717, + "learning_rate": 1.4355918409055692e-06, + "loss": 0.0022, + "step": 26613 + }, + { + "epoch": 10.823098820658805, + "grad_norm": 0.0014369509071055703, + "learning_rate": 1.4350688312982864e-06, + "loss": 0.0, + "step": 26614 + }, + { + "epoch": 10.8235054900366, + "grad_norm": 0.24916761218985828, + "learning_rate": 1.4345459096143322e-06, + "loss": 0.0011, + "step": 26615 + }, + { + "epoch": 10.823912159414396, + "grad_norm": 0.3884659214953631, + "learning_rate": 1.4340230758590745e-06, + "loss": 0.0014, + "step": 26616 + }, + { + "epoch": 10.824318828792192, + "grad_norm": 0.0004966538118180965, + "learning_rate": 1.4335003300378813e-06, + "loss": 0.0, + "step": 26617 + }, + { + "epoch": 10.824725498169988, + "grad_norm": 2.2925890038648653, + "learning_rate": 1.4329776721561196e-06, + "loss": 0.0222, + "step": 26618 + }, + { + "epoch": 10.825132167547784, + "grad_norm": 0.05829869419332217, + "learning_rate": 1.4324551022191514e-06, + "loss": 0.0005, + "step": 26619 + }, + { + "epoch": 10.82553883692558, + "grad_norm": 0.005950818370577997, + "learning_rate": 1.4319326202323447e-06, + "loss": 0.0001, + "step": 26620 + }, + { + "epoch": 10.825945506303375, + "grad_norm": 0.0020885132664052257, + "learning_rate": 1.4314102262010621e-06, + "loss": 0.0, + "step": 26621 + }, + { + "epoch": 10.826352175681171, + "grad_norm": 0.0019415951927859128, + "learning_rate": 1.4308879201306668e-06, + "loss": 0.0, + "step": 26622 + }, + { + "epoch": 10.826758845058967, + "grad_norm": 1.1425137512909822, + "learning_rate": 1.4303657020265193e-06, + "loss": 0.0114, + "step": 26623 + }, + { + "epoch": 10.827165514436762, + "grad_norm": 0.06794389315933667, + "learning_rate": 1.4298435718939806e-06, + "loss": 0.0006, + "step": 26624 + }, + { + "epoch": 10.827572183814558, + "grad_norm": 2.70658204645917e-05, + "learning_rate": 1.4293215297384089e-06, + "loss": 0.0, + "step": 26625 + }, + { + "epoch": 10.827978853192354, + "grad_norm": 0.017134446317584418, + "learning_rate": 1.4287995755651684e-06, + "loss": 0.0001, + "step": 26626 + }, + { + "epoch": 10.82838552257015, + "grad_norm": 0.003021190767747499, + "learning_rate": 1.428277709379613e-06, + "loss": 0.0, + "step": 26627 + }, + { + "epoch": 10.828792191947946, + "grad_norm": 0.01947246006404955, + "learning_rate": 1.4277559311871014e-06, + "loss": 0.0002, + "step": 26628 + }, + { + "epoch": 10.829198861325743, + "grad_norm": 0.12373920544125804, + "learning_rate": 1.4272342409929886e-06, + "loss": 0.0012, + "step": 26629 + }, + { + "epoch": 10.829605530703539, + "grad_norm": 0.00046094832693106857, + "learning_rate": 1.426712638802632e-06, + "loss": 0.0, + "step": 26630 + }, + { + "epoch": 10.830012200081335, + "grad_norm": 0.03245442806713815, + "learning_rate": 1.4261911246213844e-06, + "loss": 0.0003, + "step": 
26631 + }, + { + "epoch": 10.83041886945913, + "grad_norm": 0.0016758731705768608, + "learning_rate": 1.425669698454598e-06, + "loss": 0.0, + "step": 26632 + }, + { + "epoch": 10.830825538836926, + "grad_norm": 0.012828969573370944, + "learning_rate": 1.4251483603076322e-06, + "loss": 0.0001, + "step": 26633 + }, + { + "epoch": 10.831232208214722, + "grad_norm": 0.5388889718438737, + "learning_rate": 1.4246271101858312e-06, + "loss": 0.0058, + "step": 26634 + }, + { + "epoch": 10.831638877592518, + "grad_norm": 3.317945820554255, + "learning_rate": 1.4241059480945497e-06, + "loss": 0.0324, + "step": 26635 + }, + { + "epoch": 10.832045546970313, + "grad_norm": 0.06236557241552692, + "learning_rate": 1.4235848740391357e-06, + "loss": 0.0003, + "step": 26636 + }, + { + "epoch": 10.83245221634811, + "grad_norm": 0.009307719913828696, + "learning_rate": 1.4230638880249382e-06, + "loss": 0.0001, + "step": 26637 + }, + { + "epoch": 10.832858885725905, + "grad_norm": 0.12333334541956097, + "learning_rate": 1.4225429900573084e-06, + "loss": 0.0007, + "step": 26638 + }, + { + "epoch": 10.8332655551037, + "grad_norm": 0.15352943108279757, + "learning_rate": 1.4220221801415912e-06, + "loss": 0.0014, + "step": 26639 + }, + { + "epoch": 10.833672224481496, + "grad_norm": 0.10409672907454544, + "learning_rate": 1.4215014582831344e-06, + "loss": 0.0009, + "step": 26640 + }, + { + "epoch": 10.834078893859292, + "grad_norm": 0.008926364676150458, + "learning_rate": 1.4209808244872825e-06, + "loss": 0.0001, + "step": 26641 + }, + { + "epoch": 10.834485563237088, + "grad_norm": 0.013101927415109587, + "learning_rate": 1.4204602787593802e-06, + "loss": 0.0001, + "step": 26642 + }, + { + "epoch": 10.834892232614884, + "grad_norm": 0.016287557958621385, + "learning_rate": 1.41993982110477e-06, + "loss": 0.0001, + "step": 26643 + }, + { + "epoch": 10.83529890199268, + "grad_norm": 0.45140076592683953, + "learning_rate": 1.4194194515287951e-06, + "loss": 0.0052, + "step": 26644 + }, + { + "epoch": 10.835705571370475, + "grad_norm": 0.0015628372345999491, + "learning_rate": 1.4188991700367994e-06, + "loss": 0.0, + "step": 26645 + }, + { + "epoch": 10.836112240748271, + "grad_norm": 0.011043386260504924, + "learning_rate": 1.4183789766341228e-06, + "loss": 0.0001, + "step": 26646 + }, + { + "epoch": 10.836518910126067, + "grad_norm": 0.00920263327667333, + "learning_rate": 1.4178588713261043e-06, + "loss": 0.0001, + "step": 26647 + }, + { + "epoch": 10.836925579503863, + "grad_norm": 0.3572100902870192, + "learning_rate": 1.4173388541180843e-06, + "loss": 0.0039, + "step": 26648 + }, + { + "epoch": 10.837332248881658, + "grad_norm": 0.005518816289138283, + "learning_rate": 1.4168189250154008e-06, + "loss": 0.0, + "step": 26649 + }, + { + "epoch": 10.837738918259454, + "grad_norm": 0.433356310967219, + "learning_rate": 1.4162990840233893e-06, + "loss": 0.0033, + "step": 26650 + }, + { + "epoch": 10.838145587637252, + "grad_norm": 0.061720114677411564, + "learning_rate": 1.4157793311473888e-06, + "loss": 0.0005, + "step": 26651 + }, + { + "epoch": 10.838552257015047, + "grad_norm": 0.0024313789467405966, + "learning_rate": 1.4152596663927343e-06, + "loss": 0.0, + "step": 26652 + }, + { + "epoch": 10.838958926392843, + "grad_norm": 0.0044217999885232985, + "learning_rate": 1.41474008976476e-06, + "loss": 0.0, + "step": 26653 + }, + { + "epoch": 10.839365595770639, + "grad_norm": 0.0027554631051902663, + "learning_rate": 1.4142206012687987e-06, + "loss": 0.0, + "step": 26654 + }, + { + "epoch": 10.839772265148435, + 
"grad_norm": 0.0009434365327176919, + "learning_rate": 1.4137012009101848e-06, + "loss": 0.0, + "step": 26655 + }, + { + "epoch": 10.84017893452623, + "grad_norm": 0.0022496668887677486, + "learning_rate": 1.4131818886942494e-06, + "loss": 0.0, + "step": 26656 + }, + { + "epoch": 10.840585603904026, + "grad_norm": 0.00311818922512403, + "learning_rate": 1.412662664626322e-06, + "loss": 0.0, + "step": 26657 + }, + { + "epoch": 10.840992273281822, + "grad_norm": 0.3209670198952029, + "learning_rate": 1.4121435287117357e-06, + "loss": 0.0027, + "step": 26658 + }, + { + "epoch": 10.841398942659618, + "grad_norm": 0.00014974403989309728, + "learning_rate": 1.4116244809558176e-06, + "loss": 0.0, + "step": 26659 + }, + { + "epoch": 10.841805612037414, + "grad_norm": 0.008865969742092073, + "learning_rate": 1.411105521363898e-06, + "loss": 0.0001, + "step": 26660 + }, + { + "epoch": 10.84221228141521, + "grad_norm": 0.01569301139754388, + "learning_rate": 1.4105866499413023e-06, + "loss": 0.0001, + "step": 26661 + }, + { + "epoch": 10.842618950793005, + "grad_norm": 0.0003842990509284726, + "learning_rate": 1.4100678666933576e-06, + "loss": 0.0, + "step": 26662 + }, + { + "epoch": 10.8430256201708, + "grad_norm": 0.2682226435965017, + "learning_rate": 1.4095491716253873e-06, + "loss": 0.0028, + "step": 26663 + }, + { + "epoch": 10.843432289548597, + "grad_norm": 0.005955171909638354, + "learning_rate": 1.4090305647427205e-06, + "loss": 0.0, + "step": 26664 + }, + { + "epoch": 10.843838958926392, + "grad_norm": 0.04504609439533984, + "learning_rate": 1.4085120460506784e-06, + "loss": 0.0005, + "step": 26665 + }, + { + "epoch": 10.844245628304188, + "grad_norm": 0.00464859997012898, + "learning_rate": 1.4079936155545848e-06, + "loss": 0.0, + "step": 26666 + }, + { + "epoch": 10.844652297681984, + "grad_norm": 0.10661130944050919, + "learning_rate": 1.4074752732597607e-06, + "loss": 0.0011, + "step": 26667 + }, + { + "epoch": 10.84505896705978, + "grad_norm": 0.00033545849115483584, + "learning_rate": 1.4069570191715276e-06, + "loss": 0.0, + "step": 26668 + }, + { + "epoch": 10.845465636437575, + "grad_norm": 0.010685612165412351, + "learning_rate": 1.4064388532952056e-06, + "loss": 0.0001, + "step": 26669 + }, + { + "epoch": 10.845872305815373, + "grad_norm": 0.02633147577063057, + "learning_rate": 1.4059207756361103e-06, + "loss": 0.0003, + "step": 26670 + }, + { + "epoch": 10.846278975193169, + "grad_norm": 0.00014789650765608035, + "learning_rate": 1.4054027861995678e-06, + "loss": 0.0, + "step": 26671 + }, + { + "epoch": 10.846685644570965, + "grad_norm": 0.013978444631702672, + "learning_rate": 1.404884884990889e-06, + "loss": 0.0001, + "step": 26672 + }, + { + "epoch": 10.84709231394876, + "grad_norm": 0.007962042669594837, + "learning_rate": 1.4043670720153947e-06, + "loss": 0.0001, + "step": 26673 + }, + { + "epoch": 10.847498983326556, + "grad_norm": 0.007834179285776528, + "learning_rate": 1.4038493472783977e-06, + "loss": 0.0001, + "step": 26674 + }, + { + "epoch": 10.847905652704352, + "grad_norm": 0.04771261187960925, + "learning_rate": 1.403331710785213e-06, + "loss": 0.0003, + "step": 26675 + }, + { + "epoch": 10.848312322082148, + "grad_norm": 0.08784912131069332, + "learning_rate": 1.402814162541154e-06, + "loss": 0.0009, + "step": 26676 + }, + { + "epoch": 10.848718991459943, + "grad_norm": 9.74041001212298e-05, + "learning_rate": 1.4022967025515356e-06, + "loss": 0.0, + "step": 26677 + }, + { + "epoch": 10.849125660837739, + "grad_norm": 0.01767692798209367, + "learning_rate": 
1.4017793308216688e-06, + "loss": 0.0002, + "step": 26678 + }, + { + "epoch": 10.849532330215535, + "grad_norm": 0.03368557726420152, + "learning_rate": 1.401262047356865e-06, + "loss": 0.0003, + "step": 26679 + }, + { + "epoch": 10.84993899959333, + "grad_norm": 0.0360561322516015, + "learning_rate": 1.4007448521624334e-06, + "loss": 0.0003, + "step": 26680 + }, + { + "epoch": 10.850345668971126, + "grad_norm": 0.4614263720924894, + "learning_rate": 1.400227745243684e-06, + "loss": 0.0018, + "step": 26681 + }, + { + "epoch": 10.850752338348922, + "grad_norm": 0.10387318767285374, + "learning_rate": 1.399710726605925e-06, + "loss": 0.0011, + "step": 26682 + }, + { + "epoch": 10.851159007726718, + "grad_norm": 3.883144072837781, + "learning_rate": 1.399193796254462e-06, + "loss": 0.0712, + "step": 26683 + }, + { + "epoch": 10.851565677104514, + "grad_norm": 0.005879007037881772, + "learning_rate": 1.3986769541946055e-06, + "loss": 0.0, + "step": 26684 + }, + { + "epoch": 10.85197234648231, + "grad_norm": 0.029138926363431, + "learning_rate": 1.3981602004316586e-06, + "loss": 0.0003, + "step": 26685 + }, + { + "epoch": 10.852379015860105, + "grad_norm": 0.00016955382823331106, + "learning_rate": 1.397643534970926e-06, + "loss": 0.0, + "step": 26686 + }, + { + "epoch": 10.852785685237901, + "grad_norm": 0.09834254690788062, + "learning_rate": 1.3971269578177127e-06, + "loss": 0.0008, + "step": 26687 + }, + { + "epoch": 10.853192354615697, + "grad_norm": 0.0003724292500707197, + "learning_rate": 1.3966104689773208e-06, + "loss": 0.0, + "step": 26688 + }, + { + "epoch": 10.853599023993493, + "grad_norm": 0.021822562929068274, + "learning_rate": 1.3960940684550506e-06, + "loss": 0.0003, + "step": 26689 + }, + { + "epoch": 10.854005693371288, + "grad_norm": 0.20928091630801754, + "learning_rate": 1.3955777562562067e-06, + "loss": 0.0016, + "step": 26690 + }, + { + "epoch": 10.854412362749084, + "grad_norm": 0.03831620837967687, + "learning_rate": 1.3950615323860884e-06, + "loss": 0.0004, + "step": 26691 + }, + { + "epoch": 10.854819032126882, + "grad_norm": 0.0899040753174641, + "learning_rate": 1.3945453968499934e-06, + "loss": 0.0008, + "step": 26692 + }, + { + "epoch": 10.855225701504677, + "grad_norm": 11.418323446735828, + "learning_rate": 1.394029349653221e-06, + "loss": 0.5177, + "step": 26693 + }, + { + "epoch": 10.855632370882473, + "grad_norm": 1.347219456241476, + "learning_rate": 1.3935133908010689e-06, + "loss": 0.0055, + "step": 26694 + }, + { + "epoch": 10.856039040260269, + "grad_norm": 0.04212835855601749, + "learning_rate": 1.3929975202988334e-06, + "loss": 0.0002, + "step": 26695 + }, + { + "epoch": 10.856445709638065, + "grad_norm": 0.0003125234275029363, + "learning_rate": 1.3924817381518085e-06, + "loss": 0.0, + "step": 26696 + }, + { + "epoch": 10.85685237901586, + "grad_norm": 0.031732242946718454, + "learning_rate": 1.3919660443652917e-06, + "loss": 0.0003, + "step": 26697 + }, + { + "epoch": 10.857259048393656, + "grad_norm": 0.0006058422137137255, + "learning_rate": 1.391450438944576e-06, + "loss": 0.0, + "step": 26698 + }, + { + "epoch": 10.857665717771452, + "grad_norm": 0.2875462792879202, + "learning_rate": 1.3909349218949554e-06, + "loss": 0.0021, + "step": 26699 + }, + { + "epoch": 10.858072387149248, + "grad_norm": 0.003082091134674255, + "learning_rate": 1.3904194932217197e-06, + "loss": 0.0, + "step": 26700 + }, + { + "epoch": 10.858479056527043, + "grad_norm": 0.07855191222945573, + "learning_rate": 1.3899041529301604e-06, + "loss": 0.0003, + "step": 26701 + 
}, + { + "epoch": 10.85888572590484, + "grad_norm": 0.014894980296930607, + "learning_rate": 1.3893889010255667e-06, + "loss": 0.0002, + "step": 26702 + }, + { + "epoch": 10.859292395282635, + "grad_norm": 0.006046725160279727, + "learning_rate": 1.3888737375132322e-06, + "loss": 0.0, + "step": 26703 + }, + { + "epoch": 10.85969906466043, + "grad_norm": 0.0487044113615182, + "learning_rate": 1.3883586623984425e-06, + "loss": 0.0005, + "step": 26704 + }, + { + "epoch": 10.860105734038227, + "grad_norm": 0.013603393647179543, + "learning_rate": 1.3878436756864855e-06, + "loss": 0.0001, + "step": 26705 + }, + { + "epoch": 10.860512403416022, + "grad_norm": 0.10582509271623315, + "learning_rate": 1.387328777382647e-06, + "loss": 0.0007, + "step": 26706 + }, + { + "epoch": 10.860919072793818, + "grad_norm": 0.6965721970091531, + "learning_rate": 1.3868139674922131e-06, + "loss": 0.002, + "step": 26707 + }, + { + "epoch": 10.861325742171614, + "grad_norm": 0.01802991845552148, + "learning_rate": 1.3862992460204694e-06, + "loss": 0.0002, + "step": 26708 + }, + { + "epoch": 10.86173241154941, + "grad_norm": 0.0002700232614633574, + "learning_rate": 1.385784612972696e-06, + "loss": 0.0, + "step": 26709 + }, + { + "epoch": 10.862139080927205, + "grad_norm": 0.021161916200294412, + "learning_rate": 1.385270068354182e-06, + "loss": 0.0002, + "step": 26710 + }, + { + "epoch": 10.862545750305003, + "grad_norm": 0.06932606325272826, + "learning_rate": 1.3847556121702066e-06, + "loss": 0.0008, + "step": 26711 + }, + { + "epoch": 10.862952419682799, + "grad_norm": 0.003736318179206331, + "learning_rate": 1.38424124442605e-06, + "loss": 0.0, + "step": 26712 + }, + { + "epoch": 10.863359089060594, + "grad_norm": 0.022677174948280903, + "learning_rate": 1.3837269651269947e-06, + "loss": 0.0001, + "step": 26713 + }, + { + "epoch": 10.86376575843839, + "grad_norm": 0.04165173789331514, + "learning_rate": 1.3832127742783174e-06, + "loss": 0.0004, + "step": 26714 + }, + { + "epoch": 10.864172427816186, + "grad_norm": 0.0007117937448152136, + "learning_rate": 1.3826986718852952e-06, + "loss": 0.0, + "step": 26715 + }, + { + "epoch": 10.864579097193982, + "grad_norm": 4.0526754411915396e-05, + "learning_rate": 1.3821846579532117e-06, + "loss": 0.0, + "step": 26716 + }, + { + "epoch": 10.864985766571778, + "grad_norm": 0.00029584479214430704, + "learning_rate": 1.381670732487339e-06, + "loss": 0.0, + "step": 26717 + }, + { + "epoch": 10.865392435949573, + "grad_norm": 0.11003516886194081, + "learning_rate": 1.3811568954929533e-06, + "loss": 0.0014, + "step": 26718 + }, + { + "epoch": 10.865799105327369, + "grad_norm": 0.04044140373887269, + "learning_rate": 1.38064314697533e-06, + "loss": 0.0002, + "step": 26719 + }, + { + "epoch": 10.866205774705165, + "grad_norm": 0.6529586809304045, + "learning_rate": 1.380129486939744e-06, + "loss": 0.002, + "step": 26720 + }, + { + "epoch": 10.86661244408296, + "grad_norm": 0.010220896604166385, + "learning_rate": 1.3796159153914657e-06, + "loss": 0.0001, + "step": 26721 + }, + { + "epoch": 10.867019113460756, + "grad_norm": 0.04989857099496411, + "learning_rate": 1.379102432335767e-06, + "loss": 0.0005, + "step": 26722 + }, + { + "epoch": 10.867425782838552, + "grad_norm": 0.0018087569858572536, + "learning_rate": 1.3785890377779233e-06, + "loss": 0.0, + "step": 26723 + }, + { + "epoch": 10.867832452216348, + "grad_norm": 0.07043813744911304, + "learning_rate": 1.378075731723202e-06, + "loss": 0.0004, + "step": 26724 + }, + { + "epoch": 10.868239121594144, + "grad_norm": 
0.06369264535835968, + "learning_rate": 1.3775625141768723e-06, + "loss": 0.0006, + "step": 26725 + }, + { + "epoch": 10.86864579097194, + "grad_norm": 0.0008472288044681744, + "learning_rate": 1.3770493851442025e-06, + "loss": 0.0, + "step": 26726 + }, + { + "epoch": 10.869052460349735, + "grad_norm": 0.08024501333435818, + "learning_rate": 1.3765363446304614e-06, + "loss": 0.0009, + "step": 26727 + }, + { + "epoch": 10.869459129727531, + "grad_norm": 0.00032636810284057227, + "learning_rate": 1.3760233926409128e-06, + "loss": 0.0, + "step": 26728 + }, + { + "epoch": 10.869865799105327, + "grad_norm": 0.033222679501705536, + "learning_rate": 1.3755105291808256e-06, + "loss": 0.0004, + "step": 26729 + }, + { + "epoch": 10.870272468483122, + "grad_norm": 0.00022671908309696392, + "learning_rate": 1.3749977542554637e-06, + "loss": 0.0, + "step": 26730 + }, + { + "epoch": 10.870679137860918, + "grad_norm": 0.20495364768098837, + "learning_rate": 1.3744850678700916e-06, + "loss": 0.0025, + "step": 26731 + }, + { + "epoch": 10.871085807238714, + "grad_norm": 0.08915304562237876, + "learning_rate": 1.3739724700299705e-06, + "loss": 0.0009, + "step": 26732 + }, + { + "epoch": 10.871492476616512, + "grad_norm": 0.008445866806835536, + "learning_rate": 1.3734599607403664e-06, + "loss": 0.0001, + "step": 26733 + }, + { + "epoch": 10.871899145994307, + "grad_norm": 2.8102795883709926e-05, + "learning_rate": 1.3729475400065328e-06, + "loss": 0.0, + "step": 26734 + }, + { + "epoch": 10.872305815372103, + "grad_norm": 1.0514469680239167, + "learning_rate": 1.3724352078337365e-06, + "loss": 0.0072, + "step": 26735 + }, + { + "epoch": 10.872712484749899, + "grad_norm": 0.0003434503120296539, + "learning_rate": 1.3719229642272346e-06, + "loss": 0.0, + "step": 26736 + }, + { + "epoch": 10.873119154127695, + "grad_norm": 0.013322238081877344, + "learning_rate": 1.3714108091922863e-06, + "loss": 0.0001, + "step": 26737 + }, + { + "epoch": 10.87352582350549, + "grad_norm": 0.029273248502520433, + "learning_rate": 1.370898742734149e-06, + "loss": 0.0002, + "step": 26738 + }, + { + "epoch": 10.873932492883286, + "grad_norm": 0.0341383596805271, + "learning_rate": 1.3703867648580794e-06, + "loss": 0.0004, + "step": 26739 + }, + { + "epoch": 10.874339162261082, + "grad_norm": 0.01854828703664445, + "learning_rate": 1.3698748755693292e-06, + "loss": 0.0002, + "step": 26740 + }, + { + "epoch": 10.874745831638878, + "grad_norm": 0.6615604890184421, + "learning_rate": 1.3693630748731602e-06, + "loss": 0.0059, + "step": 26741 + }, + { + "epoch": 10.875152501016673, + "grad_norm": 0.005612258190498676, + "learning_rate": 1.3688513627748235e-06, + "loss": 0.0001, + "step": 26742 + }, + { + "epoch": 10.87555917039447, + "grad_norm": 0.0017836161770366104, + "learning_rate": 1.3683397392795706e-06, + "loss": 0.0, + "step": 26743 + }, + { + "epoch": 10.875965839772265, + "grad_norm": 0.278080970252805, + "learning_rate": 1.3678282043926539e-06, + "loss": 0.0038, + "step": 26744 + }, + { + "epoch": 10.87637250915006, + "grad_norm": 0.09160965834662364, + "learning_rate": 1.3673167581193258e-06, + "loss": 0.0006, + "step": 26745 + }, + { + "epoch": 10.876779178527856, + "grad_norm": 0.02706308668335619, + "learning_rate": 1.3668054004648367e-06, + "loss": 0.0003, + "step": 26746 + }, + { + "epoch": 10.877185847905652, + "grad_norm": 1.5412426578734557, + "learning_rate": 1.3662941314344313e-06, + "loss": 0.0103, + "step": 26747 + }, + { + "epoch": 10.877592517283448, + "grad_norm": 0.0020981663445438286, + "learning_rate": 
1.3657829510333653e-06, + "loss": 0.0, + "step": 26748 + }, + { + "epoch": 10.877999186661244, + "grad_norm": 0.19334513921040283, + "learning_rate": 1.3652718592668823e-06, + "loss": 0.0012, + "step": 26749 + }, + { + "epoch": 10.87840585603904, + "grad_norm": 0.0027465067376384474, + "learning_rate": 1.3647608561402303e-06, + "loss": 0.0, + "step": 26750 + }, + { + "epoch": 10.878812525416835, + "grad_norm": 0.008296955213285514, + "learning_rate": 1.364249941658653e-06, + "loss": 0.0001, + "step": 26751 + }, + { + "epoch": 10.879219194794633, + "grad_norm": 0.009298419489590173, + "learning_rate": 1.3637391158273971e-06, + "loss": 0.0001, + "step": 26752 + }, + { + "epoch": 10.879625864172429, + "grad_norm": 0.07380296183093207, + "learning_rate": 1.363228378651703e-06, + "loss": 0.0006, + "step": 26753 + }, + { + "epoch": 10.880032533550224, + "grad_norm": 0.5274091924188092, + "learning_rate": 1.362717730136819e-06, + "loss": 0.0062, + "step": 26754 + }, + { + "epoch": 10.88043920292802, + "grad_norm": 0.005927467069671578, + "learning_rate": 1.3622071702879847e-06, + "loss": 0.0001, + "step": 26755 + }, + { + "epoch": 10.880845872305816, + "grad_norm": 0.0022659660590501887, + "learning_rate": 1.3616966991104408e-06, + "loss": 0.0, + "step": 26756 + }, + { + "epoch": 10.881252541683612, + "grad_norm": 6.292766873655274, + "learning_rate": 1.3611863166094274e-06, + "loss": 0.2232, + "step": 26757 + }, + { + "epoch": 10.881659211061407, + "grad_norm": 0.00879109302624345, + "learning_rate": 1.360676022790185e-06, + "loss": 0.0001, + "step": 26758 + }, + { + "epoch": 10.882065880439203, + "grad_norm": 0.09149976856736149, + "learning_rate": 1.3601658176579514e-06, + "loss": 0.0002, + "step": 26759 + }, + { + "epoch": 10.882472549816999, + "grad_norm": 0.010088828506440143, + "learning_rate": 1.3596557012179613e-06, + "loss": 0.0001, + "step": 26760 + }, + { + "epoch": 10.882879219194795, + "grad_norm": 0.0075453029024713, + "learning_rate": 1.359145673475456e-06, + "loss": 0.0001, + "step": 26761 + }, + { + "epoch": 10.88328588857259, + "grad_norm": 0.00011581277537439182, + "learning_rate": 1.3586357344356704e-06, + "loss": 0.0, + "step": 26762 + }, + { + "epoch": 10.883692557950386, + "grad_norm": 0.7188798569911624, + "learning_rate": 1.358125884103837e-06, + "loss": 0.0042, + "step": 26763 + }, + { + "epoch": 10.884099227328182, + "grad_norm": 0.0003617801405862169, + "learning_rate": 1.3576161224851914e-06, + "loss": 0.0, + "step": 26764 + }, + { + "epoch": 10.884505896705978, + "grad_norm": 1.1079243391488027, + "learning_rate": 1.357106449584965e-06, + "loss": 0.008, + "step": 26765 + }, + { + "epoch": 10.884912566083774, + "grad_norm": 0.024307099911245182, + "learning_rate": 1.3565968654083893e-06, + "loss": 0.0002, + "step": 26766 + }, + { + "epoch": 10.88531923546157, + "grad_norm": 0.0020879470452085437, + "learning_rate": 1.356087369960699e-06, + "loss": 0.0, + "step": 26767 + }, + { + "epoch": 10.885725904839365, + "grad_norm": 0.05789169104347574, + "learning_rate": 1.3555779632471223e-06, + "loss": 0.0005, + "step": 26768 + }, + { + "epoch": 10.88613257421716, + "grad_norm": 0.00931828340342504, + "learning_rate": 1.3550686452728888e-06, + "loss": 0.0001, + "step": 26769 + }, + { + "epoch": 10.886539243594957, + "grad_norm": 0.0061797459738759, + "learning_rate": 1.3545594160432263e-06, + "loss": 0.0, + "step": 26770 + }, + { + "epoch": 10.886945912972752, + "grad_norm": 0.012505622788125741, + "learning_rate": 1.3540502755633623e-06, + "loss": 0.0001, + "step": 26771 + 
}, + { + "epoch": 10.887352582350548, + "grad_norm": 0.16896695681590507, + "learning_rate": 1.3535412238385236e-06, + "loss": 0.0008, + "step": 26772 + }, + { + "epoch": 10.887759251728344, + "grad_norm": 0.04832441784132619, + "learning_rate": 1.3530322608739332e-06, + "loss": 0.0003, + "step": 26773 + }, + { + "epoch": 10.888165921106141, + "grad_norm": 0.0028092928763328454, + "learning_rate": 1.3525233866748221e-06, + "loss": 0.0, + "step": 26774 + }, + { + "epoch": 10.888572590483937, + "grad_norm": 0.006910998852826758, + "learning_rate": 1.3520146012464098e-06, + "loss": 0.0001, + "step": 26775 + }, + { + "epoch": 10.888979259861733, + "grad_norm": 0.004529288054216036, + "learning_rate": 1.3515059045939206e-06, + "loss": 0.0, + "step": 26776 + }, + { + "epoch": 10.889385929239529, + "grad_norm": 0.0011717292654630333, + "learning_rate": 1.3509972967225748e-06, + "loss": 0.0, + "step": 26777 + }, + { + "epoch": 10.889792598617325, + "grad_norm": 0.04395105542032317, + "learning_rate": 1.3504887776375952e-06, + "loss": 0.0003, + "step": 26778 + }, + { + "epoch": 10.89019926799512, + "grad_norm": 4.21462414198566, + "learning_rate": 1.3499803473441996e-06, + "loss": 0.0376, + "step": 26779 + }, + { + "epoch": 10.890605937372916, + "grad_norm": 0.0005453898455157755, + "learning_rate": 1.3494720058476118e-06, + "loss": 0.0, + "step": 26780 + }, + { + "epoch": 10.891012606750712, + "grad_norm": 0.035743662986864884, + "learning_rate": 1.3489637531530475e-06, + "loss": 0.0003, + "step": 26781 + }, + { + "epoch": 10.891419276128508, + "grad_norm": 0.030201167053587864, + "learning_rate": 1.3484555892657236e-06, + "loss": 0.0004, + "step": 26782 + }, + { + "epoch": 10.891825945506303, + "grad_norm": 0.9495224183202735, + "learning_rate": 1.347947514190857e-06, + "loss": 0.0097, + "step": 26783 + }, + { + "epoch": 10.892232614884099, + "grad_norm": 0.030030927565194623, + "learning_rate": 1.3474395279336649e-06, + "loss": 0.0003, + "step": 26784 + }, + { + "epoch": 10.892639284261895, + "grad_norm": 0.006359608598671905, + "learning_rate": 1.3469316304993595e-06, + "loss": 0.0001, + "step": 26785 + }, + { + "epoch": 10.89304595363969, + "grad_norm": 0.024693735957509098, + "learning_rate": 1.3464238218931546e-06, + "loss": 0.0001, + "step": 26786 + }, + { + "epoch": 10.893452623017486, + "grad_norm": 0.1510071363022477, + "learning_rate": 1.345916102120267e-06, + "loss": 0.0017, + "step": 26787 + }, + { + "epoch": 10.893859292395282, + "grad_norm": 0.01711897781827088, + "learning_rate": 1.345408471185905e-06, + "loss": 0.0002, + "step": 26788 + }, + { + "epoch": 10.894265961773078, + "grad_norm": 0.12054496954394357, + "learning_rate": 1.344900929095282e-06, + "loss": 0.0012, + "step": 26789 + }, + { + "epoch": 10.894672631150874, + "grad_norm": 0.41835527211419343, + "learning_rate": 1.3443934758536059e-06, + "loss": 0.003, + "step": 26790 + }, + { + "epoch": 10.89507930052867, + "grad_norm": 0.015052714247983667, + "learning_rate": 1.3438861114660874e-06, + "loss": 0.0001, + "step": 26791 + }, + { + "epoch": 10.895485969906465, + "grad_norm": 0.03814489332805449, + "learning_rate": 1.343378835937933e-06, + "loss": 0.0004, + "step": 26792 + }, + { + "epoch": 10.895892639284263, + "grad_norm": 0.017519309997978214, + "learning_rate": 1.3428716492743521e-06, + "loss": 0.0002, + "step": 26793 + }, + { + "epoch": 10.896299308662059, + "grad_norm": 0.03926260418030402, + "learning_rate": 1.3423645514805528e-06, + "loss": 0.0004, + "step": 26794 + }, + { + "epoch": 10.896705978039854, + 
"grad_norm": 0.040261318503891357, + "learning_rate": 1.3418575425617376e-06, + "loss": 0.0004, + "step": 26795 + }, + { + "epoch": 10.89711264741765, + "grad_norm": 0.3959698841654432, + "learning_rate": 1.341350622523112e-06, + "loss": 0.0054, + "step": 26796 + }, + { + "epoch": 10.897519316795446, + "grad_norm": 0.24002362250777756, + "learning_rate": 1.34084379136988e-06, + "loss": 0.0018, + "step": 26797 + }, + { + "epoch": 10.897925986173242, + "grad_norm": 0.05124667608765277, + "learning_rate": 1.3403370491072453e-06, + "loss": 0.0004, + "step": 26798 + }, + { + "epoch": 10.898332655551037, + "grad_norm": 0.021923065127207545, + "learning_rate": 1.3398303957404069e-06, + "loss": 0.0002, + "step": 26799 + }, + { + "epoch": 10.898739324928833, + "grad_norm": 10.365978929436231, + "learning_rate": 1.3393238312745704e-06, + "loss": 0.2593, + "step": 26800 + }, + { + "epoch": 10.899145994306629, + "grad_norm": 0.0012878070019218606, + "learning_rate": 1.338817355714933e-06, + "loss": 0.0, + "step": 26801 + }, + { + "epoch": 10.899552663684425, + "grad_norm": 0.01575222935544029, + "learning_rate": 1.338310969066694e-06, + "loss": 0.0001, + "step": 26802 + }, + { + "epoch": 10.89995933306222, + "grad_norm": 0.004208492295536286, + "learning_rate": 1.3378046713350534e-06, + "loss": 0.0, + "step": 26803 + }, + { + "epoch": 10.900366002440016, + "grad_norm": 0.007797486069066357, + "learning_rate": 1.3372984625252072e-06, + "loss": 0.0001, + "step": 26804 + }, + { + "epoch": 10.900772671817812, + "grad_norm": 0.006501776179879638, + "learning_rate": 1.336792342642349e-06, + "loss": 0.0001, + "step": 26805 + }, + { + "epoch": 10.901179341195608, + "grad_norm": 0.0015737840755776316, + "learning_rate": 1.3362863116916814e-06, + "loss": 0.0, + "step": 26806 + }, + { + "epoch": 10.901586010573403, + "grad_norm": 0.1917382965364551, + "learning_rate": 1.3357803696783934e-06, + "loss": 0.0021, + "step": 26807 + }, + { + "epoch": 10.9019926799512, + "grad_norm": 0.021206338401067174, + "learning_rate": 1.335274516607682e-06, + "loss": 0.0003, + "step": 26808 + }, + { + "epoch": 10.902399349328995, + "grad_norm": 0.02001756441519019, + "learning_rate": 1.3347687524847375e-06, + "loss": 0.0001, + "step": 26809 + }, + { + "epoch": 10.90280601870679, + "grad_norm": 0.0012475294701244635, + "learning_rate": 1.3342630773147535e-06, + "loss": 0.0, + "step": 26810 + }, + { + "epoch": 10.903212688084587, + "grad_norm": 0.016095892774196256, + "learning_rate": 1.3337574911029195e-06, + "loss": 0.0002, + "step": 26811 + }, + { + "epoch": 10.903619357462382, + "grad_norm": 0.005574265504763895, + "learning_rate": 1.3332519938544252e-06, + "loss": 0.0001, + "step": 26812 + }, + { + "epoch": 10.904026026840178, + "grad_norm": 1.636522727462627, + "learning_rate": 1.3327465855744625e-06, + "loss": 0.0141, + "step": 26813 + }, + { + "epoch": 10.904432696217976, + "grad_norm": 0.001993139308913483, + "learning_rate": 1.3322412662682195e-06, + "loss": 0.0, + "step": 26814 + }, + { + "epoch": 10.904839365595771, + "grad_norm": 0.0017728957516251853, + "learning_rate": 1.3317360359408815e-06, + "loss": 0.0, + "step": 26815 + }, + { + "epoch": 10.905246034973567, + "grad_norm": 0.014877482382403594, + "learning_rate": 1.3312308945976348e-06, + "loss": 0.0001, + "step": 26816 + }, + { + "epoch": 10.905652704351363, + "grad_norm": 0.03642670402586416, + "learning_rate": 1.3307258422436676e-06, + "loss": 0.0004, + "step": 26817 + }, + { + "epoch": 10.906059373729159, + "grad_norm": 0.009683554024646908, + 
"learning_rate": 1.3302208788841598e-06, + "loss": 0.0001, + "step": 26818 + }, + { + "epoch": 10.906466043106954, + "grad_norm": 0.05711223850751624, + "learning_rate": 1.3297160045242996e-06, + "loss": 0.0003, + "step": 26819 + }, + { + "epoch": 10.90687271248475, + "grad_norm": 0.00048218044703662833, + "learning_rate": 1.3292112191692696e-06, + "loss": 0.0, + "step": 26820 + }, + { + "epoch": 10.907279381862546, + "grad_norm": 0.029688032052926194, + "learning_rate": 1.328706522824249e-06, + "loss": 0.0003, + "step": 26821 + }, + { + "epoch": 10.907686051240342, + "grad_norm": 0.8757034541949049, + "learning_rate": 1.3282019154944215e-06, + "loss": 0.0089, + "step": 26822 + }, + { + "epoch": 10.908092720618138, + "grad_norm": 0.005164460522147697, + "learning_rate": 1.3276973971849649e-06, + "loss": 0.0, + "step": 26823 + }, + { + "epoch": 10.908499389995933, + "grad_norm": 2.6152728054652918e-05, + "learning_rate": 1.3271929679010599e-06, + "loss": 0.0, + "step": 26824 + }, + { + "epoch": 10.908906059373729, + "grad_norm": 0.006986289620135016, + "learning_rate": 1.3266886276478808e-06, + "loss": 0.0001, + "step": 26825 + }, + { + "epoch": 10.909312728751525, + "grad_norm": 0.020532867561335502, + "learning_rate": 1.3261843764306103e-06, + "loss": 0.0002, + "step": 26826 + }, + { + "epoch": 10.90971939812932, + "grad_norm": 0.09717850575196876, + "learning_rate": 1.3256802142544234e-06, + "loss": 0.0011, + "step": 26827 + }, + { + "epoch": 10.910126067507116, + "grad_norm": 0.07929852874303259, + "learning_rate": 1.3251761411244946e-06, + "loss": 0.0006, + "step": 26828 + }, + { + "epoch": 10.910532736884912, + "grad_norm": 0.17720639991368423, + "learning_rate": 1.3246721570459987e-06, + "loss": 0.0024, + "step": 26829 + }, + { + "epoch": 10.910939406262708, + "grad_norm": 0.002687973870888791, + "learning_rate": 1.3241682620241091e-06, + "loss": 0.0, + "step": 26830 + }, + { + "epoch": 10.911346075640504, + "grad_norm": 0.0031703757611335455, + "learning_rate": 1.3236644560639967e-06, + "loss": 0.0, + "step": 26831 + }, + { + "epoch": 10.9117527450183, + "grad_norm": 0.012659475690927256, + "learning_rate": 1.323160739170838e-06, + "loss": 0.0001, + "step": 26832 + }, + { + "epoch": 10.912159414396095, + "grad_norm": 0.010788740153925216, + "learning_rate": 1.3226571113498022e-06, + "loss": 0.0001, + "step": 26833 + }, + { + "epoch": 10.912566083773893, + "grad_norm": 0.09986709799317546, + "learning_rate": 1.3221535726060575e-06, + "loss": 0.0004, + "step": 26834 + }, + { + "epoch": 10.912972753151688, + "grad_norm": 0.06535159272645963, + "learning_rate": 1.321650122944773e-06, + "loss": 0.0005, + "step": 26835 + }, + { + "epoch": 10.913379422529484, + "grad_norm": 0.018804788534688997, + "learning_rate": 1.3211467623711183e-06, + "loss": 0.0001, + "step": 26836 + }, + { + "epoch": 10.91378609190728, + "grad_norm": 0.0017193662601450587, + "learning_rate": 1.3206434908902576e-06, + "loss": 0.0, + "step": 26837 + }, + { + "epoch": 10.914192761285076, + "grad_norm": 0.020244818396947712, + "learning_rate": 1.3201403085073617e-06, + "loss": 0.0002, + "step": 26838 + }, + { + "epoch": 10.914599430662872, + "grad_norm": 0.016342005720360934, + "learning_rate": 1.3196372152275938e-06, + "loss": 0.0002, + "step": 26839 + }, + { + "epoch": 10.915006100040667, + "grad_norm": 0.026640810971915986, + "learning_rate": 1.319134211056119e-06, + "loss": 0.0003, + "step": 26840 + }, + { + "epoch": 10.915412769418463, + "grad_norm": 0.001772099421608026, + "learning_rate": 1.3186312959981007e-06, 
+ "loss": 0.0, + "step": 26841 + }, + { + "epoch": 10.915819438796259, + "grad_norm": 0.16989180100319515, + "learning_rate": 1.3181284700587016e-06, + "loss": 0.0019, + "step": 26842 + }, + { + "epoch": 10.916226108174055, + "grad_norm": 0.02828168062444821, + "learning_rate": 1.3176257332430798e-06, + "loss": 0.0001, + "step": 26843 + }, + { + "epoch": 10.91663277755185, + "grad_norm": 0.3071720790577419, + "learning_rate": 1.3171230855564033e-06, + "loss": 0.0015, + "step": 26844 + }, + { + "epoch": 10.917039446929646, + "grad_norm": 0.01236606281260822, + "learning_rate": 1.3166205270038267e-06, + "loss": 0.0001, + "step": 26845 + }, + { + "epoch": 10.917446116307442, + "grad_norm": 0.002450266101780429, + "learning_rate": 1.316118057590512e-06, + "loss": 0.0, + "step": 26846 + }, + { + "epoch": 10.917852785685238, + "grad_norm": 0.02391764505851778, + "learning_rate": 1.3156156773216156e-06, + "loss": 0.0004, + "step": 26847 + }, + { + "epoch": 10.918259455063033, + "grad_norm": 0.059500219640776786, + "learning_rate": 1.3151133862022947e-06, + "loss": 0.001, + "step": 26848 + }, + { + "epoch": 10.91866612444083, + "grad_norm": 0.7061101333887599, + "learning_rate": 1.3146111842377062e-06, + "loss": 0.0054, + "step": 26849 + }, + { + "epoch": 10.919072793818625, + "grad_norm": 0.5360050825262417, + "learning_rate": 1.3141090714330028e-06, + "loss": 0.0055, + "step": 26850 + }, + { + "epoch": 10.91947946319642, + "grad_norm": 0.000533318927778783, + "learning_rate": 1.3136070477933438e-06, + "loss": 0.0, + "step": 26851 + }, + { + "epoch": 10.919886132574216, + "grad_norm": 0.00041270006831142596, + "learning_rate": 1.3131051133238804e-06, + "loss": 0.0, + "step": 26852 + }, + { + "epoch": 10.920292801952012, + "grad_norm": 0.0037351761890488354, + "learning_rate": 1.312603268029765e-06, + "loss": 0.0, + "step": 26853 + }, + { + "epoch": 10.920699471329808, + "grad_norm": 0.007680880772146072, + "learning_rate": 1.3121015119161485e-06, + "loss": 0.0001, + "step": 26854 + }, + { + "epoch": 10.921106140707606, + "grad_norm": 0.5521408061003467, + "learning_rate": 1.311599844988184e-06, + "loss": 0.0048, + "step": 26855 + }, + { + "epoch": 10.921512810085401, + "grad_norm": 0.050177488197355256, + "learning_rate": 1.3110982672510165e-06, + "loss": 0.0004, + "step": 26856 + }, + { + "epoch": 10.921919479463197, + "grad_norm": 0.17175205392697285, + "learning_rate": 1.3105967787098006e-06, + "loss": 0.0013, + "step": 26857 + }, + { + "epoch": 10.922326148840993, + "grad_norm": 0.0002814355488949423, + "learning_rate": 1.3100953793696825e-06, + "loss": 0.0, + "step": 26858 + }, + { + "epoch": 10.922732818218789, + "grad_norm": 0.00855188832577033, + "learning_rate": 1.3095940692358077e-06, + "loss": 0.0001, + "step": 26859 + }, + { + "epoch": 10.923139487596584, + "grad_norm": 1.7776581448819377, + "learning_rate": 1.3090928483133247e-06, + "loss": 0.0185, + "step": 26860 + }, + { + "epoch": 10.92354615697438, + "grad_norm": 0.0007429184138332741, + "learning_rate": 1.308591716607377e-06, + "loss": 0.0, + "step": 26861 + }, + { + "epoch": 10.923952826352176, + "grad_norm": 3.9640973750998523, + "learning_rate": 1.3080906741231104e-06, + "loss": 0.0671, + "step": 26862 + }, + { + "epoch": 10.924359495729972, + "grad_norm": 0.03283020736084102, + "learning_rate": 1.307589720865665e-06, + "loss": 0.0003, + "step": 26863 + }, + { + "epoch": 10.924766165107767, + "grad_norm": 0.00040810037083059236, + "learning_rate": 1.3070888568401885e-06, + "loss": 0.0, + "step": 26864 + }, + { + "epoch": 
10.925172834485563, + "grad_norm": 0.004140106912633419, + "learning_rate": 1.3065880820518194e-06, + "loss": 0.0, + "step": 26865 + }, + { + "epoch": 10.925579503863359, + "grad_norm": 0.09425407697145799, + "learning_rate": 1.3060873965056987e-06, + "loss": 0.0004, + "step": 26866 + }, + { + "epoch": 10.925986173241155, + "grad_norm": 0.0015300122539315042, + "learning_rate": 1.3055868002069672e-06, + "loss": 0.0, + "step": 26867 + }, + { + "epoch": 10.92639284261895, + "grad_norm": 0.003551163411161657, + "learning_rate": 1.3050862931607621e-06, + "loss": 0.0, + "step": 26868 + }, + { + "epoch": 10.926799511996746, + "grad_norm": 0.0010866298820050625, + "learning_rate": 1.3045858753722207e-06, + "loss": 0.0, + "step": 26869 + }, + { + "epoch": 10.927206181374542, + "grad_norm": 0.66064079808557, + "learning_rate": 1.304085546846484e-06, + "loss": 0.0056, + "step": 26870 + }, + { + "epoch": 10.927612850752338, + "grad_norm": 0.02319865350872378, + "learning_rate": 1.303585307588685e-06, + "loss": 0.0002, + "step": 26871 + }, + { + "epoch": 10.928019520130134, + "grad_norm": 0.0010886123000643313, + "learning_rate": 1.3030851576039604e-06, + "loss": 0.0, + "step": 26872 + }, + { + "epoch": 10.92842618950793, + "grad_norm": 0.10857690157245736, + "learning_rate": 1.302585096897443e-06, + "loss": 0.0009, + "step": 26873 + }, + { + "epoch": 10.928832858885725, + "grad_norm": 0.003702118688808465, + "learning_rate": 1.3020851254742672e-06, + "loss": 0.0, + "step": 26874 + }, + { + "epoch": 10.929239528263523, + "grad_norm": 0.07393436071368217, + "learning_rate": 1.3015852433395648e-06, + "loss": 0.0007, + "step": 26875 + }, + { + "epoch": 10.929646197641318, + "grad_norm": 0.017534115458053178, + "learning_rate": 1.301085450498466e-06, + "loss": 0.0001, + "step": 26876 + }, + { + "epoch": 10.930052867019114, + "grad_norm": 0.0014323156716776221, + "learning_rate": 1.3005857469561056e-06, + "loss": 0.0, + "step": 26877 + }, + { + "epoch": 10.93045953639691, + "grad_norm": 0.0014249690590082654, + "learning_rate": 1.3000861327176107e-06, + "loss": 0.0, + "step": 26878 + }, + { + "epoch": 10.930866205774706, + "grad_norm": 0.008324627712775685, + "learning_rate": 1.2995866077881091e-06, + "loss": 0.0001, + "step": 26879 + }, + { + "epoch": 10.931272875152501, + "grad_norm": 0.001985964227583344, + "learning_rate": 1.2990871721727317e-06, + "loss": 0.0, + "step": 26880 + }, + { + "epoch": 10.931679544530297, + "grad_norm": 0.2917345428184572, + "learning_rate": 1.2985878258766015e-06, + "loss": 0.0023, + "step": 26881 + }, + { + "epoch": 10.932086213908093, + "grad_norm": 0.16993495251373233, + "learning_rate": 1.298088568904846e-06, + "loss": 0.0011, + "step": 26882 + }, + { + "epoch": 10.932492883285889, + "grad_norm": 0.06047948799365091, + "learning_rate": 1.2975894012625923e-06, + "loss": 0.0005, + "step": 26883 + }, + { + "epoch": 10.932899552663685, + "grad_norm": 0.019180708318148592, + "learning_rate": 1.2970903229549637e-06, + "loss": 0.0002, + "step": 26884 + }, + { + "epoch": 10.93330622204148, + "grad_norm": 0.001640639712670091, + "learning_rate": 1.2965913339870829e-06, + "loss": 0.0, + "step": 26885 + }, + { + "epoch": 10.933712891419276, + "grad_norm": 0.0024076456332580553, + "learning_rate": 1.2960924343640724e-06, + "loss": 0.0, + "step": 26886 + }, + { + "epoch": 10.934119560797072, + "grad_norm": 0.07099762643538897, + "learning_rate": 1.2955936240910538e-06, + "loss": 0.0008, + "step": 26887 + }, + { + "epoch": 10.934526230174868, + "grad_norm": 0.00041494370922629236, + 
"learning_rate": 1.2950949031731464e-06, + "loss": 0.0, + "step": 26888 + }, + { + "epoch": 10.934932899552663, + "grad_norm": 0.020244021106101146, + "learning_rate": 1.2945962716154702e-06, + "loss": 0.0002, + "step": 26889 + }, + { + "epoch": 10.935339568930459, + "grad_norm": 0.0011749381997789043, + "learning_rate": 1.294097729423146e-06, + "loss": 0.0, + "step": 26890 + }, + { + "epoch": 10.935746238308255, + "grad_norm": 0.00290779350079612, + "learning_rate": 1.2935992766012906e-06, + "loss": 0.0, + "step": 26891 + }, + { + "epoch": 10.93615290768605, + "grad_norm": 0.0018672254868010577, + "learning_rate": 1.29310091315502e-06, + "loss": 0.0, + "step": 26892 + }, + { + "epoch": 10.936559577063846, + "grad_norm": 0.1315534616617326, + "learning_rate": 1.2926026390894508e-06, + "loss": 0.0012, + "step": 26893 + }, + { + "epoch": 10.936966246441642, + "grad_norm": 0.008942534183394944, + "learning_rate": 1.2921044544096973e-06, + "loss": 0.0001, + "step": 26894 + }, + { + "epoch": 10.937372915819438, + "grad_norm": 0.007145570044775999, + "learning_rate": 1.2916063591208728e-06, + "loss": 0.0001, + "step": 26895 + }, + { + "epoch": 10.937779585197235, + "grad_norm": 0.00044358306645164746, + "learning_rate": 1.2911083532280943e-06, + "loss": 0.0, + "step": 26896 + }, + { + "epoch": 10.938186254575031, + "grad_norm": 0.0005540723516970678, + "learning_rate": 1.2906104367364703e-06, + "loss": 0.0, + "step": 26897 + }, + { + "epoch": 10.938592923952827, + "grad_norm": 0.16892171196657926, + "learning_rate": 1.2901126096511151e-06, + "loss": 0.0019, + "step": 26898 + }, + { + "epoch": 10.938999593330623, + "grad_norm": 0.02218462450617695, + "learning_rate": 1.289614871977136e-06, + "loss": 0.0002, + "step": 26899 + }, + { + "epoch": 10.939406262708419, + "grad_norm": 0.06817393273699465, + "learning_rate": 1.2891172237196447e-06, + "loss": 0.0003, + "step": 26900 + }, + { + "epoch": 10.939812932086214, + "grad_norm": 0.0022301886953072347, + "learning_rate": 1.2886196648837501e-06, + "loss": 0.0, + "step": 26901 + }, + { + "epoch": 10.94021960146401, + "grad_norm": 0.025724861103189903, + "learning_rate": 1.2881221954745559e-06, + "loss": 0.0002, + "step": 26902 + }, + { + "epoch": 10.940626270841806, + "grad_norm": 0.012309748661221021, + "learning_rate": 1.287624815497175e-06, + "loss": 0.0001, + "step": 26903 + }, + { + "epoch": 10.941032940219602, + "grad_norm": 0.021596541950348297, + "learning_rate": 1.2871275249567096e-06, + "loss": 0.0001, + "step": 26904 + }, + { + "epoch": 10.941439609597397, + "grad_norm": 0.02156366857444904, + "learning_rate": 1.2866303238582645e-06, + "loss": 0.0002, + "step": 26905 + }, + { + "epoch": 10.941846278975193, + "grad_norm": 0.00016809951782204794, + "learning_rate": 1.2861332122069448e-06, + "loss": 0.0, + "step": 26906 + }, + { + "epoch": 10.942252948352989, + "grad_norm": 0.032637440345092694, + "learning_rate": 1.285636190007854e-06, + "loss": 0.0003, + "step": 26907 + }, + { + "epoch": 10.942659617730785, + "grad_norm": 0.2562087971812765, + "learning_rate": 1.2851392572660914e-06, + "loss": 0.002, + "step": 26908 + }, + { + "epoch": 10.94306628710858, + "grad_norm": 0.28341395088920407, + "learning_rate": 1.284642413986762e-06, + "loss": 0.0019, + "step": 26909 + }, + { + "epoch": 10.943472956486376, + "grad_norm": 0.005564525088331662, + "learning_rate": 1.2841456601749648e-06, + "loss": 0.0, + "step": 26910 + }, + { + "epoch": 10.943879625864172, + "grad_norm": 0.02757327119306867, + "learning_rate": 1.283648995835799e-06, + "loss": 
0.0002, + "step": 26911 + }, + { + "epoch": 10.944286295241968, + "grad_norm": 8.720110842759468e-05, + "learning_rate": 1.2831524209743628e-06, + "loss": 0.0, + "step": 26912 + }, + { + "epoch": 10.944692964619763, + "grad_norm": 0.0007292647339632983, + "learning_rate": 1.2826559355957546e-06, + "loss": 0.0, + "step": 26913 + }, + { + "epoch": 10.94509963399756, + "grad_norm": 0.007985194283480946, + "learning_rate": 1.28215953970507e-06, + "loss": 0.0001, + "step": 26914 + }, + { + "epoch": 10.945506303375355, + "grad_norm": 0.011363728820667463, + "learning_rate": 1.2816632333074042e-06, + "loss": 0.0001, + "step": 26915 + }, + { + "epoch": 10.945912972753153, + "grad_norm": 0.07082124924975533, + "learning_rate": 1.2811670164078539e-06, + "loss": 0.0005, + "step": 26916 + }, + { + "epoch": 10.946319642130948, + "grad_norm": 0.03130155302187606, + "learning_rate": 1.2806708890115138e-06, + "loss": 0.0001, + "step": 26917 + }, + { + "epoch": 10.946726311508744, + "grad_norm": 0.03030120843765767, + "learning_rate": 1.2801748511234747e-06, + "loss": 0.0005, + "step": 26918 + }, + { + "epoch": 10.94713298088654, + "grad_norm": 0.006197404572086605, + "learning_rate": 1.2796789027488287e-06, + "loss": 0.0001, + "step": 26919 + }, + { + "epoch": 10.947539650264336, + "grad_norm": 0.0005982203201843643, + "learning_rate": 1.2791830438926679e-06, + "loss": 0.0, + "step": 26920 + }, + { + "epoch": 10.947946319642131, + "grad_norm": 0.03226541747265002, + "learning_rate": 1.278687274560081e-06, + "loss": 0.0002, + "step": 26921 + }, + { + "epoch": 10.948352989019927, + "grad_norm": 0.16757102891375852, + "learning_rate": 1.2781915947561595e-06, + "loss": 0.0011, + "step": 26922 + }, + { + "epoch": 10.948759658397723, + "grad_norm": 0.04715118108202779, + "learning_rate": 1.2776960044859921e-06, + "loss": 0.0005, + "step": 26923 + }, + { + "epoch": 10.949166327775519, + "grad_norm": 0.012990741467973105, + "learning_rate": 1.2772005037546643e-06, + "loss": 0.0001, + "step": 26924 + }, + { + "epoch": 10.949572997153314, + "grad_norm": 1.296991362121587, + "learning_rate": 1.276705092567263e-06, + "loss": 0.0143, + "step": 26925 + }, + { + "epoch": 10.94997966653111, + "grad_norm": 0.16927435174629166, + "learning_rate": 1.2762097709288746e-06, + "loss": 0.0023, + "step": 26926 + }, + { + "epoch": 10.950386335908906, + "grad_norm": 4.816924754268842, + "learning_rate": 1.2757145388445835e-06, + "loss": 0.0874, + "step": 26927 + }, + { + "epoch": 10.950793005286702, + "grad_norm": 0.017853111162571298, + "learning_rate": 1.2752193963194726e-06, + "loss": 0.0002, + "step": 26928 + }, + { + "epoch": 10.951199674664498, + "grad_norm": 0.0006692326463088333, + "learning_rate": 1.2747243433586265e-06, + "loss": 0.0, + "step": 26929 + }, + { + "epoch": 10.951606344042293, + "grad_norm": 0.006773012245675625, + "learning_rate": 1.2742293799671267e-06, + "loss": 0.0, + "step": 26930 + }, + { + "epoch": 10.952013013420089, + "grad_norm": 0.14081542385115234, + "learning_rate": 1.2737345061500538e-06, + "loss": 0.0014, + "step": 26931 + }, + { + "epoch": 10.952419682797885, + "grad_norm": 0.006136051242370121, + "learning_rate": 1.2732397219124882e-06, + "loss": 0.0001, + "step": 26932 + }, + { + "epoch": 10.95282635217568, + "grad_norm": 0.04457761175816975, + "learning_rate": 1.2727450272595087e-06, + "loss": 0.0002, + "step": 26933 + }, + { + "epoch": 10.953233021553476, + "grad_norm": 0.002041046220473932, + "learning_rate": 1.272250422196194e-06, + "loss": 0.0, + "step": 26934 + }, + { + "epoch": 
10.953639690931272, + "grad_norm": 0.002134301224665247, + "learning_rate": 1.2717559067276208e-06, + "loss": 0.0, + "step": 26935 + }, + { + "epoch": 10.954046360309068, + "grad_norm": 0.07736481544331514, + "learning_rate": 1.2712614808588665e-06, + "loss": 0.0009, + "step": 26936 + }, + { + "epoch": 10.954453029686865, + "grad_norm": 5.674969444574558e-05, + "learning_rate": 1.2707671445950066e-06, + "loss": 0.0, + "step": 26937 + }, + { + "epoch": 10.954859699064661, + "grad_norm": 0.11724701107791767, + "learning_rate": 1.270272897941114e-06, + "loss": 0.0005, + "step": 26938 + }, + { + "epoch": 10.955266368442457, + "grad_norm": 0.02788553528582481, + "learning_rate": 1.2697787409022655e-06, + "loss": 0.0002, + "step": 26939 + }, + { + "epoch": 10.955673037820253, + "grad_norm": 0.00016883567553991748, + "learning_rate": 1.2692846734835284e-06, + "loss": 0.0, + "step": 26940 + }, + { + "epoch": 10.956079707198048, + "grad_norm": 0.0035293616381171494, + "learning_rate": 1.268790695689981e-06, + "loss": 0.0, + "step": 26941 + }, + { + "epoch": 10.956486376575844, + "grad_norm": 0.16099696032821306, + "learning_rate": 1.268296807526691e-06, + "loss": 0.0009, + "step": 26942 + }, + { + "epoch": 10.95689304595364, + "grad_norm": 0.11899600302108188, + "learning_rate": 1.2678030089987292e-06, + "loss": 0.0009, + "step": 26943 + }, + { + "epoch": 10.957299715331436, + "grad_norm": 0.03579643080325792, + "learning_rate": 1.2673093001111648e-06, + "loss": 0.0002, + "step": 26944 + }, + { + "epoch": 10.957706384709232, + "grad_norm": 2.013389016004992, + "learning_rate": 1.2668156808690645e-06, + "loss": 0.0172, + "step": 26945 + }, + { + "epoch": 10.958113054087027, + "grad_norm": 0.7910384140661388, + "learning_rate": 1.2663221512774948e-06, + "loss": 0.0061, + "step": 26946 + }, + { + "epoch": 10.958519723464823, + "grad_norm": 1.6097772572824232, + "learning_rate": 1.265828711341527e-06, + "loss": 0.0132, + "step": 26947 + }, + { + "epoch": 10.958926392842619, + "grad_norm": 0.12137811589330522, + "learning_rate": 1.2653353610662223e-06, + "loss": 0.0008, + "step": 26948 + }, + { + "epoch": 10.959333062220415, + "grad_norm": 0.00021731739966302072, + "learning_rate": 1.2648421004566458e-06, + "loss": 0.0, + "step": 26949 + }, + { + "epoch": 10.95973973159821, + "grad_norm": 0.002313128556853202, + "learning_rate": 1.2643489295178612e-06, + "loss": 0.0, + "step": 26950 + }, + { + "epoch": 10.960146400976006, + "grad_norm": 0.003988482505806789, + "learning_rate": 1.2638558482549324e-06, + "loss": 0.0, + "step": 26951 + }, + { + "epoch": 10.960553070353802, + "grad_norm": 0.0018864415907741314, + "learning_rate": 1.2633628566729207e-06, + "loss": 0.0, + "step": 26952 + }, + { + "epoch": 10.960959739731598, + "grad_norm": 0.0001917140816991473, + "learning_rate": 1.262869954776883e-06, + "loss": 0.0, + "step": 26953 + }, + { + "epoch": 10.961366409109393, + "grad_norm": 0.43046167376576283, + "learning_rate": 1.2623771425718845e-06, + "loss": 0.0037, + "step": 26954 + }, + { + "epoch": 10.96177307848719, + "grad_norm": 0.00012262182012051225, + "learning_rate": 1.2618844200629832e-06, + "loss": 0.0, + "step": 26955 + }, + { + "epoch": 10.962179747864985, + "grad_norm": 0.037983329298523806, + "learning_rate": 1.261391787255235e-06, + "loss": 0.0004, + "step": 26956 + }, + { + "epoch": 10.962586417242782, + "grad_norm": 0.0037787980497028823, + "learning_rate": 1.2608992441536993e-06, + "loss": 0.0001, + "step": 26957 + }, + { + "epoch": 10.962993086620578, + "grad_norm": 
0.012213880904356956, + "learning_rate": 1.2604067907634298e-06, + "loss": 0.0001, + "step": 26958 + }, + { + "epoch": 10.963399755998374, + "grad_norm": 0.001607796974279029, + "learning_rate": 1.2599144270894826e-06, + "loss": 0.0, + "step": 26959 + }, + { + "epoch": 10.96380642537617, + "grad_norm": 0.023582717069112583, + "learning_rate": 1.2594221531369133e-06, + "loss": 0.0002, + "step": 26960 + }, + { + "epoch": 10.964213094753966, + "grad_norm": 0.0022508072411736, + "learning_rate": 1.258929968910776e-06, + "loss": 0.0, + "step": 26961 + }, + { + "epoch": 10.964619764131761, + "grad_norm": 6.520698453165345, + "learning_rate": 1.2584378744161207e-06, + "loss": 0.1033, + "step": 26962 + }, + { + "epoch": 10.965026433509557, + "grad_norm": 0.00468994797850481, + "learning_rate": 1.2579458696580005e-06, + "loss": 0.0, + "step": 26963 + }, + { + "epoch": 10.965433102887353, + "grad_norm": 0.00591726331558256, + "learning_rate": 1.2574539546414655e-06, + "loss": 0.0001, + "step": 26964 + }, + { + "epoch": 10.965839772265149, + "grad_norm": 0.13073002121164123, + "learning_rate": 1.2569621293715662e-06, + "loss": 0.0009, + "step": 26965 + }, + { + "epoch": 10.966246441642944, + "grad_norm": 1.0308111197931844, + "learning_rate": 1.2564703938533484e-06, + "loss": 0.0025, + "step": 26966 + }, + { + "epoch": 10.96665311102074, + "grad_norm": 1.657431006746379, + "learning_rate": 1.255978748091865e-06, + "loss": 0.0166, + "step": 26967 + }, + { + "epoch": 10.967059780398536, + "grad_norm": 0.19644817776396634, + "learning_rate": 1.2554871920921608e-06, + "loss": 0.0017, + "step": 26968 + }, + { + "epoch": 10.967466449776332, + "grad_norm": 0.02433886915047153, + "learning_rate": 1.2549957258592805e-06, + "loss": 0.0002, + "step": 26969 + }, + { + "epoch": 10.967873119154127, + "grad_norm": 0.0022715685139044297, + "learning_rate": 1.254504349398271e-06, + "loss": 0.0, + "step": 26970 + }, + { + "epoch": 10.968279788531923, + "grad_norm": 0.0013299106173561834, + "learning_rate": 1.2540130627141766e-06, + "loss": 0.0, + "step": 26971 + }, + { + "epoch": 10.968686457909719, + "grad_norm": 9.621970574452434e-05, + "learning_rate": 1.2535218658120363e-06, + "loss": 0.0, + "step": 26972 + }, + { + "epoch": 10.969093127287515, + "grad_norm": 0.06680281989022725, + "learning_rate": 1.2530307586968992e-06, + "loss": 0.0007, + "step": 26973 + }, + { + "epoch": 10.96949979666531, + "grad_norm": 0.2692147151526407, + "learning_rate": 1.2525397413738038e-06, + "loss": 0.0011, + "step": 26974 + }, + { + "epoch": 10.969906466043106, + "grad_norm": 0.0003718262080296598, + "learning_rate": 1.2520488138477893e-06, + "loss": 0.0, + "step": 26975 + }, + { + "epoch": 10.970313135420902, + "grad_norm": 0.026629593492296343, + "learning_rate": 1.2515579761238972e-06, + "loss": 0.0002, + "step": 26976 + }, + { + "epoch": 10.970719804798698, + "grad_norm": 0.011061708536496313, + "learning_rate": 1.2510672282071657e-06, + "loss": 0.0001, + "step": 26977 + }, + { + "epoch": 10.971126474176495, + "grad_norm": 0.09837553032701178, + "learning_rate": 1.2505765701026319e-06, + "loss": 0.0013, + "step": 26978 + }, + { + "epoch": 10.971533143554291, + "grad_norm": 0.02088391851588003, + "learning_rate": 1.2500860018153315e-06, + "loss": 0.0003, + "step": 26979 + }, + { + "epoch": 10.971939812932087, + "grad_norm": 0.013681668534939182, + "learning_rate": 1.2495955233503043e-06, + "loss": 0.0001, + "step": 26980 + }, + { + "epoch": 10.972346482309883, + "grad_norm": 0.004388328974389837, + "learning_rate": 
1.2491051347125826e-06, + "loss": 0.0, + "step": 26981 + }, + { + "epoch": 10.972753151687678, + "grad_norm": 0.017297045052628687, + "learning_rate": 1.2486148359072004e-06, + "loss": 0.0002, + "step": 26982 + }, + { + "epoch": 10.973159821065474, + "grad_norm": 0.016436656474147993, + "learning_rate": 1.2481246269391922e-06, + "loss": 0.0002, + "step": 26983 + }, + { + "epoch": 10.97356649044327, + "grad_norm": 0.01711574138215899, + "learning_rate": 1.2476345078135888e-06, + "loss": 0.0002, + "step": 26984 + }, + { + "epoch": 10.973973159821066, + "grad_norm": 0.7325418143614072, + "learning_rate": 1.2471444785354204e-06, + "loss": 0.0035, + "step": 26985 + }, + { + "epoch": 10.974379829198861, + "grad_norm": 0.19968919984832487, + "learning_rate": 1.2466545391097206e-06, + "loss": 0.0024, + "step": 26986 + }, + { + "epoch": 10.974786498576657, + "grad_norm": 3.472596513161424, + "learning_rate": 1.2461646895415191e-06, + "loss": 0.1144, + "step": 26987 + }, + { + "epoch": 10.975193167954453, + "grad_norm": 0.010826335691200619, + "learning_rate": 1.2456749298358417e-06, + "loss": 0.0001, + "step": 26988 + }, + { + "epoch": 10.975599837332249, + "grad_norm": 0.011086485416637638, + "learning_rate": 1.2451852599977165e-06, + "loss": 0.0001, + "step": 26989 + }, + { + "epoch": 10.976006506710045, + "grad_norm": 0.018279155070367613, + "learning_rate": 1.244695680032172e-06, + "loss": 0.0002, + "step": 26990 + }, + { + "epoch": 10.97641317608784, + "grad_norm": 0.0023367682407698723, + "learning_rate": 1.2442061899442314e-06, + "loss": 0.0, + "step": 26991 + }, + { + "epoch": 10.976819845465636, + "grad_norm": 0.004503731692799967, + "learning_rate": 1.2437167897389203e-06, + "loss": 0.0, + "step": 26992 + }, + { + "epoch": 10.977226514843432, + "grad_norm": 0.0005184531727875004, + "learning_rate": 1.2432274794212651e-06, + "loss": 0.0, + "step": 26993 + }, + { + "epoch": 10.977633184221228, + "grad_norm": 0.020950042634068296, + "learning_rate": 1.2427382589962866e-06, + "loss": 0.0002, + "step": 26994 + }, + { + "epoch": 10.978039853599023, + "grad_norm": 0.0017294798228192547, + "learning_rate": 1.2422491284690075e-06, + "loss": 0.0, + "step": 26995 + }, + { + "epoch": 10.978446522976819, + "grad_norm": 0.0002895505464788105, + "learning_rate": 1.241760087844448e-06, + "loss": 0.0, + "step": 26996 + }, + { + "epoch": 10.978853192354615, + "grad_norm": 0.0020185293784336362, + "learning_rate": 1.2412711371276298e-06, + "loss": 0.0, + "step": 26997 + }, + { + "epoch": 10.979259861732412, + "grad_norm": 0.0009072996344226508, + "learning_rate": 1.2407822763235699e-06, + "loss": 0.0, + "step": 26998 + }, + { + "epoch": 10.979666531110208, + "grad_norm": 0.058707823234779924, + "learning_rate": 1.24029350543729e-06, + "loss": 0.0003, + "step": 26999 + }, + { + "epoch": 10.980073200488004, + "grad_norm": 0.020204919531517956, + "learning_rate": 1.239804824473806e-06, + "loss": 0.0001, + "step": 27000 + }, + { + "epoch": 10.9804798698658, + "grad_norm": 0.004621091506203088, + "learning_rate": 1.2393162334381338e-06, + "loss": 0.0, + "step": 27001 + }, + { + "epoch": 10.980886539243595, + "grad_norm": 0.05643752523327269, + "learning_rate": 1.2388277323352905e-06, + "loss": 0.0006, + "step": 27002 + }, + { + "epoch": 10.981293208621391, + "grad_norm": 0.04657851185951609, + "learning_rate": 1.23833932117029e-06, + "loss": 0.0004, + "step": 27003 + }, + { + "epoch": 10.981699877999187, + "grad_norm": 0.4226569689900564, + "learning_rate": 1.2378509999481448e-06, + "loss": 0.0041, + "step": 
27004 + }, + { + "epoch": 10.982106547376983, + "grad_norm": 0.37912386290504974, + "learning_rate": 1.2373627686738677e-06, + "loss": 0.0017, + "step": 27005 + }, + { + "epoch": 10.982513216754779, + "grad_norm": 0.0007210782368339555, + "learning_rate": 1.2368746273524745e-06, + "loss": 0.0, + "step": 27006 + }, + { + "epoch": 10.982919886132574, + "grad_norm": 0.2823390122450446, + "learning_rate": 1.2363865759889737e-06, + "loss": 0.0021, + "step": 27007 + }, + { + "epoch": 10.98332655551037, + "grad_norm": 0.00539534339835451, + "learning_rate": 1.2358986145883744e-06, + "loss": 0.0, + "step": 27008 + }, + { + "epoch": 10.983733224888166, + "grad_norm": 0.04464866684448618, + "learning_rate": 1.2354107431556872e-06, + "loss": 0.0005, + "step": 27009 + }, + { + "epoch": 10.984139894265962, + "grad_norm": 0.008876653339295422, + "learning_rate": 1.234922961695919e-06, + "loss": 0.0001, + "step": 27010 + }, + { + "epoch": 10.984546563643757, + "grad_norm": 0.016047504581782987, + "learning_rate": 1.2344352702140772e-06, + "loss": 0.0002, + "step": 27011 + }, + { + "epoch": 10.984953233021553, + "grad_norm": 2.7833652956044608, + "learning_rate": 1.2339476687151707e-06, + "loss": 0.0233, + "step": 27012 + }, + { + "epoch": 10.985359902399349, + "grad_norm": 0.002442096093946292, + "learning_rate": 1.2334601572042026e-06, + "loss": 0.0, + "step": 27013 + }, + { + "epoch": 10.985766571777145, + "grad_norm": 0.01785203827379916, + "learning_rate": 1.2329727356861786e-06, + "loss": 0.0001, + "step": 27014 + }, + { + "epoch": 10.98617324115494, + "grad_norm": 0.2970450806804395, + "learning_rate": 1.2324854041661017e-06, + "loss": 0.0009, + "step": 27015 + }, + { + "epoch": 10.986579910532736, + "grad_norm": 0.017961980512876372, + "learning_rate": 1.2319981626489752e-06, + "loss": 0.0001, + "step": 27016 + }, + { + "epoch": 10.986986579910532, + "grad_norm": 0.4530406767273779, + "learning_rate": 1.2315110111397998e-06, + "loss": 0.0014, + "step": 27017 + }, + { + "epoch": 10.987393249288328, + "grad_norm": 0.0027226971354581083, + "learning_rate": 1.2310239496435749e-06, + "loss": 0.0, + "step": 27018 + }, + { + "epoch": 10.987799918666125, + "grad_norm": 0.11043607169532865, + "learning_rate": 1.2305369781653042e-06, + "loss": 0.001, + "step": 27019 + }, + { + "epoch": 10.988206588043921, + "grad_norm": 0.004327303397749584, + "learning_rate": 1.2300500967099859e-06, + "loss": 0.0, + "step": 27020 + }, + { + "epoch": 10.988613257421717, + "grad_norm": 0.01583847236480515, + "learning_rate": 1.2295633052826172e-06, + "loss": 0.0002, + "step": 27021 + }, + { + "epoch": 10.989019926799513, + "grad_norm": 0.0007867978025390284, + "learning_rate": 1.229076603888195e-06, + "loss": 0.0, + "step": 27022 + }, + { + "epoch": 10.989426596177308, + "grad_norm": 0.3298687036981454, + "learning_rate": 1.2285899925317157e-06, + "loss": 0.0019, + "step": 27023 + }, + { + "epoch": 10.989833265555104, + "grad_norm": 0.01467784850224657, + "learning_rate": 1.2281034712181717e-06, + "loss": 0.0001, + "step": 27024 + }, + { + "epoch": 10.9902399349329, + "grad_norm": 0.0006930320493077383, + "learning_rate": 1.2276170399525633e-06, + "loss": 0.0, + "step": 27025 + }, + { + "epoch": 10.990646604310696, + "grad_norm": 0.0002725128258226546, + "learning_rate": 1.2271306987398812e-06, + "loss": 0.0, + "step": 27026 + }, + { + "epoch": 10.991053273688491, + "grad_norm": 0.0796247124882311, + "learning_rate": 1.226644447585118e-06, + "loss": 0.0005, + "step": 27027 + }, + { + "epoch": 10.991459943066287, + 
"grad_norm": 0.007784963039506322, + "learning_rate": 1.2261582864932642e-06, + "loss": 0.0001, + "step": 27028 + }, + { + "epoch": 10.991866612444083, + "grad_norm": 0.05127709317798856, + "learning_rate": 1.2256722154693125e-06, + "loss": 0.0004, + "step": 27029 + }, + { + "epoch": 10.992273281821879, + "grad_norm": 0.024050193274463675, + "learning_rate": 1.2251862345182507e-06, + "loss": 0.0002, + "step": 27030 + }, + { + "epoch": 10.992679951199674, + "grad_norm": 0.4488706570175138, + "learning_rate": 1.2247003436450666e-06, + "loss": 0.0034, + "step": 27031 + }, + { + "epoch": 10.99308662057747, + "grad_norm": 0.0029627967919823145, + "learning_rate": 1.2242145428547535e-06, + "loss": 0.0, + "step": 27032 + }, + { + "epoch": 10.993493289955266, + "grad_norm": 0.06363806212961662, + "learning_rate": 1.2237288321522934e-06, + "loss": 0.0008, + "step": 27033 + }, + { + "epoch": 10.993899959333062, + "grad_norm": 0.0018463610402978674, + "learning_rate": 1.223243211542674e-06, + "loss": 0.0, + "step": 27034 + }, + { + "epoch": 10.994306628710858, + "grad_norm": 0.00023167327299295144, + "learning_rate": 1.2227576810308806e-06, + "loss": 0.0, + "step": 27035 + }, + { + "epoch": 10.994713298088653, + "grad_norm": 0.00048466433662584855, + "learning_rate": 1.2222722406218946e-06, + "loss": 0.0, + "step": 27036 + }, + { + "epoch": 10.995119967466449, + "grad_norm": 0.19114007670516703, + "learning_rate": 1.2217868903207032e-06, + "loss": 0.0013, + "step": 27037 + }, + { + "epoch": 10.995526636844245, + "grad_norm": 0.07504741061768927, + "learning_rate": 1.2213016301322888e-06, + "loss": 0.0009, + "step": 27038 + }, + { + "epoch": 10.995933306222042, + "grad_norm": 0.002522757574239653, + "learning_rate": 1.2208164600616301e-06, + "loss": 0.0, + "step": 27039 + }, + { + "epoch": 10.996339975599838, + "grad_norm": 0.03387935175571841, + "learning_rate": 1.2203313801137096e-06, + "loss": 0.0004, + "step": 27040 + }, + { + "epoch": 10.996746644977634, + "grad_norm": 0.09967824982516202, + "learning_rate": 1.2198463902935055e-06, + "loss": 0.0006, + "step": 27041 + }, + { + "epoch": 10.99715331435543, + "grad_norm": 0.028703815396110482, + "learning_rate": 1.219361490605996e-06, + "loss": 0.0003, + "step": 27042 + }, + { + "epoch": 10.997559983733225, + "grad_norm": 0.06496538757288194, + "learning_rate": 1.2188766810561614e-06, + "loss": 0.0006, + "step": 27043 + }, + { + "epoch": 10.997966653111021, + "grad_norm": 0.0016852843077207278, + "learning_rate": 1.218391961648977e-06, + "loss": 0.0, + "step": 27044 + }, + { + "epoch": 10.998373322488817, + "grad_norm": 0.6456155470206566, + "learning_rate": 1.2179073323894197e-06, + "loss": 0.0027, + "step": 27045 + }, + { + "epoch": 10.998779991866613, + "grad_norm": 0.005429779038937189, + "learning_rate": 1.2174227932824634e-06, + "loss": 0.0001, + "step": 27046 + }, + { + "epoch": 10.999186661244408, + "grad_norm": 0.0001283472441720235, + "learning_rate": 1.2169383443330819e-06, + "loss": 0.0, + "step": 27047 + }, + { + "epoch": 10.999593330622204, + "grad_norm": 0.0033738802866346406, + "learning_rate": 1.2164539855462488e-06, + "loss": 0.0, + "step": 27048 + }, + { + "epoch": 11.0, + "grad_norm": 3.951154254731674, + "learning_rate": 1.2159697169269335e-06, + "loss": 0.0478, + "step": 27049 + }, + { + "epoch": 11.000406669377796, + "grad_norm": 0.03545600313688516, + "learning_rate": 1.2154855384801124e-06, + "loss": 0.0003, + "step": 27050 + }, + { + "epoch": 11.000813338755592, + "grad_norm": 0.02267316493146181, + "learning_rate": 
1.2150014502107543e-06, + "loss": 0.0002, + "step": 27051 + }, + { + "epoch": 11.001220008133387, + "grad_norm": 0.028494130443649054, + "learning_rate": 1.2145174521238269e-06, + "loss": 0.0003, + "step": 27052 + }, + { + "epoch": 11.001626677511183, + "grad_norm": 0.09256313484687965, + "learning_rate": 1.214033544224299e-06, + "loss": 0.0006, + "step": 27053 + }, + { + "epoch": 11.002033346888979, + "grad_norm": 0.020124027875119747, + "learning_rate": 1.213549726517139e-06, + "loss": 0.0002, + "step": 27054 + }, + { + "epoch": 11.002440016266775, + "grad_norm": 0.00871551133280246, + "learning_rate": 1.2130659990073146e-06, + "loss": 0.0001, + "step": 27055 + }, + { + "epoch": 11.00284668564457, + "grad_norm": 0.0025416137996935846, + "learning_rate": 1.2125823616997868e-06, + "loss": 0.0, + "step": 27056 + }, + { + "epoch": 11.003253355022366, + "grad_norm": 0.08036515352544979, + "learning_rate": 1.212098814599525e-06, + "loss": 0.0006, + "step": 27057 + }, + { + "epoch": 11.003660024400162, + "grad_norm": 0.032145489477715505, + "learning_rate": 1.2116153577114932e-06, + "loss": 0.0004, + "step": 27058 + }, + { + "epoch": 11.004066693777958, + "grad_norm": 0.0007763434461785987, + "learning_rate": 1.2111319910406517e-06, + "loss": 0.0, + "step": 27059 + }, + { + "epoch": 11.004473363155755, + "grad_norm": 0.0002528797781986009, + "learning_rate": 1.2106487145919642e-06, + "loss": 0.0, + "step": 27060 + }, + { + "epoch": 11.004880032533551, + "grad_norm": 0.002957715143667355, + "learning_rate": 1.2101655283703916e-06, + "loss": 0.0, + "step": 27061 + }, + { + "epoch": 11.005286701911347, + "grad_norm": 0.0230686363431556, + "learning_rate": 1.2096824323808909e-06, + "loss": 0.0001, + "step": 27062 + }, + { + "epoch": 11.005693371289142, + "grad_norm": 0.006384048508094093, + "learning_rate": 1.209199426628427e-06, + "loss": 0.0001, + "step": 27063 + }, + { + "epoch": 11.006100040666938, + "grad_norm": 0.00015904216733106567, + "learning_rate": 1.2087165111179545e-06, + "loss": 0.0, + "step": 27064 + }, + { + "epoch": 11.006506710044734, + "grad_norm": 0.5281498494156599, + "learning_rate": 1.2082336858544318e-06, + "loss": 0.0043, + "step": 27065 + }, + { + "epoch": 11.00691337942253, + "grad_norm": 0.2561217245091682, + "learning_rate": 1.2077509508428154e-06, + "loss": 0.0021, + "step": 27066 + }, + { + "epoch": 11.007320048800326, + "grad_norm": 0.024318454569440814, + "learning_rate": 1.2072683060880608e-06, + "loss": 0.0002, + "step": 27067 + }, + { + "epoch": 11.007726718178121, + "grad_norm": 0.0011558623680765119, + "learning_rate": 1.2067857515951208e-06, + "loss": 0.0, + "step": 27068 + }, + { + "epoch": 11.008133387555917, + "grad_norm": 0.0006277515417741736, + "learning_rate": 1.2063032873689495e-06, + "loss": 0.0, + "step": 27069 + }, + { + "epoch": 11.008540056933713, + "grad_norm": 0.011076498848254545, + "learning_rate": 1.2058209134145037e-06, + "loss": 0.0001, + "step": 27070 + }, + { + "epoch": 11.008946726311509, + "grad_norm": 0.008735914539613436, + "learning_rate": 1.2053386297367308e-06, + "loss": 0.0001, + "step": 27071 + }, + { + "epoch": 11.009353395689304, + "grad_norm": 0.008266662046606636, + "learning_rate": 1.2048564363405846e-06, + "loss": 0.0001, + "step": 27072 + }, + { + "epoch": 11.0097600650671, + "grad_norm": 0.009648167108284611, + "learning_rate": 1.204374333231012e-06, + "loss": 0.0001, + "step": 27073 + }, + { + "epoch": 11.010166734444896, + "grad_norm": 0.002451092951985627, + "learning_rate": 1.2038923204129649e-06, + "loss": 0.0, + 
"step": 27074 + }, + { + "epoch": 11.010573403822692, + "grad_norm": 0.011336845710058347, + "learning_rate": 1.2034103978913869e-06, + "loss": 0.0001, + "step": 27075 + }, + { + "epoch": 11.010980073200487, + "grad_norm": 0.011074372014100319, + "learning_rate": 1.2029285656712308e-06, + "loss": 0.0001, + "step": 27076 + }, + { + "epoch": 11.011386742578283, + "grad_norm": 0.031970328209722856, + "learning_rate": 1.2024468237574417e-06, + "loss": 0.0004, + "step": 27077 + }, + { + "epoch": 11.011793411956079, + "grad_norm": 0.2722266716620063, + "learning_rate": 1.2019651721549619e-06, + "loss": 0.0022, + "step": 27078 + }, + { + "epoch": 11.012200081333875, + "grad_norm": 0.004827604870896698, + "learning_rate": 1.201483610868739e-06, + "loss": 0.0, + "step": 27079 + }, + { + "epoch": 11.012606750711672, + "grad_norm": 0.014849566810025822, + "learning_rate": 1.2010021399037141e-06, + "loss": 0.0001, + "step": 27080 + }, + { + "epoch": 11.013013420089468, + "grad_norm": 0.005049366717900684, + "learning_rate": 1.2005207592648306e-06, + "loss": 0.0001, + "step": 27081 + }, + { + "epoch": 11.013420089467264, + "grad_norm": 0.00788608901719455, + "learning_rate": 1.2000394689570282e-06, + "loss": 0.0001, + "step": 27082 + }, + { + "epoch": 11.01382675884506, + "grad_norm": 0.12791786672150884, + "learning_rate": 1.1995582689852526e-06, + "loss": 0.0011, + "step": 27083 + }, + { + "epoch": 11.014233428222855, + "grad_norm": 0.0020645349891958552, + "learning_rate": 1.1990771593544392e-06, + "loss": 0.0, + "step": 27084 + }, + { + "epoch": 11.014640097600651, + "grad_norm": 0.001500077636241617, + "learning_rate": 1.198596140069529e-06, + "loss": 0.0, + "step": 27085 + }, + { + "epoch": 11.015046766978447, + "grad_norm": 0.0032676278455464713, + "learning_rate": 1.1981152111354588e-06, + "loss": 0.0, + "step": 27086 + }, + { + "epoch": 11.015453436356243, + "grad_norm": 0.00037096895411822193, + "learning_rate": 1.197634372557166e-06, + "loss": 0.0, + "step": 27087 + }, + { + "epoch": 11.015860105734038, + "grad_norm": 0.0625839029000177, + "learning_rate": 1.1971536243395853e-06, + "loss": 0.0005, + "step": 27088 + }, + { + "epoch": 11.016266775111834, + "grad_norm": 0.001990227830283025, + "learning_rate": 1.1966729664876541e-06, + "loss": 0.0, + "step": 27089 + }, + { + "epoch": 11.01667344448963, + "grad_norm": 0.2924518535135761, + "learning_rate": 1.196192399006305e-06, + "loss": 0.002, + "step": 27090 + }, + { + "epoch": 11.017080113867426, + "grad_norm": 0.004424394283342526, + "learning_rate": 1.195711921900473e-06, + "loss": 0.0, + "step": 27091 + }, + { + "epoch": 11.017486783245221, + "grad_norm": 0.008749481854985435, + "learning_rate": 1.1952315351750886e-06, + "loss": 0.0001, + "step": 27092 + }, + { + "epoch": 11.017893452623017, + "grad_norm": 0.004667894146532947, + "learning_rate": 1.1947512388350836e-06, + "loss": 0.0, + "step": 27093 + }, + { + "epoch": 11.018300122000813, + "grad_norm": 0.07791478276583193, + "learning_rate": 1.194271032885389e-06, + "loss": 0.0009, + "step": 27094 + }, + { + "epoch": 11.018706791378609, + "grad_norm": 0.00975663928344825, + "learning_rate": 1.1937909173309325e-06, + "loss": 0.0001, + "step": 27095 + }, + { + "epoch": 11.019113460756405, + "grad_norm": 0.006430052074227375, + "learning_rate": 1.1933108921766457e-06, + "loss": 0.0001, + "step": 27096 + }, + { + "epoch": 11.0195201301342, + "grad_norm": 0.06516025613539055, + "learning_rate": 1.1928309574274543e-06, + "loss": 0.0005, + "step": 27097 + }, + { + "epoch": 11.019926799511996, 
+ "grad_norm": 0.5209858492475963, + "learning_rate": 1.192351113088287e-06, + "loss": 0.0051, + "step": 27098 + }, + { + "epoch": 11.020333468889792, + "grad_norm": 0.09304332924071461, + "learning_rate": 1.1918713591640675e-06, + "loss": 0.0009, + "step": 27099 + }, + { + "epoch": 11.020740138267588, + "grad_norm": 0.12434904734673202, + "learning_rate": 1.1913916956597215e-06, + "loss": 0.0011, + "step": 27100 + }, + { + "epoch": 11.021146807645385, + "grad_norm": 0.001663061743585701, + "learning_rate": 1.1909121225801723e-06, + "loss": 0.0, + "step": 27101 + }, + { + "epoch": 11.021553477023181, + "grad_norm": 0.04295415108116272, + "learning_rate": 1.1904326399303446e-06, + "loss": 0.0003, + "step": 27102 + }, + { + "epoch": 11.021960146400977, + "grad_norm": 0.11979299896639442, + "learning_rate": 1.1899532477151598e-06, + "loss": 0.0009, + "step": 27103 + }, + { + "epoch": 11.022366815778772, + "grad_norm": 0.00039911493564380397, + "learning_rate": 1.1894739459395387e-06, + "loss": 0.0, + "step": 27104 + }, + { + "epoch": 11.022773485156568, + "grad_norm": 0.0011327205834765605, + "learning_rate": 1.1889947346084019e-06, + "loss": 0.0, + "step": 27105 + }, + { + "epoch": 11.023180154534364, + "grad_norm": 0.3439833348184874, + "learning_rate": 1.1885156137266684e-06, + "loss": 0.0031, + "step": 27106 + }, + { + "epoch": 11.02358682391216, + "grad_norm": 0.11818877676826575, + "learning_rate": 1.1880365832992569e-06, + "loss": 0.0011, + "step": 27107 + }, + { + "epoch": 11.023993493289955, + "grad_norm": 0.016911881171762996, + "learning_rate": 1.1875576433310832e-06, + "loss": 0.0001, + "step": 27108 + }, + { + "epoch": 11.024400162667751, + "grad_norm": 0.005391642834927565, + "learning_rate": 1.1870787938270678e-06, + "loss": 0.0, + "step": 27109 + }, + { + "epoch": 11.024806832045547, + "grad_norm": 0.04226790281163355, + "learning_rate": 1.1866000347921235e-06, + "loss": 0.0004, + "step": 27110 + }, + { + "epoch": 11.025213501423343, + "grad_norm": 0.04471976848512393, + "learning_rate": 1.1861213662311654e-06, + "loss": 0.0004, + "step": 27111 + }, + { + "epoch": 11.025620170801139, + "grad_norm": 0.7762069289422922, + "learning_rate": 1.1856427881491074e-06, + "loss": 0.0079, + "step": 27112 + }, + { + "epoch": 11.026026840178934, + "grad_norm": 0.0040293492867943155, + "learning_rate": 1.1851643005508628e-06, + "loss": 0.0, + "step": 27113 + }, + { + "epoch": 11.02643350955673, + "grad_norm": 0.008897266740874312, + "learning_rate": 1.1846859034413404e-06, + "loss": 0.0001, + "step": 27114 + }, + { + "epoch": 11.026840178934526, + "grad_norm": 0.0003520080298906406, + "learning_rate": 1.1842075968254574e-06, + "loss": 0.0, + "step": 27115 + }, + { + "epoch": 11.027246848312322, + "grad_norm": 0.012260665023386205, + "learning_rate": 1.1837293807081185e-06, + "loss": 0.0001, + "step": 27116 + }, + { + "epoch": 11.027653517690117, + "grad_norm": 0.06618365257556574, + "learning_rate": 1.1832512550942354e-06, + "loss": 0.0006, + "step": 27117 + }, + { + "epoch": 11.028060187067913, + "grad_norm": 0.00012884087000716336, + "learning_rate": 1.1827732199887154e-06, + "loss": 0.0, + "step": 27118 + }, + { + "epoch": 11.028466856445709, + "grad_norm": 0.001775491674638272, + "learning_rate": 1.1822952753964667e-06, + "loss": 0.0, + "step": 27119 + }, + { + "epoch": 11.028873525823505, + "grad_norm": 0.00917781712359505, + "learning_rate": 1.1818174213223943e-06, + "loss": 0.0, + "step": 27120 + }, + { + "epoch": 11.029280195201302, + "grad_norm": 0.08175975646637726, + 
"learning_rate": 1.181339657771402e-06, + "loss": 0.0008, + "step": 27121 + }, + { + "epoch": 11.029686864579098, + "grad_norm": 0.0012791018608779028, + "learning_rate": 1.1808619847483982e-06, + "loss": 0.0, + "step": 27122 + }, + { + "epoch": 11.030093533956894, + "grad_norm": 0.011967607138374454, + "learning_rate": 1.1803844022582855e-06, + "loss": 0.0001, + "step": 27123 + }, + { + "epoch": 11.03050020333469, + "grad_norm": 0.001610472787304514, + "learning_rate": 1.1799069103059646e-06, + "loss": 0.0, + "step": 27124 + }, + { + "epoch": 11.030906872712485, + "grad_norm": 0.006391468181296978, + "learning_rate": 1.1794295088963391e-06, + "loss": 0.0001, + "step": 27125 + }, + { + "epoch": 11.031313542090281, + "grad_norm": 0.007749076481763246, + "learning_rate": 1.1789521980343087e-06, + "loss": 0.0001, + "step": 27126 + }, + { + "epoch": 11.031720211468077, + "grad_norm": 0.025143059809987288, + "learning_rate": 1.1784749777247718e-06, + "loss": 0.0003, + "step": 27127 + }, + { + "epoch": 11.032126880845873, + "grad_norm": 0.010715908031328748, + "learning_rate": 1.1779978479726307e-06, + "loss": 0.0001, + "step": 27128 + }, + { + "epoch": 11.032533550223668, + "grad_norm": 0.010577821994891781, + "learning_rate": 1.1775208087827828e-06, + "loss": 0.0001, + "step": 27129 + }, + { + "epoch": 11.032940219601464, + "grad_norm": 0.0035166643611258984, + "learning_rate": 1.177043860160123e-06, + "loss": 0.0, + "step": 27130 + }, + { + "epoch": 11.03334688897926, + "grad_norm": 0.1854740993528143, + "learning_rate": 1.1765670021095487e-06, + "loss": 0.0015, + "step": 27131 + }, + { + "epoch": 11.033753558357056, + "grad_norm": 0.000928289527888258, + "learning_rate": 1.1760902346359549e-06, + "loss": 0.0, + "step": 27132 + }, + { + "epoch": 11.034160227734851, + "grad_norm": 0.19987921057934918, + "learning_rate": 1.175613557744235e-06, + "loss": 0.0019, + "step": 27133 + }, + { + "epoch": 11.034566897112647, + "grad_norm": 0.000663831004525665, + "learning_rate": 1.1751369714392846e-06, + "loss": 0.0, + "step": 27134 + }, + { + "epoch": 11.034973566490443, + "grad_norm": 0.022792336147009207, + "learning_rate": 1.1746604757259939e-06, + "loss": 0.0001, + "step": 27135 + }, + { + "epoch": 11.035380235868239, + "grad_norm": 0.00016786436571467858, + "learning_rate": 1.1741840706092555e-06, + "loss": 0.0, + "step": 27136 + }, + { + "epoch": 11.035786905246034, + "grad_norm": 1.0246789953503748e-05, + "learning_rate": 1.173707756093959e-06, + "loss": 0.0, + "step": 27137 + }, + { + "epoch": 11.03619357462383, + "grad_norm": 0.0024888615543664657, + "learning_rate": 1.173231532184994e-06, + "loss": 0.0, + "step": 27138 + }, + { + "epoch": 11.036600244001626, + "grad_norm": 0.0016071752724733054, + "learning_rate": 1.1727553988872486e-06, + "loss": 0.0, + "step": 27139 + }, + { + "epoch": 11.037006913379422, + "grad_norm": 4.0286421581288366e-05, + "learning_rate": 1.1722793562056146e-06, + "loss": 0.0, + "step": 27140 + }, + { + "epoch": 11.037413582757218, + "grad_norm": 0.001992534799900649, + "learning_rate": 1.1718034041449744e-06, + "loss": 0.0, + "step": 27141 + }, + { + "epoch": 11.037820252135015, + "grad_norm": 0.05691379672696785, + "learning_rate": 1.1713275427102156e-06, + "loss": 0.0004, + "step": 27142 + }, + { + "epoch": 11.03822692151281, + "grad_norm": 0.04036393563287358, + "learning_rate": 1.170851771906224e-06, + "loss": 0.0002, + "step": 27143 + }, + { + "epoch": 11.038633590890607, + "grad_norm": 0.04660212875167872, + "learning_rate": 1.1703760917378816e-06, + "loss": 
0.0006, + "step": 27144 + }, + { + "epoch": 11.039040260268402, + "grad_norm": 0.03831233109428799, + "learning_rate": 1.1699005022100707e-06, + "loss": 0.0004, + "step": 27145 + }, + { + "epoch": 11.039446929646198, + "grad_norm": 0.005395435754215376, + "learning_rate": 1.1694250033276778e-06, + "loss": 0.0001, + "step": 27146 + }, + { + "epoch": 11.039853599023994, + "grad_norm": 0.09734804975922193, + "learning_rate": 1.1689495950955808e-06, + "loss": 0.0008, + "step": 27147 + }, + { + "epoch": 11.04026026840179, + "grad_norm": 0.008680074745965173, + "learning_rate": 1.1684742775186608e-06, + "loss": 0.0001, + "step": 27148 + }, + { + "epoch": 11.040666937779585, + "grad_norm": 0.008739862406669066, + "learning_rate": 1.1679990506017979e-06, + "loss": 0.0001, + "step": 27149 + }, + { + "epoch": 11.041073607157381, + "grad_norm": 0.0022409255630795794, + "learning_rate": 1.1675239143498684e-06, + "loss": 0.0, + "step": 27150 + }, + { + "epoch": 11.041480276535177, + "grad_norm": 0.020779677154854796, + "learning_rate": 1.1670488687677528e-06, + "loss": 0.0001, + "step": 27151 + }, + { + "epoch": 11.041886945912973, + "grad_norm": 0.00407445897351255, + "learning_rate": 1.1665739138603228e-06, + "loss": 0.0, + "step": 27152 + }, + { + "epoch": 11.042293615290768, + "grad_norm": 0.0024434922251894368, + "learning_rate": 1.1660990496324598e-06, + "loss": 0.0, + "step": 27153 + }, + { + "epoch": 11.042700284668564, + "grad_norm": 0.04293349834952909, + "learning_rate": 1.1656242760890368e-06, + "loss": 0.0004, + "step": 27154 + }, + { + "epoch": 11.04310695404636, + "grad_norm": 0.0003368811736301078, + "learning_rate": 1.1651495932349266e-06, + "loss": 0.0, + "step": 27155 + }, + { + "epoch": 11.043513623424156, + "grad_norm": 0.010173500797536912, + "learning_rate": 1.164675001075002e-06, + "loss": 0.0001, + "step": 27156 + }, + { + "epoch": 11.043920292801952, + "grad_norm": 0.019559875585274444, + "learning_rate": 1.1642004996141343e-06, + "loss": 0.0002, + "step": 27157 + }, + { + "epoch": 11.044326962179747, + "grad_norm": 2.816316483643744, + "learning_rate": 1.1637260888571967e-06, + "loss": 0.019, + "step": 27158 + }, + { + "epoch": 11.044733631557543, + "grad_norm": 0.12688122367251306, + "learning_rate": 1.1632517688090561e-06, + "loss": 0.0007, + "step": 27159 + }, + { + "epoch": 11.045140300935339, + "grad_norm": 0.00018246138759281854, + "learning_rate": 1.1627775394745844e-06, + "loss": 0.0, + "step": 27160 + }, + { + "epoch": 11.045546970313135, + "grad_norm": 0.00994574101497235, + "learning_rate": 1.1623034008586497e-06, + "loss": 0.0001, + "step": 27161 + }, + { + "epoch": 11.045953639690932, + "grad_norm": 0.011882721724652084, + "learning_rate": 1.1618293529661184e-06, + "loss": 0.0001, + "step": 27162 + }, + { + "epoch": 11.046360309068728, + "grad_norm": 0.04344496615385795, + "learning_rate": 1.1613553958018576e-06, + "loss": 0.0005, + "step": 27163 + }, + { + "epoch": 11.046766978446524, + "grad_norm": 0.023347754202541017, + "learning_rate": 1.1608815293707309e-06, + "loss": 0.0001, + "step": 27164 + }, + { + "epoch": 11.04717364782432, + "grad_norm": 0.05532656584984634, + "learning_rate": 1.1604077536776026e-06, + "loss": 0.0006, + "step": 27165 + }, + { + "epoch": 11.047580317202115, + "grad_norm": 8.057360173195913e-05, + "learning_rate": 1.1599340687273398e-06, + "loss": 0.0, + "step": 27166 + }, + { + "epoch": 11.047986986579911, + "grad_norm": 0.007893839167731415, + "learning_rate": 1.1594604745248028e-06, + "loss": 0.0001, + "step": 27167 + }, + { + 
"epoch": 11.048393655957707, + "grad_norm": 0.017944688560202822, + "learning_rate": 1.1589869710748524e-06, + "loss": 0.0003, + "step": 27168 + }, + { + "epoch": 11.048800325335502, + "grad_norm": 0.006440355012752806, + "learning_rate": 1.1585135583823514e-06, + "loss": 0.0001, + "step": 27169 + }, + { + "epoch": 11.049206994713298, + "grad_norm": 0.013653211331088929, + "learning_rate": 1.158040236452158e-06, + "loss": 0.0001, + "step": 27170 + }, + { + "epoch": 11.049613664091094, + "grad_norm": 0.0009679104196679866, + "learning_rate": 1.157567005289132e-06, + "loss": 0.0, + "step": 27171 + }, + { + "epoch": 11.05002033346889, + "grad_norm": 0.000869798861209445, + "learning_rate": 1.1570938648981288e-06, + "loss": 0.0, + "step": 27172 + }, + { + "epoch": 11.050427002846686, + "grad_norm": 0.0036487171102331183, + "learning_rate": 1.1566208152840098e-06, + "loss": 0.0001, + "step": 27173 + }, + { + "epoch": 11.050833672224481, + "grad_norm": 0.013933369917907907, + "learning_rate": 1.1561478564516293e-06, + "loss": 0.0001, + "step": 27174 + }, + { + "epoch": 11.051240341602277, + "grad_norm": 0.14357938223868372, + "learning_rate": 1.1556749884058416e-06, + "loss": 0.0005, + "step": 27175 + }, + { + "epoch": 11.051647010980073, + "grad_norm": 0.0006746800609101715, + "learning_rate": 1.1552022111515004e-06, + "loss": 0.0, + "step": 27176 + }, + { + "epoch": 11.052053680357869, + "grad_norm": 0.010399527929113087, + "learning_rate": 1.154729524693462e-06, + "loss": 0.0001, + "step": 27177 + }, + { + "epoch": 11.052460349735664, + "grad_norm": 4.3614385115119365, + "learning_rate": 1.1542569290365734e-06, + "loss": 0.0666, + "step": 27178 + }, + { + "epoch": 11.05286701911346, + "grad_norm": 0.009765409885399085, + "learning_rate": 1.1537844241856911e-06, + "loss": 0.0001, + "step": 27179 + }, + { + "epoch": 11.053273688491256, + "grad_norm": 0.05034131725206782, + "learning_rate": 1.153312010145665e-06, + "loss": 0.0004, + "step": 27180 + }, + { + "epoch": 11.053680357869052, + "grad_norm": 0.06285018511923693, + "learning_rate": 1.1528396869213431e-06, + "loss": 0.0003, + "step": 27181 + }, + { + "epoch": 11.054087027246847, + "grad_norm": 0.0004666780560837485, + "learning_rate": 1.1523674545175745e-06, + "loss": 0.0, + "step": 27182 + }, + { + "epoch": 11.054493696624645, + "grad_norm": 0.06436401570409582, + "learning_rate": 1.1518953129392063e-06, + "loss": 0.0005, + "step": 27183 + }, + { + "epoch": 11.05490036600244, + "grad_norm": 0.024105401336312157, + "learning_rate": 1.151423262191087e-06, + "loss": 0.0003, + "step": 27184 + }, + { + "epoch": 11.055307035380237, + "grad_norm": 0.10551915069846753, + "learning_rate": 1.1509513022780583e-06, + "loss": 0.0011, + "step": 27185 + }, + { + "epoch": 11.055713704758032, + "grad_norm": 0.0004173819119157455, + "learning_rate": 1.1504794332049706e-06, + "loss": 0.0, + "step": 27186 + }, + { + "epoch": 11.056120374135828, + "grad_norm": 0.008185155955998634, + "learning_rate": 1.1500076549766659e-06, + "loss": 0.0, + "step": 27187 + }, + { + "epoch": 11.056527043513624, + "grad_norm": 0.002677214145057436, + "learning_rate": 1.1495359675979857e-06, + "loss": 0.0, + "step": 27188 + }, + { + "epoch": 11.05693371289142, + "grad_norm": 0.00022003039303939624, + "learning_rate": 1.1490643710737736e-06, + "loss": 0.0, + "step": 27189 + }, + { + "epoch": 11.057340382269215, + "grad_norm": 0.006010641530938276, + "learning_rate": 1.1485928654088708e-06, + "loss": 0.0, + "step": 27190 + }, + { + "epoch": 11.057747051647011, + "grad_norm": 
0.00902262748967417, + "learning_rate": 1.148121450608114e-06, + "loss": 0.0001, + "step": 27191 + }, + { + "epoch": 11.058153721024807, + "grad_norm": 0.006401062574022768, + "learning_rate": 1.1476501266763484e-06, + "loss": 0.0001, + "step": 27192 + }, + { + "epoch": 11.058560390402603, + "grad_norm": 0.2671117751386666, + "learning_rate": 1.1471788936184092e-06, + "loss": 0.0028, + "step": 27193 + }, + { + "epoch": 11.058967059780398, + "grad_norm": 0.08637793142754795, + "learning_rate": 1.1467077514391345e-06, + "loss": 0.0009, + "step": 27194 + }, + { + "epoch": 11.059373729158194, + "grad_norm": 0.0017784364191063735, + "learning_rate": 1.1462367001433606e-06, + "loss": 0.0, + "step": 27195 + }, + { + "epoch": 11.05978039853599, + "grad_norm": 0.036539210203635354, + "learning_rate": 1.1457657397359224e-06, + "loss": 0.0004, + "step": 27196 + }, + { + "epoch": 11.060187067913786, + "grad_norm": 0.2322491912909521, + "learning_rate": 1.1452948702216549e-06, + "loss": 0.0036, + "step": 27197 + }, + { + "epoch": 11.060593737291581, + "grad_norm": 0.0016641562432228682, + "learning_rate": 1.144824091605391e-06, + "loss": 0.0, + "step": 27198 + }, + { + "epoch": 11.061000406669377, + "grad_norm": 0.0037294788025855757, + "learning_rate": 1.1443534038919658e-06, + "loss": 0.0, + "step": 27199 + }, + { + "epoch": 11.061407076047173, + "grad_norm": 0.0032560473197702977, + "learning_rate": 1.1438828070862108e-06, + "loss": 0.0, + "step": 27200 + }, + { + "epoch": 11.061813745424969, + "grad_norm": 0.004891810862617426, + "learning_rate": 1.1434123011929543e-06, + "loss": 0.0, + "step": 27201 + }, + { + "epoch": 11.062220414802765, + "grad_norm": 0.009289641780213739, + "learning_rate": 1.1429418862170293e-06, + "loss": 0.0001, + "step": 27202 + }, + { + "epoch": 11.062627084180562, + "grad_norm": 0.015474047497258024, + "learning_rate": 1.142471562163263e-06, + "loss": 0.0002, + "step": 27203 + }, + { + "epoch": 11.063033753558358, + "grad_norm": 0.06407045374737005, + "learning_rate": 1.1420013290364829e-06, + "loss": 0.0006, + "step": 27204 + }, + { + "epoch": 11.063440422936154, + "grad_norm": 0.045391723244848456, + "learning_rate": 1.141531186841518e-06, + "loss": 0.0004, + "step": 27205 + }, + { + "epoch": 11.06384709231395, + "grad_norm": 0.08068565940483761, + "learning_rate": 1.1410611355831946e-06, + "loss": 0.0008, + "step": 27206 + }, + { + "epoch": 11.064253761691745, + "grad_norm": 0.04695855129353703, + "learning_rate": 1.1405911752663368e-06, + "loss": 0.0005, + "step": 27207 + }, + { + "epoch": 11.064660431069541, + "grad_norm": 0.16590501206771546, + "learning_rate": 1.1401213058957695e-06, + "loss": 0.001, + "step": 27208 + }, + { + "epoch": 11.065067100447337, + "grad_norm": 0.019437647566589004, + "learning_rate": 1.1396515274763164e-06, + "loss": 0.0002, + "step": 27209 + }, + { + "epoch": 11.065473769825132, + "grad_norm": 0.004353107213843255, + "learning_rate": 1.1391818400127995e-06, + "loss": 0.0001, + "step": 27210 + }, + { + "epoch": 11.065880439202928, + "grad_norm": 0.09522118646157982, + "learning_rate": 1.1387122435100384e-06, + "loss": 0.0007, + "step": 27211 + }, + { + "epoch": 11.066287108580724, + "grad_norm": 0.0017321738483176746, + "learning_rate": 1.138242737972859e-06, + "loss": 0.0, + "step": 27212 + }, + { + "epoch": 11.06669377795852, + "grad_norm": 0.0010913617351481367, + "learning_rate": 1.137773323406076e-06, + "loss": 0.0, + "step": 27213 + }, + { + "epoch": 11.067100447336315, + "grad_norm": 0.048529133593769334, + "learning_rate": 
1.1373039998145119e-06, + "loss": 0.0004, + "step": 27214 + }, + { + "epoch": 11.067507116714111, + "grad_norm": 0.0018847096560286015, + "learning_rate": 1.1368347672029822e-06, + "loss": 0.0, + "step": 27215 + }, + { + "epoch": 11.067913786091907, + "grad_norm": 0.005169905759115373, + "learning_rate": 1.1363656255763045e-06, + "loss": 0.0, + "step": 27216 + }, + { + "epoch": 11.068320455469703, + "grad_norm": 0.1887540135816875, + "learning_rate": 1.1358965749392914e-06, + "loss": 0.0019, + "step": 27217 + }, + { + "epoch": 11.068727124847499, + "grad_norm": 0.004048135708656605, + "learning_rate": 1.1354276152967646e-06, + "loss": 0.0, + "step": 27218 + }, + { + "epoch": 11.069133794225294, + "grad_norm": 0.14822402505462445, + "learning_rate": 1.134958746653534e-06, + "loss": 0.0015, + "step": 27219 + }, + { + "epoch": 11.06954046360309, + "grad_norm": 0.30958389289480337, + "learning_rate": 1.134489969014414e-06, + "loss": 0.0023, + "step": 27220 + }, + { + "epoch": 11.069947132980886, + "grad_norm": 0.031455989062366525, + "learning_rate": 1.1340212823842156e-06, + "loss": 0.0001, + "step": 27221 + }, + { + "epoch": 11.070353802358682, + "grad_norm": 0.2428737654188398, + "learning_rate": 1.1335526867677514e-06, + "loss": 0.0014, + "step": 27222 + }, + { + "epoch": 11.070760471736477, + "grad_norm": 0.011443009144016962, + "learning_rate": 1.1330841821698314e-06, + "loss": 0.0001, + "step": 27223 + }, + { + "epoch": 11.071167141114275, + "grad_norm": 0.015535618802555917, + "learning_rate": 1.1326157685952632e-06, + "loss": 0.0002, + "step": 27224 + }, + { + "epoch": 11.07157381049207, + "grad_norm": 0.03070415939788094, + "learning_rate": 1.132147446048858e-06, + "loss": 0.0003, + "step": 27225 + }, + { + "epoch": 11.071980479869866, + "grad_norm": 0.015449901529698872, + "learning_rate": 1.1316792145354238e-06, + "loss": 0.0002, + "step": 27226 + }, + { + "epoch": 11.072387149247662, + "grad_norm": 0.0020547623521370616, + "learning_rate": 1.1312110740597647e-06, + "loss": 0.0, + "step": 27227 + }, + { + "epoch": 11.072793818625458, + "grad_norm": 0.06915889103432658, + "learning_rate": 1.1307430246266882e-06, + "loss": 0.0006, + "step": 27228 + }, + { + "epoch": 11.073200488003254, + "grad_norm": 0.029110162169498632, + "learning_rate": 1.1302750662409978e-06, + "loss": 0.0002, + "step": 27229 + }, + { + "epoch": 11.07360715738105, + "grad_norm": 0.05876547124273673, + "learning_rate": 1.1298071989074955e-06, + "loss": 0.0005, + "step": 27230 + }, + { + "epoch": 11.074013826758845, + "grad_norm": 0.005166407684778557, + "learning_rate": 1.1293394226309896e-06, + "loss": 0.0, + "step": 27231 + }, + { + "epoch": 11.074420496136641, + "grad_norm": 0.0018768618178940219, + "learning_rate": 1.1288717374162805e-06, + "loss": 0.0, + "step": 27232 + }, + { + "epoch": 11.074827165514437, + "grad_norm": 0.0023049596621999868, + "learning_rate": 1.128404143268166e-06, + "loss": 0.0, + "step": 27233 + }, + { + "epoch": 11.075233834892233, + "grad_norm": 0.009390170220352241, + "learning_rate": 1.1279366401914472e-06, + "loss": 0.0001, + "step": 27234 + }, + { + "epoch": 11.075640504270028, + "grad_norm": 0.02323724619103589, + "learning_rate": 1.1274692281909227e-06, + "loss": 0.0002, + "step": 27235 + }, + { + "epoch": 11.076047173647824, + "grad_norm": 0.0019676937162257683, + "learning_rate": 1.1270019072713934e-06, + "loss": 0.0, + "step": 27236 + }, + { + "epoch": 11.07645384302562, + "grad_norm": 0.022401683837164392, + "learning_rate": 1.1265346774376562e-06, + "loss": 0.0002, + 
"step": 27237 + }, + { + "epoch": 11.076860512403416, + "grad_norm": 0.014825865671318082, + "learning_rate": 1.1260675386945053e-06, + "loss": 0.0001, + "step": 27238 + }, + { + "epoch": 11.077267181781211, + "grad_norm": 0.0038573787460642363, + "learning_rate": 1.1256004910467376e-06, + "loss": 0.0, + "step": 27239 + }, + { + "epoch": 11.077673851159007, + "grad_norm": 0.023409175055646773, + "learning_rate": 1.1251335344991475e-06, + "loss": 0.0002, + "step": 27240 + }, + { + "epoch": 11.078080520536803, + "grad_norm": 0.0255768709090312, + "learning_rate": 1.124666669056529e-06, + "loss": 0.0002, + "step": 27241 + }, + { + "epoch": 11.078487189914599, + "grad_norm": 0.02859483291674903, + "learning_rate": 1.1241998947236709e-06, + "loss": 0.0003, + "step": 27242 + }, + { + "epoch": 11.078893859292394, + "grad_norm": 0.059252863117737566, + "learning_rate": 1.1237332115053712e-06, + "loss": 0.0009, + "step": 27243 + }, + { + "epoch": 11.079300528670192, + "grad_norm": 0.05136311215904385, + "learning_rate": 1.1232666194064168e-06, + "loss": 0.0005, + "step": 27244 + }, + { + "epoch": 11.079707198047988, + "grad_norm": 0.016539171035359252, + "learning_rate": 1.1228001184315972e-06, + "loss": 0.0002, + "step": 27245 + }, + { + "epoch": 11.080113867425784, + "grad_norm": 0.024087972265780796, + "learning_rate": 1.1223337085857033e-06, + "loss": 0.0002, + "step": 27246 + }, + { + "epoch": 11.08052053680358, + "grad_norm": 0.010641818640411546, + "learning_rate": 1.1218673898735211e-06, + "loss": 0.0001, + "step": 27247 + }, + { + "epoch": 11.080927206181375, + "grad_norm": 0.08005269978392728, + "learning_rate": 1.1214011622998367e-06, + "loss": 0.0007, + "step": 27248 + }, + { + "epoch": 11.08133387555917, + "grad_norm": 0.005262485080692492, + "learning_rate": 1.1209350258694406e-06, + "loss": 0.0, + "step": 27249 + }, + { + "epoch": 11.081740544936967, + "grad_norm": 0.04145476331220243, + "learning_rate": 1.120468980587115e-06, + "loss": 0.0004, + "step": 27250 + }, + { + "epoch": 11.082147214314762, + "grad_norm": 0.009403966933145843, + "learning_rate": 1.1200030264576433e-06, + "loss": 0.0001, + "step": 27251 + }, + { + "epoch": 11.082553883692558, + "grad_norm": 0.000897426792823906, + "learning_rate": 1.119537163485811e-06, + "loss": 0.0, + "step": 27252 + }, + { + "epoch": 11.082960553070354, + "grad_norm": 0.046242050826865846, + "learning_rate": 1.1190713916763985e-06, + "loss": 0.0003, + "step": 27253 + }, + { + "epoch": 11.08336722244815, + "grad_norm": 0.002052590733833807, + "learning_rate": 1.1186057110341876e-06, + "loss": 0.0, + "step": 27254 + }, + { + "epoch": 11.083773891825945, + "grad_norm": 0.0002742892258873722, + "learning_rate": 1.1181401215639576e-06, + "loss": 0.0, + "step": 27255 + }, + { + "epoch": 11.084180561203741, + "grad_norm": 0.001801746147406037, + "learning_rate": 1.1176746232704905e-06, + "loss": 0.0, + "step": 27256 + }, + { + "epoch": 11.084587230581537, + "grad_norm": 2.452248076448774, + "learning_rate": 1.1172092161585646e-06, + "loss": 0.0311, + "step": 27257 + }, + { + "epoch": 11.084993899959333, + "grad_norm": 0.01520541745967358, + "learning_rate": 1.1167439002329561e-06, + "loss": 0.0001, + "step": 27258 + }, + { + "epoch": 11.085400569337128, + "grad_norm": 0.42028284414112715, + "learning_rate": 1.1162786754984434e-06, + "loss": 0.0057, + "step": 27259 + }, + { + "epoch": 11.085807238714924, + "grad_norm": 0.03186850642197478, + "learning_rate": 1.1158135419598004e-06, + "loss": 0.0003, + "step": 27260 + }, + { + "epoch": 
11.08621390809272, + "grad_norm": 0.04196347079964234, + "learning_rate": 1.1153484996218001e-06, + "loss": 0.0003, + "step": 27261 + }, + { + "epoch": 11.086620577470516, + "grad_norm": 0.0038999542872867215, + "learning_rate": 1.1148835484892218e-06, + "loss": 0.0, + "step": 27262 + }, + { + "epoch": 11.087027246848312, + "grad_norm": 0.004824271537578443, + "learning_rate": 1.1144186885668352e-06, + "loss": 0.0001, + "step": 27263 + }, + { + "epoch": 11.087433916226107, + "grad_norm": 0.0910596797295059, + "learning_rate": 1.113953919859414e-06, + "loss": 0.0011, + "step": 27264 + }, + { + "epoch": 11.087840585603905, + "grad_norm": 0.023823527742758754, + "learning_rate": 1.113489242371727e-06, + "loss": 0.0003, + "step": 27265 + }, + { + "epoch": 11.0882472549817, + "grad_norm": 6.688227633723871e-06, + "learning_rate": 1.1130246561085456e-06, + "loss": 0.0, + "step": 27266 + }, + { + "epoch": 11.088653924359496, + "grad_norm": 0.06568537099240869, + "learning_rate": 1.1125601610746384e-06, + "loss": 0.0004, + "step": 27267 + }, + { + "epoch": 11.089060593737292, + "grad_norm": 0.08404994690172479, + "learning_rate": 1.1120957572747727e-06, + "loss": 0.0006, + "step": 27268 + }, + { + "epoch": 11.089467263115088, + "grad_norm": 0.0007399099303089916, + "learning_rate": 1.1116314447137189e-06, + "loss": 0.0, + "step": 27269 + }, + { + "epoch": 11.089873932492884, + "grad_norm": 0.015034581712501625, + "learning_rate": 1.1111672233962423e-06, + "loss": 0.0002, + "step": 27270 + }, + { + "epoch": 11.09028060187068, + "grad_norm": 0.010446208216706769, + "learning_rate": 1.1107030933271069e-06, + "loss": 0.0001, + "step": 27271 + }, + { + "epoch": 11.090687271248475, + "grad_norm": 0.023220957658355714, + "learning_rate": 1.1102390545110787e-06, + "loss": 0.0002, + "step": 27272 + }, + { + "epoch": 11.091093940626271, + "grad_norm": 0.18093255078457782, + "learning_rate": 1.1097751069529206e-06, + "loss": 0.002, + "step": 27273 + }, + { + "epoch": 11.091500610004067, + "grad_norm": 0.007251213689016741, + "learning_rate": 1.1093112506573955e-06, + "loss": 0.0001, + "step": 27274 + }, + { + "epoch": 11.091907279381862, + "grad_norm": 0.0015956801502303934, + "learning_rate": 1.108847485629263e-06, + "loss": 0.0, + "step": 27275 + }, + { + "epoch": 11.092313948759658, + "grad_norm": 0.01833763761128308, + "learning_rate": 1.108383811873288e-06, + "loss": 0.0001, + "step": 27276 + }, + { + "epoch": 11.092720618137454, + "grad_norm": 0.04902160263922958, + "learning_rate": 1.1079202293942282e-06, + "loss": 0.0004, + "step": 27277 + }, + { + "epoch": 11.09312728751525, + "grad_norm": 0.017200370933381268, + "learning_rate": 1.1074567381968426e-06, + "loss": 0.0001, + "step": 27278 + }, + { + "epoch": 11.093533956893046, + "grad_norm": 0.0007346322125838669, + "learning_rate": 1.1069933382858888e-06, + "loss": 0.0, + "step": 27279 + }, + { + "epoch": 11.093940626270841, + "grad_norm": 0.0002298264430695236, + "learning_rate": 1.1065300296661242e-06, + "loss": 0.0, + "step": 27280 + }, + { + "epoch": 11.094347295648637, + "grad_norm": 0.0017733394130343412, + "learning_rate": 1.1060668123423023e-06, + "loss": 0.0, + "step": 27281 + }, + { + "epoch": 11.094753965026433, + "grad_norm": 0.07281137002993326, + "learning_rate": 1.1056036863191833e-06, + "loss": 0.0004, + "step": 27282 + }, + { + "epoch": 11.095160634404229, + "grad_norm": 0.3987033545386765, + "learning_rate": 1.1051406516015196e-06, + "loss": 0.0031, + "step": 27283 + }, + { + "epoch": 11.095567303782024, + "grad_norm": 
0.0066635541332864725, + "learning_rate": 1.1046777081940629e-06, + "loss": 0.0001, + "step": 27284 + }, + { + "epoch": 11.095973973159822, + "grad_norm": 0.0006851472242209225, + "learning_rate": 1.1042148561015665e-06, + "loss": 0.0, + "step": 27285 + }, + { + "epoch": 11.096380642537618, + "grad_norm": 0.0013983622210604796, + "learning_rate": 1.1037520953287827e-06, + "loss": 0.0, + "step": 27286 + }, + { + "epoch": 11.096787311915413, + "grad_norm": 0.03097434349290865, + "learning_rate": 1.1032894258804604e-06, + "loss": 0.0003, + "step": 27287 + }, + { + "epoch": 11.09719398129321, + "grad_norm": 0.12949373149242396, + "learning_rate": 1.1028268477613468e-06, + "loss": 0.0012, + "step": 27288 + }, + { + "epoch": 11.097600650671005, + "grad_norm": 0.03118308896251042, + "learning_rate": 1.102364360976197e-06, + "loss": 0.0002, + "step": 27289 + }, + { + "epoch": 11.0980073200488, + "grad_norm": 0.002624382654978817, + "learning_rate": 1.1019019655297548e-06, + "loss": 0.0, + "step": 27290 + }, + { + "epoch": 11.098413989426597, + "grad_norm": 0.0008041501781288979, + "learning_rate": 1.1014396614267665e-06, + "loss": 0.0, + "step": 27291 + }, + { + "epoch": 11.098820658804392, + "grad_norm": 0.004628956388511406, + "learning_rate": 1.1009774486719793e-06, + "loss": 0.0001, + "step": 27292 + }, + { + "epoch": 11.099227328182188, + "grad_norm": 0.0022943159800080395, + "learning_rate": 1.1005153272701363e-06, + "loss": 0.0, + "step": 27293 + }, + { + "epoch": 11.099633997559984, + "grad_norm": 0.024888504999487266, + "learning_rate": 1.1000532972259814e-06, + "loss": 0.0003, + "step": 27294 + }, + { + "epoch": 11.10004066693778, + "grad_norm": 0.0010116686503470135, + "learning_rate": 1.0995913585442608e-06, + "loss": 0.0, + "step": 27295 + }, + { + "epoch": 11.100447336315575, + "grad_norm": 0.004332369352954409, + "learning_rate": 1.099129511229714e-06, + "loss": 0.0, + "step": 27296 + }, + { + "epoch": 11.100854005693371, + "grad_norm": 0.019843408854960005, + "learning_rate": 1.0986677552870827e-06, + "loss": 0.0002, + "step": 27297 + }, + { + "epoch": 11.101260675071167, + "grad_norm": 0.006695690803736837, + "learning_rate": 1.0982060907211067e-06, + "loss": 0.0, + "step": 27298 + }, + { + "epoch": 11.101667344448963, + "grad_norm": 0.0004214251559996466, + "learning_rate": 1.0977445175365254e-06, + "loss": 0.0, + "step": 27299 + }, + { + "epoch": 11.102074013826758, + "grad_norm": 0.012209300153625128, + "learning_rate": 1.097283035738077e-06, + "loss": 0.0001, + "step": 27300 + }, + { + "epoch": 11.102480683204554, + "grad_norm": 0.00900133479103171, + "learning_rate": 1.0968216453304969e-06, + "loss": 0.0001, + "step": 27301 + }, + { + "epoch": 11.10288735258235, + "grad_norm": 0.047038513445498066, + "learning_rate": 1.0963603463185258e-06, + "loss": 0.0004, + "step": 27302 + }, + { + "epoch": 11.103294021960146, + "grad_norm": 0.006820114461859484, + "learning_rate": 1.0958991387068962e-06, + "loss": 0.0001, + "step": 27303 + }, + { + "epoch": 11.103700691337941, + "grad_norm": 0.07592204757527173, + "learning_rate": 1.0954380225003447e-06, + "loss": 0.0003, + "step": 27304 + }, + { + "epoch": 11.104107360715737, + "grad_norm": 0.014194127968574785, + "learning_rate": 1.0949769977036017e-06, + "loss": 0.0001, + "step": 27305 + }, + { + "epoch": 11.104514030093535, + "grad_norm": 0.0278768876551285, + "learning_rate": 1.0945160643214025e-06, + "loss": 0.0002, + "step": 27306 + }, + { + "epoch": 11.10492069947133, + "grad_norm": 0.004864361746746168, + "learning_rate": 
1.0940552223584756e-06, + "loss": 0.0, + "step": 27307 + }, + { + "epoch": 11.105327368849126, + "grad_norm": 8.05449824837199e-05, + "learning_rate": 1.0935944718195568e-06, + "loss": 0.0, + "step": 27308 + }, + { + "epoch": 11.105734038226922, + "grad_norm": 0.0031456306993167807, + "learning_rate": 1.0931338127093727e-06, + "loss": 0.0, + "step": 27309 + }, + { + "epoch": 11.106140707604718, + "grad_norm": 0.039429604532825985, + "learning_rate": 1.0926732450326527e-06, + "loss": 0.0003, + "step": 27310 + }, + { + "epoch": 11.106547376982514, + "grad_norm": 0.03914359066339824, + "learning_rate": 1.0922127687941252e-06, + "loss": 0.0004, + "step": 27311 + }, + { + "epoch": 11.10695404636031, + "grad_norm": 0.02692482138973686, + "learning_rate": 1.0917523839985168e-06, + "loss": 0.0002, + "step": 27312 + }, + { + "epoch": 11.107360715738105, + "grad_norm": 0.5777148727426242, + "learning_rate": 1.091292090650553e-06, + "loss": 0.0019, + "step": 27313 + }, + { + "epoch": 11.107767385115901, + "grad_norm": 0.014801670034352255, + "learning_rate": 1.0908318887549573e-06, + "loss": 0.0002, + "step": 27314 + }, + { + "epoch": 11.108174054493697, + "grad_norm": 0.4067535517243185, + "learning_rate": 1.0903717783164592e-06, + "loss": 0.0044, + "step": 27315 + }, + { + "epoch": 11.108580723871492, + "grad_norm": 0.00591830899270436, + "learning_rate": 1.0899117593397779e-06, + "loss": 0.0001, + "step": 27316 + }, + { + "epoch": 11.108987393249288, + "grad_norm": 0.005434537527170916, + "learning_rate": 1.0894518318296376e-06, + "loss": 0.0001, + "step": 27317 + }, + { + "epoch": 11.109394062627084, + "grad_norm": 0.017992072733790984, + "learning_rate": 1.0889919957907592e-06, + "loss": 0.0002, + "step": 27318 + }, + { + "epoch": 11.10980073200488, + "grad_norm": 0.023950577288939612, + "learning_rate": 1.0885322512278618e-06, + "loss": 0.0002, + "step": 27319 + }, + { + "epoch": 11.110207401382675, + "grad_norm": 0.014163304569159084, + "learning_rate": 1.0880725981456642e-06, + "loss": 0.0001, + "step": 27320 + }, + { + "epoch": 11.110614070760471, + "grad_norm": 0.007504367268962497, + "learning_rate": 1.087613036548888e-06, + "loss": 0.0001, + "step": 27321 + }, + { + "epoch": 11.111020740138267, + "grad_norm": 0.2196730031030274, + "learning_rate": 1.0871535664422506e-06, + "loss": 0.0012, + "step": 27322 + }, + { + "epoch": 11.111427409516063, + "grad_norm": 0.003777351465836983, + "learning_rate": 1.0866941878304672e-06, + "loss": 0.0, + "step": 27323 + }, + { + "epoch": 11.111834078893859, + "grad_norm": 0.021722166581682113, + "learning_rate": 1.0862349007182537e-06, + "loss": 0.0001, + "step": 27324 + }, + { + "epoch": 11.112240748271654, + "grad_norm": 0.002691059183262977, + "learning_rate": 1.0857757051103257e-06, + "loss": 0.0, + "step": 27325 + }, + { + "epoch": 11.112647417649452, + "grad_norm": 0.00022077334191651155, + "learning_rate": 1.0853166010113958e-06, + "loss": 0.0, + "step": 27326 + }, + { + "epoch": 11.113054087027248, + "grad_norm": 0.0051020808738467625, + "learning_rate": 1.0848575884261758e-06, + "loss": 0.0, + "step": 27327 + }, + { + "epoch": 11.113460756405043, + "grad_norm": 0.044249595903368, + "learning_rate": 1.084398667359382e-06, + "loss": 0.0003, + "step": 27328 + }, + { + "epoch": 11.11386742578284, + "grad_norm": 0.010570121724697647, + "learning_rate": 1.083939837815723e-06, + "loss": 0.0001, + "step": 27329 + }, + { + "epoch": 11.114274095160635, + "grad_norm": 0.0014771444269033637, + "learning_rate": 1.0834810997999079e-06, + "loss": 0.0, + 
"step": 27330 + }, + { + "epoch": 11.11468076453843, + "grad_norm": 1.0469580566864645, + "learning_rate": 1.08302245331665e-06, + "loss": 0.0107, + "step": 27331 + }, + { + "epoch": 11.115087433916226, + "grad_norm": 0.008379537005868444, + "learning_rate": 1.08256389837065e-06, + "loss": 0.0001, + "step": 27332 + }, + { + "epoch": 11.115494103294022, + "grad_norm": 0.00035425978894088587, + "learning_rate": 1.0821054349666215e-06, + "loss": 0.0, + "step": 27333 + }, + { + "epoch": 11.115900772671818, + "grad_norm": 0.0015784746195677774, + "learning_rate": 1.08164706310927e-06, + "loss": 0.0, + "step": 27334 + }, + { + "epoch": 11.116307442049614, + "grad_norm": 0.01476065335675514, + "learning_rate": 1.0811887828032986e-06, + "loss": 0.0001, + "step": 27335 + }, + { + "epoch": 11.11671411142741, + "grad_norm": 0.0023224951199117337, + "learning_rate": 1.080730594053414e-06, + "loss": 0.0, + "step": 27336 + }, + { + "epoch": 11.117120780805205, + "grad_norm": 0.0657788313395006, + "learning_rate": 1.0802724968643186e-06, + "loss": 0.0004, + "step": 27337 + }, + { + "epoch": 11.117527450183001, + "grad_norm": 0.001364951723297571, + "learning_rate": 1.0798144912407128e-06, + "loss": 0.0, + "step": 27338 + }, + { + "epoch": 11.117934119560797, + "grad_norm": 0.0011044253323444643, + "learning_rate": 1.0793565771873026e-06, + "loss": 0.0, + "step": 27339 + }, + { + "epoch": 11.118340788938593, + "grad_norm": 0.00452894802998106, + "learning_rate": 1.0788987547087882e-06, + "loss": 0.0, + "step": 27340 + }, + { + "epoch": 11.118747458316388, + "grad_norm": 0.08679796692570861, + "learning_rate": 1.0784410238098664e-06, + "loss": 0.0007, + "step": 27341 + }, + { + "epoch": 11.119154127694184, + "grad_norm": 0.006989069876536727, + "learning_rate": 1.077983384495238e-06, + "loss": 0.0001, + "step": 27342 + }, + { + "epoch": 11.11956079707198, + "grad_norm": 0.0005672469073284671, + "learning_rate": 1.0775258367696006e-06, + "loss": 0.0, + "step": 27343 + }, + { + "epoch": 11.119967466449776, + "grad_norm": 0.044164345714326224, + "learning_rate": 1.0770683806376514e-06, + "loss": 0.0006, + "step": 27344 + }, + { + "epoch": 11.120374135827571, + "grad_norm": 0.004189068171941355, + "learning_rate": 1.0766110161040844e-06, + "loss": 0.0, + "step": 27345 + }, + { + "epoch": 11.120780805205367, + "grad_norm": 0.27472256898344094, + "learning_rate": 1.0761537431735968e-06, + "loss": 0.0026, + "step": 27346 + }, + { + "epoch": 11.121187474583165, + "grad_norm": 0.0016089072314708294, + "learning_rate": 1.075696561850884e-06, + "loss": 0.0, + "step": 27347 + }, + { + "epoch": 11.12159414396096, + "grad_norm": 0.0001546657791808252, + "learning_rate": 1.0752394721406368e-06, + "loss": 0.0, + "step": 27348 + }, + { + "epoch": 11.122000813338756, + "grad_norm": 0.029819487496968392, + "learning_rate": 1.0747824740475476e-06, + "loss": 0.0003, + "step": 27349 + }, + { + "epoch": 11.122407482716552, + "grad_norm": 0.056023269082619426, + "learning_rate": 1.0743255675763097e-06, + "loss": 0.0006, + "step": 27350 + }, + { + "epoch": 11.122814152094348, + "grad_norm": 0.12479769953993348, + "learning_rate": 1.0738687527316094e-06, + "loss": 0.0011, + "step": 27351 + }, + { + "epoch": 11.123220821472144, + "grad_norm": 0.01065396812163658, + "learning_rate": 1.0734120295181405e-06, + "loss": 0.0001, + "step": 27352 + }, + { + "epoch": 11.12362749084994, + "grad_norm": 0.05175822952216444, + "learning_rate": 1.0729553979405892e-06, + "loss": 0.0002, + "step": 27353 + }, + { + "epoch": 11.124034160227735, + 
"grad_norm": 0.05421142919499486, + "learning_rate": 1.0724988580036443e-06, + "loss": 0.0004, + "step": 27354 + }, + { + "epoch": 11.12444082960553, + "grad_norm": 0.0054697760380077246, + "learning_rate": 1.0720424097119918e-06, + "loss": 0.0001, + "step": 27355 + }, + { + "epoch": 11.124847498983327, + "grad_norm": 0.006305913992028181, + "learning_rate": 1.071586053070317e-06, + "loss": 0.0, + "step": 27356 + }, + { + "epoch": 11.125254168361122, + "grad_norm": 0.011464808757736698, + "learning_rate": 1.071129788083306e-06, + "loss": 0.0001, + "step": 27357 + }, + { + "epoch": 11.125660837738918, + "grad_norm": 0.017213904685719858, + "learning_rate": 1.0706736147556385e-06, + "loss": 0.0001, + "step": 27358 + }, + { + "epoch": 11.126067507116714, + "grad_norm": 0.13643816555869104, + "learning_rate": 1.070217533092003e-06, + "loss": 0.0014, + "step": 27359 + }, + { + "epoch": 11.12647417649451, + "grad_norm": 0.0015697488924802296, + "learning_rate": 1.0697615430970777e-06, + "loss": 0.0, + "step": 27360 + }, + { + "epoch": 11.126880845872305, + "grad_norm": 0.0012970281746155325, + "learning_rate": 1.0693056447755457e-06, + "loss": 0.0, + "step": 27361 + }, + { + "epoch": 11.127287515250101, + "grad_norm": 0.004501066462859005, + "learning_rate": 1.0688498381320855e-06, + "loss": 0.0001, + "step": 27362 + }, + { + "epoch": 11.127694184627897, + "grad_norm": 0.0044327377399907076, + "learning_rate": 1.0683941231713768e-06, + "loss": 0.0, + "step": 27363 + }, + { + "epoch": 11.128100854005693, + "grad_norm": 0.022856824943125888, + "learning_rate": 1.0679384998980957e-06, + "loss": 0.0001, + "step": 27364 + }, + { + "epoch": 11.128507523383488, + "grad_norm": 0.008516465657231824, + "learning_rate": 1.067482968316923e-06, + "loss": 0.0001, + "step": 27365 + }, + { + "epoch": 11.128914192761284, + "grad_norm": 0.11023130971813722, + "learning_rate": 1.0670275284325338e-06, + "loss": 0.0014, + "step": 27366 + }, + { + "epoch": 11.129320862139082, + "grad_norm": 0.020182789449588436, + "learning_rate": 1.0665721802496022e-06, + "loss": 0.0001, + "step": 27367 + }, + { + "epoch": 11.129727531516878, + "grad_norm": 0.17972932172071693, + "learning_rate": 1.0661169237728031e-06, + "loss": 0.0022, + "step": 27368 + }, + { + "epoch": 11.130134200894673, + "grad_norm": 0.006950826997859644, + "learning_rate": 1.0656617590068109e-06, + "loss": 0.0, + "step": 27369 + }, + { + "epoch": 11.130540870272469, + "grad_norm": 0.011645040621266901, + "learning_rate": 1.065206685956297e-06, + "loss": 0.0001, + "step": 27370 + }, + { + "epoch": 11.130947539650265, + "grad_norm": 0.08334679999032456, + "learning_rate": 1.0647517046259314e-06, + "loss": 0.0007, + "step": 27371 + }, + { + "epoch": 11.13135420902806, + "grad_norm": 0.0003219918832832518, + "learning_rate": 1.0642968150203893e-06, + "loss": 0.0, + "step": 27372 + }, + { + "epoch": 11.131760878405856, + "grad_norm": 0.0010637009623178074, + "learning_rate": 1.0638420171443365e-06, + "loss": 0.0, + "step": 27373 + }, + { + "epoch": 11.132167547783652, + "grad_norm": 0.0732714170705693, + "learning_rate": 1.063387311002443e-06, + "loss": 0.0009, + "step": 27374 + }, + { + "epoch": 11.132574217161448, + "grad_norm": 0.03497770399962509, + "learning_rate": 1.0629326965993769e-06, + "loss": 0.0004, + "step": 27375 + }, + { + "epoch": 11.132980886539244, + "grad_norm": 0.031657076897943906, + "learning_rate": 1.0624781739398037e-06, + "loss": 0.0003, + "step": 27376 + }, + { + "epoch": 11.13338755591704, + "grad_norm": 0.0016129072059398034, + 
"learning_rate": 1.0620237430283909e-06, + "loss": 0.0, + "step": 27377 + }, + { + "epoch": 11.133794225294835, + "grad_norm": 0.007459700099954395, + "learning_rate": 1.061569403869801e-06, + "loss": 0.0001, + "step": 27378 + }, + { + "epoch": 11.134200894672631, + "grad_norm": 0.015451304776814995, + "learning_rate": 1.0611151564687017e-06, + "loss": 0.0001, + "step": 27379 + }, + { + "epoch": 11.134607564050427, + "grad_norm": 0.04019584368548335, + "learning_rate": 1.0606610008297535e-06, + "loss": 0.0004, + "step": 27380 + }, + { + "epoch": 11.135014233428222, + "grad_norm": 0.00803168106685867, + "learning_rate": 1.0602069369576197e-06, + "loss": 0.0001, + "step": 27381 + }, + { + "epoch": 11.135420902806018, + "grad_norm": 0.009830361630492859, + "learning_rate": 1.0597529648569615e-06, + "loss": 0.0, + "step": 27382 + }, + { + "epoch": 11.135827572183814, + "grad_norm": 0.00026186951701398503, + "learning_rate": 1.059299084532438e-06, + "loss": 0.0, + "step": 27383 + }, + { + "epoch": 11.13623424156161, + "grad_norm": 0.00016244839657719777, + "learning_rate": 1.0588452959887075e-06, + "loss": 0.0, + "step": 27384 + }, + { + "epoch": 11.136640910939406, + "grad_norm": 0.012065519960939753, + "learning_rate": 1.0583915992304317e-06, + "loss": 0.0001, + "step": 27385 + }, + { + "epoch": 11.137047580317201, + "grad_norm": 0.0005944929347257821, + "learning_rate": 1.0579379942622658e-06, + "loss": 0.0, + "step": 27386 + }, + { + "epoch": 11.137454249694997, + "grad_norm": 0.05146308192932995, + "learning_rate": 1.0574844810888673e-06, + "loss": 0.0004, + "step": 27387 + }, + { + "epoch": 11.137860919072795, + "grad_norm": 0.18801228156014535, + "learning_rate": 1.0570310597148915e-06, + "loss": 0.001, + "step": 27388 + }, + { + "epoch": 11.13826758845059, + "grad_norm": 0.029163847474655183, + "learning_rate": 1.056577730144993e-06, + "loss": 0.0004, + "step": 27389 + }, + { + "epoch": 11.138674257828386, + "grad_norm": 0.20653136205738157, + "learning_rate": 1.0561244923838254e-06, + "loss": 0.0025, + "step": 27390 + }, + { + "epoch": 11.139080927206182, + "grad_norm": 0.07653996224578496, + "learning_rate": 1.0556713464360392e-06, + "loss": 0.0005, + "step": 27391 + }, + { + "epoch": 11.139487596583978, + "grad_norm": 0.006766027468051763, + "learning_rate": 1.0552182923062892e-06, + "loss": 0.0001, + "step": 27392 + }, + { + "epoch": 11.139894265961773, + "grad_norm": 0.018303509281267626, + "learning_rate": 1.0547653299992255e-06, + "loss": 0.0002, + "step": 27393 + }, + { + "epoch": 11.14030093533957, + "grad_norm": 0.011299974405469681, + "learning_rate": 1.0543124595194986e-06, + "loss": 0.0001, + "step": 27394 + }, + { + "epoch": 11.140707604717365, + "grad_norm": 0.00575905424623741, + "learning_rate": 1.053859680871756e-06, + "loss": 0.0001, + "step": 27395 + }, + { + "epoch": 11.14111427409516, + "grad_norm": 0.0129169641851923, + "learning_rate": 1.0534069940606462e-06, + "loss": 0.0001, + "step": 27396 + }, + { + "epoch": 11.141520943472957, + "grad_norm": 0.013652048082064392, + "learning_rate": 1.0529543990908142e-06, + "loss": 0.0001, + "step": 27397 + }, + { + "epoch": 11.141927612850752, + "grad_norm": 0.02589567426011413, + "learning_rate": 1.0525018959669108e-06, + "loss": 0.0003, + "step": 27398 + }, + { + "epoch": 11.142334282228548, + "grad_norm": 0.0032276641202290423, + "learning_rate": 1.052049484693578e-06, + "loss": 0.0, + "step": 27399 + }, + { + "epoch": 11.142740951606344, + "grad_norm": 0.027534757113148782, + "learning_rate": 1.0515971652754608e-06, + 
"loss": 0.0002, + "step": 27400 + }, + { + "epoch": 11.14314762098414, + "grad_norm": 0.00026001374342481993, + "learning_rate": 1.0511449377172034e-06, + "loss": 0.0, + "step": 27401 + }, + { + "epoch": 11.143554290361935, + "grad_norm": 3.0974839296375704e-05, + "learning_rate": 1.0506928020234464e-06, + "loss": 0.0, + "step": 27402 + }, + { + "epoch": 11.143960959739731, + "grad_norm": 0.007097569374678632, + "learning_rate": 1.0502407581988316e-06, + "loss": 0.0, + "step": 27403 + }, + { + "epoch": 11.144367629117527, + "grad_norm": 0.001955997084587165, + "learning_rate": 1.0497888062479976e-06, + "loss": 0.0, + "step": 27404 + }, + { + "epoch": 11.144774298495323, + "grad_norm": 0.002246271122744664, + "learning_rate": 1.0493369461755887e-06, + "loss": 0.0, + "step": 27405 + }, + { + "epoch": 11.145180967873118, + "grad_norm": 0.021117911273527194, + "learning_rate": 1.0488851779862396e-06, + "loss": 0.0003, + "step": 27406 + }, + { + "epoch": 11.145587637250914, + "grad_norm": 0.01709048723604984, + "learning_rate": 1.0484335016845903e-06, + "loss": 0.0001, + "step": 27407 + }, + { + "epoch": 11.145994306628712, + "grad_norm": 0.022211130736857196, + "learning_rate": 1.0479819172752759e-06, + "loss": 0.0001, + "step": 27408 + }, + { + "epoch": 11.146400976006507, + "grad_norm": 0.07880686318484012, + "learning_rate": 1.0475304247629325e-06, + "loss": 0.0007, + "step": 27409 + }, + { + "epoch": 11.146807645384303, + "grad_norm": 0.021784811847644477, + "learning_rate": 1.0470790241521932e-06, + "loss": 0.0002, + "step": 27410 + }, + { + "epoch": 11.147214314762099, + "grad_norm": 0.01776200048700535, + "learning_rate": 1.0466277154476956e-06, + "loss": 0.0002, + "step": 27411 + }, + { + "epoch": 11.147620984139895, + "grad_norm": 0.08127843642735764, + "learning_rate": 1.04617649865407e-06, + "loss": 0.0011, + "step": 27412 + }, + { + "epoch": 11.14802765351769, + "grad_norm": 0.009066127378371409, + "learning_rate": 1.0457253737759498e-06, + "loss": 0.0001, + "step": 27413 + }, + { + "epoch": 11.148434322895486, + "grad_norm": 0.00024455326661949477, + "learning_rate": 1.0452743408179656e-06, + "loss": 0.0, + "step": 27414 + }, + { + "epoch": 11.148840992273282, + "grad_norm": 0.008392746240273528, + "learning_rate": 1.0448233997847468e-06, + "loss": 0.0001, + "step": 27415 + }, + { + "epoch": 11.149247661651078, + "grad_norm": 0.016056583254258253, + "learning_rate": 1.0443725506809222e-06, + "loss": 0.0001, + "step": 27416 + }, + { + "epoch": 11.149654331028874, + "grad_norm": 0.0022751780251091, + "learning_rate": 1.043921793511119e-06, + "loss": 0.0, + "step": 27417 + }, + { + "epoch": 11.15006100040667, + "grad_norm": 0.03536886938832242, + "learning_rate": 1.043471128279968e-06, + "loss": 0.0004, + "step": 27418 + }, + { + "epoch": 11.150467669784465, + "grad_norm": 0.0061242611982407795, + "learning_rate": 1.0430205549920935e-06, + "loss": 0.0, + "step": 27419 + }, + { + "epoch": 11.150874339162261, + "grad_norm": 0.43843361525017266, + "learning_rate": 1.0425700736521215e-06, + "loss": 0.0037, + "step": 27420 + }, + { + "epoch": 11.151281008540057, + "grad_norm": 0.0038980507609426506, + "learning_rate": 1.0421196842646753e-06, + "loss": 0.0, + "step": 27421 + }, + { + "epoch": 11.151687677917852, + "grad_norm": 0.010768482873171002, + "learning_rate": 1.0416693868343796e-06, + "loss": 0.0001, + "step": 27422 + }, + { + "epoch": 11.152094347295648, + "grad_norm": 0.014682462413463158, + "learning_rate": 1.0412191813658533e-06, + "loss": 0.0001, + "step": 27423 + }, + { + 
"epoch": 11.152501016673444, + "grad_norm": 0.001459795899959584, + "learning_rate": 1.040769067863724e-06, + "loss": 0.0, + "step": 27424 + }, + { + "epoch": 11.15290768605124, + "grad_norm": 0.05264581640199077, + "learning_rate": 1.0403190463326085e-06, + "loss": 0.0006, + "step": 27425 + }, + { + "epoch": 11.153314355429035, + "grad_norm": 0.011387343761668587, + "learning_rate": 1.039869116777128e-06, + "loss": 0.0001, + "step": 27426 + }, + { + "epoch": 11.153721024806831, + "grad_norm": 0.014329301766278382, + "learning_rate": 1.0394192792018997e-06, + "loss": 0.0001, + "step": 27427 + }, + { + "epoch": 11.154127694184627, + "grad_norm": 0.22784556953824026, + "learning_rate": 1.0389695336115434e-06, + "loss": 0.002, + "step": 27428 + }, + { + "epoch": 11.154534363562425, + "grad_norm": 0.015341228634166468, + "learning_rate": 1.0385198800106743e-06, + "loss": 0.0001, + "step": 27429 + }, + { + "epoch": 11.15494103294022, + "grad_norm": 0.25523280849221813, + "learning_rate": 1.0380703184039066e-06, + "loss": 0.0021, + "step": 27430 + }, + { + "epoch": 11.155347702318016, + "grad_norm": 0.12710913207438604, + "learning_rate": 1.0376208487958616e-06, + "loss": 0.0019, + "step": 27431 + }, + { + "epoch": 11.155754371695812, + "grad_norm": 0.38412879676501166, + "learning_rate": 1.0371714711911473e-06, + "loss": 0.0036, + "step": 27432 + }, + { + "epoch": 11.156161041073608, + "grad_norm": 6.148825879485944e-05, + "learning_rate": 1.0367221855943788e-06, + "loss": 0.0, + "step": 27433 + }, + { + "epoch": 11.156567710451403, + "grad_norm": 0.12389614924885267, + "learning_rate": 1.0362729920101678e-06, + "loss": 0.0009, + "step": 27434 + }, + { + "epoch": 11.1569743798292, + "grad_norm": 0.011531731086873893, + "learning_rate": 1.035823890443125e-06, + "loss": 0.0001, + "step": 27435 + }, + { + "epoch": 11.157381049206995, + "grad_norm": 0.016577754199457173, + "learning_rate": 1.0353748808978626e-06, + "loss": 0.0002, + "step": 27436 + }, + { + "epoch": 11.15778771858479, + "grad_norm": 0.016382991495934474, + "learning_rate": 1.0349259633789887e-06, + "loss": 0.0002, + "step": 27437 + }, + { + "epoch": 11.158194387962586, + "grad_norm": 0.008337662730001685, + "learning_rate": 1.0344771378911123e-06, + "loss": 0.0001, + "step": 27438 + }, + { + "epoch": 11.158601057340382, + "grad_norm": 0.0004138686274578556, + "learning_rate": 1.0340284044388404e-06, + "loss": 0.0, + "step": 27439 + }, + { + "epoch": 11.159007726718178, + "grad_norm": 0.10926914870142278, + "learning_rate": 1.0335797630267797e-06, + "loss": 0.001, + "step": 27440 + }, + { + "epoch": 11.159414396095974, + "grad_norm": 8.279054725942334e-05, + "learning_rate": 1.0331312136595328e-06, + "loss": 0.0, + "step": 27441 + }, + { + "epoch": 11.15982106547377, + "grad_norm": 0.004638780590366682, + "learning_rate": 1.0326827563417085e-06, + "loss": 0.0, + "step": 27442 + }, + { + "epoch": 11.160227734851565, + "grad_norm": 0.09879696726690118, + "learning_rate": 1.0322343910779098e-06, + "loss": 0.001, + "step": 27443 + }, + { + "epoch": 11.160634404229361, + "grad_norm": 0.012091880802685995, + "learning_rate": 1.0317861178727384e-06, + "loss": 0.0001, + "step": 27444 + }, + { + "epoch": 11.161041073607157, + "grad_norm": 0.004168994684904673, + "learning_rate": 1.0313379367307952e-06, + "loss": 0.0, + "step": 27445 + }, + { + "epoch": 11.161447742984953, + "grad_norm": 0.2906835596979562, + "learning_rate": 1.0308898476566831e-06, + "loss": 0.003, + "step": 27446 + }, + { + "epoch": 11.161854412362748, + "grad_norm": 
0.0018377670306012816, + "learning_rate": 1.0304418506549997e-06, + "loss": 0.0, + "step": 27447 + }, + { + "epoch": 11.162261081740544, + "grad_norm": 2.3895919872846196e-05, + "learning_rate": 1.0299939457303432e-06, + "loss": 0.0, + "step": 27448 + }, + { + "epoch": 11.162667751118342, + "grad_norm": 0.02677094156871686, + "learning_rate": 1.0295461328873158e-06, + "loss": 0.0002, + "step": 27449 + }, + { + "epoch": 11.163074420496137, + "grad_norm": 0.0014358812478306282, + "learning_rate": 1.0290984121305115e-06, + "loss": 0.0, + "step": 27450 + }, + { + "epoch": 11.163481089873933, + "grad_norm": 0.014456317711410918, + "learning_rate": 1.0286507834645276e-06, + "loss": 0.0002, + "step": 27451 + }, + { + "epoch": 11.163887759251729, + "grad_norm": 0.00011389688178508887, + "learning_rate": 1.0282032468939584e-06, + "loss": 0.0, + "step": 27452 + }, + { + "epoch": 11.164294428629525, + "grad_norm": 0.0017482108356064417, + "learning_rate": 1.027755802423398e-06, + "loss": 0.0, + "step": 27453 + }, + { + "epoch": 11.16470109800732, + "grad_norm": 0.8787937506725351, + "learning_rate": 1.0273084500574382e-06, + "loss": 0.0086, + "step": 27454 + }, + { + "epoch": 11.165107767385116, + "grad_norm": 0.10477309035798149, + "learning_rate": 1.0268611898006753e-06, + "loss": 0.0008, + "step": 27455 + }, + { + "epoch": 11.165514436762912, + "grad_norm": 0.006640089560904906, + "learning_rate": 1.026414021657698e-06, + "loss": 0.0001, + "step": 27456 + }, + { + "epoch": 11.165921106140708, + "grad_norm": 0.0003653906963634464, + "learning_rate": 1.0259669456330968e-06, + "loss": 0.0, + "step": 27457 + }, + { + "epoch": 11.166327775518504, + "grad_norm": 0.0011190810271246604, + "learning_rate": 1.0255199617314626e-06, + "loss": 0.0, + "step": 27458 + }, + { + "epoch": 11.1667344448963, + "grad_norm": 0.28459988671127034, + "learning_rate": 1.0250730699573818e-06, + "loss": 0.0044, + "step": 27459 + }, + { + "epoch": 11.167141114274095, + "grad_norm": 0.007577874121994764, + "learning_rate": 1.024626270315443e-06, + "loss": 0.0001, + "step": 27460 + }, + { + "epoch": 11.16754778365189, + "grad_norm": 0.007112724576706624, + "learning_rate": 1.0241795628102303e-06, + "loss": 0.0001, + "step": 27461 + }, + { + "epoch": 11.167954453029687, + "grad_norm": 0.1122207069827987, + "learning_rate": 1.0237329474463343e-06, + "loss": 0.001, + "step": 27462 + }, + { + "epoch": 11.168361122407482, + "grad_norm": 0.017166493094572514, + "learning_rate": 1.023286424228338e-06, + "loss": 0.0001, + "step": 27463 + }, + { + "epoch": 11.168767791785278, + "grad_norm": 0.01279285513178489, + "learning_rate": 1.0228399931608235e-06, + "loss": 0.0001, + "step": 27464 + }, + { + "epoch": 11.169174461163074, + "grad_norm": 0.002431952054215712, + "learning_rate": 1.0223936542483748e-06, + "loss": 0.0, + "step": 27465 + }, + { + "epoch": 11.16958113054087, + "grad_norm": 0.02669799204121445, + "learning_rate": 1.0219474074955738e-06, + "loss": 0.0003, + "step": 27466 + }, + { + "epoch": 11.169987799918665, + "grad_norm": 0.01963794884176635, + "learning_rate": 1.021501252906999e-06, + "loss": 0.0002, + "step": 27467 + }, + { + "epoch": 11.170394469296461, + "grad_norm": 0.016471275677444462, + "learning_rate": 1.0210551904872345e-06, + "loss": 0.0002, + "step": 27468 + }, + { + "epoch": 11.170801138674257, + "grad_norm": 0.00019990849606525137, + "learning_rate": 1.0206092202408568e-06, + "loss": 0.0, + "step": 27469 + }, + { + "epoch": 11.171207808052054, + "grad_norm": 0.029389945143655272, + "learning_rate": 
1.0201633421724466e-06, + "loss": 0.0003, + "step": 27470 + }, + { + "epoch": 11.17161447742985, + "grad_norm": 0.00013381890010957985, + "learning_rate": 1.019717556286578e-06, + "loss": 0.0, + "step": 27471 + }, + { + "epoch": 11.172021146807646, + "grad_norm": 0.21245162936892412, + "learning_rate": 1.0192718625878285e-06, + "loss": 0.0014, + "step": 27472 + }, + { + "epoch": 11.172427816185442, + "grad_norm": 0.0007626868989237252, + "learning_rate": 1.018826261080772e-06, + "loss": 0.0, + "step": 27473 + }, + { + "epoch": 11.172834485563238, + "grad_norm": 0.0036694885246915227, + "learning_rate": 1.0183807517699839e-06, + "loss": 0.0, + "step": 27474 + }, + { + "epoch": 11.173241154941033, + "grad_norm": 0.03870500257549273, + "learning_rate": 1.0179353346600384e-06, + "loss": 0.0004, + "step": 27475 + }, + { + "epoch": 11.173647824318829, + "grad_norm": 0.03800001769016394, + "learning_rate": 1.0174900097555086e-06, + "loss": 0.0002, + "step": 27476 + }, + { + "epoch": 11.174054493696625, + "grad_norm": 0.03881431738566646, + "learning_rate": 1.0170447770609638e-06, + "loss": 0.0002, + "step": 27477 + }, + { + "epoch": 11.17446116307442, + "grad_norm": 4.402244259455035e-05, + "learning_rate": 1.016599636580976e-06, + "loss": 0.0, + "step": 27478 + }, + { + "epoch": 11.174867832452216, + "grad_norm": 0.00024582142935556175, + "learning_rate": 1.016154588320113e-06, + "loss": 0.0, + "step": 27479 + }, + { + "epoch": 11.175274501830012, + "grad_norm": 0.0037177473985604294, + "learning_rate": 1.015709632282944e-06, + "loss": 0.0, + "step": 27480 + }, + { + "epoch": 11.175681171207808, + "grad_norm": 0.07932670678788269, + "learning_rate": 1.015264768474038e-06, + "loss": 0.0003, + "step": 27481 + }, + { + "epoch": 11.176087840585604, + "grad_norm": 0.39974019938100075, + "learning_rate": 1.0148199968979622e-06, + "loss": 0.0041, + "step": 27482 + }, + { + "epoch": 11.1764945099634, + "grad_norm": 0.10953807674298989, + "learning_rate": 1.0143753175592807e-06, + "loss": 0.0012, + "step": 27483 + }, + { + "epoch": 11.176901179341195, + "grad_norm": 0.000874129925026962, + "learning_rate": 1.013930730462559e-06, + "loss": 0.0, + "step": 27484 + }, + { + "epoch": 11.177307848718991, + "grad_norm": 0.8880781023699942, + "learning_rate": 1.0134862356123608e-06, + "loss": 0.0096, + "step": 27485 + }, + { + "epoch": 11.177714518096787, + "grad_norm": 0.0022517777066720715, + "learning_rate": 1.0130418330132496e-06, + "loss": 0.0, + "step": 27486 + }, + { + "epoch": 11.178121187474582, + "grad_norm": 0.24396359651407565, + "learning_rate": 1.012597522669785e-06, + "loss": 0.0022, + "step": 27487 + }, + { + "epoch": 11.178527856852378, + "grad_norm": 0.0029301993292163635, + "learning_rate": 1.0121533045865318e-06, + "loss": 0.0, + "step": 27488 + }, + { + "epoch": 11.178934526230174, + "grad_norm": 0.011497888243321064, + "learning_rate": 1.0117091787680489e-06, + "loss": 0.0001, + "step": 27489 + }, + { + "epoch": 11.179341195607972, + "grad_norm": 0.00039385777657819336, + "learning_rate": 1.0112651452188947e-06, + "loss": 0.0, + "step": 27490 + }, + { + "epoch": 11.179747864985767, + "grad_norm": 0.03457257182266311, + "learning_rate": 1.0108212039436272e-06, + "loss": 0.0005, + "step": 27491 + }, + { + "epoch": 11.180154534363563, + "grad_norm": 0.27503578569519765, + "learning_rate": 1.0103773549468044e-06, + "loss": 0.0015, + "step": 27492 + }, + { + "epoch": 11.180561203741359, + "grad_norm": 0.06266635140633439, + "learning_rate": 1.0099335982329827e-06, + "loss": 0.0004, + "step": 
27493 + }, + { + "epoch": 11.180967873119155, + "grad_norm": 0.0006797245542168359, + "learning_rate": 1.0094899338067155e-06, + "loss": 0.0, + "step": 27494 + }, + { + "epoch": 11.18137454249695, + "grad_norm": 0.00047144708573292017, + "learning_rate": 1.009046361672561e-06, + "loss": 0.0, + "step": 27495 + }, + { + "epoch": 11.181781211874746, + "grad_norm": 0.0603306708181873, + "learning_rate": 1.0086028818350702e-06, + "loss": 0.0005, + "step": 27496 + }, + { + "epoch": 11.182187881252542, + "grad_norm": 0.011260065485109813, + "learning_rate": 1.008159494298796e-06, + "loss": 0.0001, + "step": 27497 + }, + { + "epoch": 11.182594550630338, + "grad_norm": 0.1043952772795801, + "learning_rate": 1.0077161990682904e-06, + "loss": 0.0007, + "step": 27498 + }, + { + "epoch": 11.183001220008133, + "grad_norm": 0.03082962706679462, + "learning_rate": 1.0072729961481043e-06, + "loss": 0.0003, + "step": 27499 + }, + { + "epoch": 11.18340788938593, + "grad_norm": 0.09646854839427554, + "learning_rate": 1.006829885542784e-06, + "loss": 0.0007, + "step": 27500 + }, + { + "epoch": 11.183814558763725, + "grad_norm": 8.612921142761974e-05, + "learning_rate": 1.0063868672568832e-06, + "loss": 0.0, + "step": 27501 + }, + { + "epoch": 11.18422122814152, + "grad_norm": 0.020613136306370843, + "learning_rate": 1.005943941294948e-06, + "loss": 0.0001, + "step": 27502 + }, + { + "epoch": 11.184627897519317, + "grad_norm": 1.2078604695951342e-05, + "learning_rate": 1.0055011076615239e-06, + "loss": 0.0, + "step": 27503 + }, + { + "epoch": 11.185034566897112, + "grad_norm": 0.00047505250850221636, + "learning_rate": 1.005058366361159e-06, + "loss": 0.0, + "step": 27504 + }, + { + "epoch": 11.185441236274908, + "grad_norm": 0.0011329886038845577, + "learning_rate": 1.004615717398396e-06, + "loss": 0.0, + "step": 27505 + }, + { + "epoch": 11.185847905652704, + "grad_norm": 0.002597492877322196, + "learning_rate": 1.0041731607777805e-06, + "loss": 0.0, + "step": 27506 + }, + { + "epoch": 11.1862545750305, + "grad_norm": 0.006566486904719669, + "learning_rate": 1.0037306965038529e-06, + "loss": 0.0001, + "step": 27507 + }, + { + "epoch": 11.186661244408295, + "grad_norm": 0.01869445439012292, + "learning_rate": 1.0032883245811597e-06, + "loss": 0.0002, + "step": 27508 + }, + { + "epoch": 11.187067913786091, + "grad_norm": 0.08301131629988617, + "learning_rate": 1.0028460450142397e-06, + "loss": 0.0007, + "step": 27509 + }, + { + "epoch": 11.187474583163887, + "grad_norm": 0.001042683615268118, + "learning_rate": 1.0024038578076322e-06, + "loss": 0.0, + "step": 27510 + }, + { + "epoch": 11.187881252541684, + "grad_norm": 0.007868699120987085, + "learning_rate": 1.0019617629658783e-06, + "loss": 0.0001, + "step": 27511 + }, + { + "epoch": 11.18828792191948, + "grad_norm": 0.03698581804702691, + "learning_rate": 1.0015197604935157e-06, + "loss": 0.0004, + "step": 27512 + }, + { + "epoch": 11.188694591297276, + "grad_norm": 0.0628832971967127, + "learning_rate": 1.0010778503950803e-06, + "loss": 0.0004, + "step": 27513 + }, + { + "epoch": 11.189101260675072, + "grad_norm": 0.00038517536761514923, + "learning_rate": 1.0006360326751107e-06, + "loss": 0.0, + "step": 27514 + }, + { + "epoch": 11.189507930052867, + "grad_norm": 0.05603551670464483, + "learning_rate": 1.0001943073381414e-06, + "loss": 0.0003, + "step": 27515 + }, + { + "epoch": 11.189914599430663, + "grad_norm": 0.32708190594628755, + "learning_rate": 9.997526743887077e-07, + "loss": 0.0042, + "step": 27516 + }, + { + "epoch": 11.190321268808459, + 
"grad_norm": 0.03587151466991863, + "learning_rate": 9.993111338313422e-07, + "loss": 0.0002, + "step": 27517 + }, + { + "epoch": 11.190727938186255, + "grad_norm": 0.0011161129973245511, + "learning_rate": 9.988696856705781e-07, + "loss": 0.0, + "step": 27518 + }, + { + "epoch": 11.19113460756405, + "grad_norm": 0.0013567769170994629, + "learning_rate": 9.984283299109465e-07, + "loss": 0.0, + "step": 27519 + }, + { + "epoch": 11.191541276941846, + "grad_norm": 0.1529955828218272, + "learning_rate": 9.979870665569768e-07, + "loss": 0.0012, + "step": 27520 + }, + { + "epoch": 11.191947946319642, + "grad_norm": 2.7428931597256262e-05, + "learning_rate": 9.975458956132022e-07, + "loss": 0.0, + "step": 27521 + }, + { + "epoch": 11.192354615697438, + "grad_norm": 0.007000046511827768, + "learning_rate": 9.971048170841501e-07, + "loss": 0.0001, + "step": 27522 + }, + { + "epoch": 11.192761285075234, + "grad_norm": 0.001900452428319631, + "learning_rate": 9.966638309743481e-07, + "loss": 0.0, + "step": 27523 + }, + { + "epoch": 11.19316795445303, + "grad_norm": 0.0023591200987912665, + "learning_rate": 9.962229372883226e-07, + "loss": 0.0, + "step": 27524 + }, + { + "epoch": 11.193574623830825, + "grad_norm": 0.0006974347064049525, + "learning_rate": 9.957821360306008e-07, + "loss": 0.0, + "step": 27525 + }, + { + "epoch": 11.193981293208621, + "grad_norm": 0.0024642229033596673, + "learning_rate": 9.95341427205705e-07, + "loss": 0.0, + "step": 27526 + }, + { + "epoch": 11.194387962586417, + "grad_norm": 0.009437473156142774, + "learning_rate": 9.949008108181635e-07, + "loss": 0.0001, + "step": 27527 + }, + { + "epoch": 11.194794631964212, + "grad_norm": 0.0030014542058978025, + "learning_rate": 9.944602868724972e-07, + "loss": 0.0, + "step": 27528 + }, + { + "epoch": 11.195201301342008, + "grad_norm": 0.00032690988946580554, + "learning_rate": 9.940198553732284e-07, + "loss": 0.0, + "step": 27529 + }, + { + "epoch": 11.195607970719804, + "grad_norm": 0.04836451478858538, + "learning_rate": 9.935795163248786e-07, + "loss": 0.0003, + "step": 27530 + }, + { + "epoch": 11.196014640097602, + "grad_norm": 0.02532082767635332, + "learning_rate": 9.931392697319674e-07, + "loss": 0.0002, + "step": 27531 + }, + { + "epoch": 11.196421309475397, + "grad_norm": 0.04702984824382488, + "learning_rate": 9.92699115599015e-07, + "loss": 0.0004, + "step": 27532 + }, + { + "epoch": 11.196827978853193, + "grad_norm": 0.003163522084081218, + "learning_rate": 9.922590539305388e-07, + "loss": 0.0, + "step": 27533 + }, + { + "epoch": 11.197234648230989, + "grad_norm": 0.0021497023446353926, + "learning_rate": 9.918190847310582e-07, + "loss": 0.0, + "step": 27534 + }, + { + "epoch": 11.197641317608785, + "grad_norm": 0.0006143970159823133, + "learning_rate": 9.913792080050876e-07, + "loss": 0.0, + "step": 27535 + }, + { + "epoch": 11.19804798698658, + "grad_norm": 0.006153222861032089, + "learning_rate": 9.909394237571436e-07, + "loss": 0.0, + "step": 27536 + }, + { + "epoch": 11.198454656364376, + "grad_norm": 0.009393908416789724, + "learning_rate": 9.904997319917397e-07, + "loss": 0.0001, + "step": 27537 + }, + { + "epoch": 11.198861325742172, + "grad_norm": 0.010642166388630355, + "learning_rate": 9.900601327133896e-07, + "loss": 0.0001, + "step": 27538 + }, + { + "epoch": 11.199267995119968, + "grad_norm": 0.0027395606455353034, + "learning_rate": 9.896206259266083e-07, + "loss": 0.0, + "step": 27539 + }, + { + "epoch": 11.199674664497763, + "grad_norm": 0.002191737863710403, + "learning_rate": 
9.891812116359067e-07, + "loss": 0.0, + "step": 27540 + }, + { + "epoch": 11.20008133387556, + "grad_norm": 0.0003861833871527381, + "learning_rate": 9.887418898457946e-07, + "loss": 0.0, + "step": 27541 + }, + { + "epoch": 11.200488003253355, + "grad_norm": 0.0007289074074008096, + "learning_rate": 9.883026605607827e-07, + "loss": 0.0, + "step": 27542 + }, + { + "epoch": 11.20089467263115, + "grad_norm": 0.6959798615039904, + "learning_rate": 9.878635237853785e-07, + "loss": 0.0065, + "step": 27543 + }, + { + "epoch": 11.201301342008946, + "grad_norm": 2.536208272097053, + "learning_rate": 9.874244795240894e-07, + "loss": 0.0232, + "step": 27544 + }, + { + "epoch": 11.201708011386742, + "grad_norm": 0.019906323547327544, + "learning_rate": 9.869855277814266e-07, + "loss": 0.0002, + "step": 27545 + }, + { + "epoch": 11.202114680764538, + "grad_norm": 0.0012952397715387705, + "learning_rate": 9.86546668561893e-07, + "loss": 0.0, + "step": 27546 + }, + { + "epoch": 11.202521350142334, + "grad_norm": 0.00019887487667544947, + "learning_rate": 9.861079018699936e-07, + "loss": 0.0, + "step": 27547 + }, + { + "epoch": 11.20292801952013, + "grad_norm": 0.037621101967393246, + "learning_rate": 9.856692277102342e-07, + "loss": 0.0003, + "step": 27548 + }, + { + "epoch": 11.203334688897925, + "grad_norm": 0.016609969366408878, + "learning_rate": 9.852306460871164e-07, + "loss": 0.0002, + "step": 27549 + }, + { + "epoch": 11.203741358275721, + "grad_norm": 0.006866777817532773, + "learning_rate": 9.847921570051433e-07, + "loss": 0.0001, + "step": 27550 + }, + { + "epoch": 11.204148027653517, + "grad_norm": 0.004301509748209816, + "learning_rate": 9.843537604688135e-07, + "loss": 0.0, + "step": 27551 + }, + { + "epoch": 11.204554697031314, + "grad_norm": 0.0013063585328595972, + "learning_rate": 9.839154564826326e-07, + "loss": 0.0, + "step": 27552 + }, + { + "epoch": 11.20496136640911, + "grad_norm": 0.00265186774353371, + "learning_rate": 9.834772450510965e-07, + "loss": 0.0, + "step": 27553 + }, + { + "epoch": 11.205368035786906, + "grad_norm": 0.11478169253539067, + "learning_rate": 9.830391261787053e-07, + "loss": 0.0012, + "step": 27554 + }, + { + "epoch": 11.205774705164702, + "grad_norm": 0.001547529391618654, + "learning_rate": 9.826010998699553e-07, + "loss": 0.0, + "step": 27555 + }, + { + "epoch": 11.206181374542497, + "grad_norm": 0.026770551330523275, + "learning_rate": 9.821631661293429e-07, + "loss": 0.0003, + "step": 27556 + }, + { + "epoch": 11.206588043920293, + "grad_norm": 0.023284735779927367, + "learning_rate": 9.817253249613623e-07, + "loss": 0.0002, + "step": 27557 + }, + { + "epoch": 11.206994713298089, + "grad_norm": 0.011341697771310326, + "learning_rate": 9.812875763705122e-07, + "loss": 0.0001, + "step": 27558 + }, + { + "epoch": 11.207401382675885, + "grad_norm": 0.11134613502771154, + "learning_rate": 9.808499203612841e-07, + "loss": 0.0009, + "step": 27559 + }, + { + "epoch": 11.20780805205368, + "grad_norm": 0.007303374657889679, + "learning_rate": 9.804123569381706e-07, + "loss": 0.0001, + "step": 27560 + }, + { + "epoch": 11.208214721431476, + "grad_norm": 0.003079976371440506, + "learning_rate": 9.799748861056645e-07, + "loss": 0.0, + "step": 27561 + }, + { + "epoch": 11.208621390809272, + "grad_norm": 0.04828964583403357, + "learning_rate": 9.795375078682557e-07, + "loss": 0.0003, + "step": 27562 + }, + { + "epoch": 11.209028060187068, + "grad_norm": 0.003954550834532728, + "learning_rate": 9.791002222304334e-07, + "loss": 0.0, + "step": 27563 + }, + { + "epoch": 
11.209434729564864, + "grad_norm": 0.029373424127557096, + "learning_rate": 9.786630291966858e-07, + "loss": 0.0001, + "step": 27564 + }, + { + "epoch": 11.20984139894266, + "grad_norm": 0.04472450549929498, + "learning_rate": 9.782259287715045e-07, + "loss": 0.0003, + "step": 27565 + }, + { + "epoch": 11.210248068320455, + "grad_norm": 0.08873943274792874, + "learning_rate": 9.77788920959375e-07, + "loss": 0.0008, + "step": 27566 + }, + { + "epoch": 11.21065473769825, + "grad_norm": 0.010283089942608606, + "learning_rate": 9.773520057647833e-07, + "loss": 0.0001, + "step": 27567 + }, + { + "epoch": 11.211061407076047, + "grad_norm": 0.002082095603090146, + "learning_rate": 9.76915183192213e-07, + "loss": 0.0, + "step": 27568 + }, + { + "epoch": 11.211468076453842, + "grad_norm": 0.025843305446115454, + "learning_rate": 9.764784532461513e-07, + "loss": 0.0002, + "step": 27569 + }, + { + "epoch": 11.211874745831638, + "grad_norm": 0.04239673564154113, + "learning_rate": 9.760418159310768e-07, + "loss": 0.0003, + "step": 27570 + }, + { + "epoch": 11.212281415209434, + "grad_norm": 0.002517585903387229, + "learning_rate": 9.756052712514762e-07, + "loss": 0.0, + "step": 27571 + }, + { + "epoch": 11.212688084587231, + "grad_norm": 0.00493281051762317, + "learning_rate": 9.751688192118303e-07, + "loss": 0.0, + "step": 27572 + }, + { + "epoch": 11.213094753965027, + "grad_norm": 0.001053141376188532, + "learning_rate": 9.747324598166175e-07, + "loss": 0.0, + "step": 27573 + }, + { + "epoch": 11.213501423342823, + "grad_norm": 0.006805473765575489, + "learning_rate": 9.742961930703188e-07, + "loss": 0.0001, + "step": 27574 + }, + { + "epoch": 11.213908092720619, + "grad_norm": 0.0357064453187743, + "learning_rate": 9.738600189774128e-07, + "loss": 0.0003, + "step": 27575 + }, + { + "epoch": 11.214314762098414, + "grad_norm": 0.014550468265877239, + "learning_rate": 9.73423937542376e-07, + "loss": 0.0002, + "step": 27576 + }, + { + "epoch": 11.21472143147621, + "grad_norm": 0.008955354881639637, + "learning_rate": 9.729879487696835e-07, + "loss": 0.0001, + "step": 27577 + }, + { + "epoch": 11.215128100854006, + "grad_norm": 0.01849027310143038, + "learning_rate": 9.725520526638154e-07, + "loss": 0.0001, + "step": 27578 + }, + { + "epoch": 11.215534770231802, + "grad_norm": 0.00038900934570531925, + "learning_rate": 9.721162492292435e-07, + "loss": 0.0, + "step": 27579 + }, + { + "epoch": 11.215941439609598, + "grad_norm": 0.012870457837283357, + "learning_rate": 9.71680538470442e-07, + "loss": 0.0001, + "step": 27580 + }, + { + "epoch": 11.216348108987393, + "grad_norm": 0.001012092947725539, + "learning_rate": 9.712449203918828e-07, + "loss": 0.0, + "step": 27581 + }, + { + "epoch": 11.216754778365189, + "grad_norm": 0.4118810925355509, + "learning_rate": 9.70809394998039e-07, + "loss": 0.0054, + "step": 27582 + }, + { + "epoch": 11.217161447742985, + "grad_norm": 0.015804808334684242, + "learning_rate": 9.703739622933794e-07, + "loss": 0.0001, + "step": 27583 + }, + { + "epoch": 11.21756811712078, + "grad_norm": 0.00710537218033626, + "learning_rate": 9.69938622282378e-07, + "loss": 0.0, + "step": 27584 + }, + { + "epoch": 11.217974786498576, + "grad_norm": 0.004827628329509339, + "learning_rate": 9.695033749695005e-07, + "loss": 0.0, + "step": 27585 + }, + { + "epoch": 11.218381455876372, + "grad_norm": 0.026972946898496556, + "learning_rate": 9.690682203592162e-07, + "loss": 0.0002, + "step": 27586 + }, + { + "epoch": 11.218788125254168, + "grad_norm": 0.015278333335483986, + "learning_rate": 
9.686331584559927e-07, + "loss": 0.0001, + "step": 27587 + }, + { + "epoch": 11.219194794631964, + "grad_norm": 0.024998552561218024, + "learning_rate": 9.681981892642933e-07, + "loss": 0.0001, + "step": 27588 + }, + { + "epoch": 11.21960146400976, + "grad_norm": 0.05340796876502225, + "learning_rate": 9.677633127885866e-07, + "loss": 0.0004, + "step": 27589 + }, + { + "epoch": 11.220008133387555, + "grad_norm": 0.0007880057934946875, + "learning_rate": 9.673285290333334e-07, + "loss": 0.0, + "step": 27590 + }, + { + "epoch": 11.220414802765351, + "grad_norm": 0.3208561982479698, + "learning_rate": 9.66893838003e-07, + "loss": 0.0027, + "step": 27591 + }, + { + "epoch": 11.220821472143147, + "grad_norm": 0.019702260387406363, + "learning_rate": 9.664592397020478e-07, + "loss": 0.0002, + "step": 27592 + }, + { + "epoch": 11.221228141520944, + "grad_norm": 0.002268508400263465, + "learning_rate": 9.660247341349382e-07, + "loss": 0.0, + "step": 27593 + }, + { + "epoch": 11.22163481089874, + "grad_norm": 0.07564891512307091, + "learning_rate": 9.655903213061313e-07, + "loss": 0.001, + "step": 27594 + }, + { + "epoch": 11.222041480276536, + "grad_norm": 0.0003098446032896803, + "learning_rate": 9.651560012200867e-07, + "loss": 0.0, + "step": 27595 + }, + { + "epoch": 11.222448149654332, + "grad_norm": 0.11940371483288459, + "learning_rate": 9.647217738812632e-07, + "loss": 0.0007, + "step": 27596 + }, + { + "epoch": 11.222854819032127, + "grad_norm": 0.11300244519277124, + "learning_rate": 9.642876392941159e-07, + "loss": 0.0014, + "step": 27597 + }, + { + "epoch": 11.223261488409923, + "grad_norm": 0.012933107445196556, + "learning_rate": 9.638535974631048e-07, + "loss": 0.0001, + "step": 27598 + }, + { + "epoch": 11.223668157787719, + "grad_norm": 0.02044593845788759, + "learning_rate": 9.63419648392685e-07, + "loss": 0.0002, + "step": 27599 + }, + { + "epoch": 11.224074827165515, + "grad_norm": 0.10818527885658981, + "learning_rate": 9.62985792087311e-07, + "loss": 0.0012, + "step": 27600 + }, + { + "epoch": 11.22448149654331, + "grad_norm": 0.000838352983902909, + "learning_rate": 9.625520285514355e-07, + "loss": 0.0, + "step": 27601 + }, + { + "epoch": 11.224888165921106, + "grad_norm": 0.0021509032041029277, + "learning_rate": 9.62118357789512e-07, + "loss": 0.0, + "step": 27602 + }, + { + "epoch": 11.225294835298902, + "grad_norm": 0.0204984899729076, + "learning_rate": 9.616847798059904e-07, + "loss": 0.0002, + "step": 27603 + }, + { + "epoch": 11.225701504676698, + "grad_norm": 0.0010404312069546824, + "learning_rate": 9.612512946053242e-07, + "loss": 0.0, + "step": 27604 + }, + { + "epoch": 11.226108174054493, + "grad_norm": 0.005213135317966502, + "learning_rate": 9.608179021919638e-07, + "loss": 0.0, + "step": 27605 + }, + { + "epoch": 11.22651484343229, + "grad_norm": 0.002188246393023506, + "learning_rate": 9.603846025703566e-07, + "loss": 0.0, + "step": 27606 + }, + { + "epoch": 11.226921512810085, + "grad_norm": 0.00021081301825235535, + "learning_rate": 9.599513957449514e-07, + "loss": 0.0, + "step": 27607 + }, + { + "epoch": 11.22732818218788, + "grad_norm": 0.4286003273556541, + "learning_rate": 9.595182817201942e-07, + "loss": 0.004, + "step": 27608 + }, + { + "epoch": 11.227734851565677, + "grad_norm": 0.01639953993184561, + "learning_rate": 9.59085260500533e-07, + "loss": 0.0002, + "step": 27609 + }, + { + "epoch": 11.228141520943472, + "grad_norm": 0.0024677049643164997, + "learning_rate": 9.586523320904084e-07, + "loss": 0.0, + "step": 27610 + }, + { + "epoch": 
11.228548190321268, + "grad_norm": 0.11828454950739295, + "learning_rate": 9.582194964942715e-07, + "loss": 0.0013, + "step": 27611 + }, + { + "epoch": 11.228954859699064, + "grad_norm": 0.0035594350333580365, + "learning_rate": 9.577867537165607e-07, + "loss": 0.0, + "step": 27612 + }, + { + "epoch": 11.229361529076861, + "grad_norm": 0.014401186900547635, + "learning_rate": 9.573541037617206e-07, + "loss": 0.0001, + "step": 27613 + }, + { + "epoch": 11.229768198454657, + "grad_norm": 0.00016291845874393538, + "learning_rate": 9.569215466341908e-07, + "loss": 0.0, + "step": 27614 + }, + { + "epoch": 11.230174867832453, + "grad_norm": 0.008130982511956122, + "learning_rate": 9.564890823384132e-07, + "loss": 0.0001, + "step": 27615 + }, + { + "epoch": 11.230581537210249, + "grad_norm": 0.005211610809782975, + "learning_rate": 9.560567108788254e-07, + "loss": 0.0001, + "step": 27616 + }, + { + "epoch": 11.230988206588044, + "grad_norm": 8.39100842396862e-05, + "learning_rate": 9.556244322598683e-07, + "loss": 0.0, + "step": 27617 + }, + { + "epoch": 11.23139487596584, + "grad_norm": 0.171976668915814, + "learning_rate": 9.551922464859786e-07, + "loss": 0.0008, + "step": 27618 + }, + { + "epoch": 11.231801545343636, + "grad_norm": 0.018419231003893147, + "learning_rate": 9.547601535615925e-07, + "loss": 0.0002, + "step": 27619 + }, + { + "epoch": 11.232208214721432, + "grad_norm": 0.0015022046076885061, + "learning_rate": 9.543281534911454e-07, + "loss": 0.0, + "step": 27620 + }, + { + "epoch": 11.232614884099227, + "grad_norm": 0.012958520437236463, + "learning_rate": 9.538962462790724e-07, + "loss": 0.0001, + "step": 27621 + }, + { + "epoch": 11.233021553477023, + "grad_norm": 0.07187825589855945, + "learning_rate": 9.534644319298081e-07, + "loss": 0.0004, + "step": 27622 + }, + { + "epoch": 11.233428222854819, + "grad_norm": 0.0026365403536598208, + "learning_rate": 9.530327104477822e-07, + "loss": 0.0, + "step": 27623 + }, + { + "epoch": 11.233834892232615, + "grad_norm": 0.018604854922174237, + "learning_rate": 9.52601081837431e-07, + "loss": 0.0001, + "step": 27624 + }, + { + "epoch": 11.23424156161041, + "grad_norm": 0.0004398008835642889, + "learning_rate": 9.521695461031821e-07, + "loss": 0.0, + "step": 27625 + }, + { + "epoch": 11.234648230988206, + "grad_norm": 0.018042985253816944, + "learning_rate": 9.517381032494677e-07, + "loss": 0.0002, + "step": 27626 + }, + { + "epoch": 11.235054900366002, + "grad_norm": 0.011450291273793753, + "learning_rate": 9.513067532807141e-07, + "loss": 0.0001, + "step": 27627 + }, + { + "epoch": 11.235461569743798, + "grad_norm": 0.0005252689715413325, + "learning_rate": 9.508754962013522e-07, + "loss": 0.0, + "step": 27628 + }, + { + "epoch": 11.235868239121594, + "grad_norm": 0.010918432131372276, + "learning_rate": 9.504443320158052e-07, + "loss": 0.0001, + "step": 27629 + }, + { + "epoch": 11.23627490849939, + "grad_norm": 0.004460656900861793, + "learning_rate": 9.500132607285029e-07, + "loss": 0.0, + "step": 27630 + }, + { + "epoch": 11.236681577877185, + "grad_norm": 0.00015475546452021054, + "learning_rate": 9.495822823438727e-07, + "loss": 0.0, + "step": 27631 + }, + { + "epoch": 11.237088247254981, + "grad_norm": 0.0011054782597516443, + "learning_rate": 9.491513968663324e-07, + "loss": 0.0, + "step": 27632 + }, + { + "epoch": 11.237494916632777, + "grad_norm": 0.008092240528706982, + "learning_rate": 9.487206043003083e-07, + "loss": 0.0001, + "step": 27633 + }, + { + "epoch": 11.237901586010574, + "grad_norm": 0.07926660943295998, + 
"learning_rate": 9.482899046502203e-07, + "loss": 0.0006, + "step": 27634 + }, + { + "epoch": 11.23830825538837, + "grad_norm": 0.09775424554245976, + "learning_rate": 9.478592979204949e-07, + "loss": 0.0009, + "step": 27635 + }, + { + "epoch": 11.238714924766166, + "grad_norm": 0.005778432372418171, + "learning_rate": 9.474287841155494e-07, + "loss": 0.0001, + "step": 27636 + }, + { + "epoch": 11.239121594143962, + "grad_norm": 0.0003891942181959258, + "learning_rate": 9.469983632398027e-07, + "loss": 0.0, + "step": 27637 + }, + { + "epoch": 11.239528263521757, + "grad_norm": 0.0015714040339130499, + "learning_rate": 9.465680352976748e-07, + "loss": 0.0, + "step": 27638 + }, + { + "epoch": 11.239934932899553, + "grad_norm": 0.05180732004722302, + "learning_rate": 9.461378002935828e-07, + "loss": 0.0005, + "step": 27639 + }, + { + "epoch": 11.240341602277349, + "grad_norm": 0.001991972525704194, + "learning_rate": 9.457076582319424e-07, + "loss": 0.0, + "step": 27640 + }, + { + "epoch": 11.240748271655145, + "grad_norm": 0.009429739965182198, + "learning_rate": 9.452776091171678e-07, + "loss": 0.0001, + "step": 27641 + }, + { + "epoch": 11.24115494103294, + "grad_norm": 0.029381355872469747, + "learning_rate": 9.448476529536787e-07, + "loss": 0.0002, + "step": 27642 + }, + { + "epoch": 11.241561610410736, + "grad_norm": 0.0018713472932584204, + "learning_rate": 9.444177897458851e-07, + "loss": 0.0, + "step": 27643 + }, + { + "epoch": 11.241968279788532, + "grad_norm": 0.0007748620837478338, + "learning_rate": 9.439880194981998e-07, + "loss": 0.0, + "step": 27644 + }, + { + "epoch": 11.242374949166328, + "grad_norm": 0.004842042064518754, + "learning_rate": 9.435583422150363e-07, + "loss": 0.0, + "step": 27645 + }, + { + "epoch": 11.242781618544123, + "grad_norm": 8.500072562262818e-05, + "learning_rate": 9.431287579008041e-07, + "loss": 0.0, + "step": 27646 + }, + { + "epoch": 11.24318828792192, + "grad_norm": 0.0043728638674632725, + "learning_rate": 9.426992665599111e-07, + "loss": 0.0, + "step": 27647 + }, + { + "epoch": 11.243594957299715, + "grad_norm": 0.008251223159273213, + "learning_rate": 9.422698681967701e-07, + "loss": 0.0001, + "step": 27648 + }, + { + "epoch": 11.24400162667751, + "grad_norm": 0.07968174922621048, + "learning_rate": 9.41840562815789e-07, + "loss": 0.0006, + "step": 27649 + }, + { + "epoch": 11.244408296055306, + "grad_norm": 0.0002279224109638635, + "learning_rate": 9.414113504213729e-07, + "loss": 0.0, + "step": 27650 + }, + { + "epoch": 11.244814965433102, + "grad_norm": 0.17044532414361682, + "learning_rate": 9.409822310179273e-07, + "loss": 0.0012, + "step": 27651 + }, + { + "epoch": 11.245221634810898, + "grad_norm": 0.031453859704222806, + "learning_rate": 9.405532046098598e-07, + "loss": 0.0002, + "step": 27652 + }, + { + "epoch": 11.245628304188694, + "grad_norm": 0.00036375826113611094, + "learning_rate": 9.401242712015723e-07, + "loss": 0.0, + "step": 27653 + }, + { + "epoch": 11.246034973566491, + "grad_norm": 0.004402125784359765, + "learning_rate": 9.39695430797467e-07, + "loss": 0.0, + "step": 27654 + }, + { + "epoch": 11.246441642944287, + "grad_norm": 0.0010385454525915806, + "learning_rate": 9.392666834019504e-07, + "loss": 0.0, + "step": 27655 + }, + { + "epoch": 11.246848312322083, + "grad_norm": 0.009890058993996077, + "learning_rate": 9.388380290194222e-07, + "loss": 0.0001, + "step": 27656 + }, + { + "epoch": 11.247254981699879, + "grad_norm": 0.0038963031813480575, + "learning_rate": 9.38409467654281e-07, + "loss": 0.0, + "step": 27657 
+ }, + { + "epoch": 11.247661651077674, + "grad_norm": 0.002263933810781349, + "learning_rate": 9.379809993109268e-07, + "loss": 0.0, + "step": 27658 + }, + { + "epoch": 11.24806832045547, + "grad_norm": 0.028489211224543298, + "learning_rate": 9.375526239937594e-07, + "loss": 0.0002, + "step": 27659 + }, + { + "epoch": 11.248474989833266, + "grad_norm": 0.03192776878784288, + "learning_rate": 9.37124341707173e-07, + "loss": 0.0003, + "step": 27660 + }, + { + "epoch": 11.248881659211062, + "grad_norm": 0.002629549568745106, + "learning_rate": 9.366961524555684e-07, + "loss": 0.0, + "step": 27661 + }, + { + "epoch": 11.249288328588857, + "grad_norm": 0.025040460912576067, + "learning_rate": 9.36268056243339e-07, + "loss": 0.0003, + "step": 27662 + }, + { + "epoch": 11.249694997966653, + "grad_norm": 0.00010122982324738296, + "learning_rate": 9.358400530748802e-07, + "loss": 0.0, + "step": 27663 + }, + { + "epoch": 11.250101667344449, + "grad_norm": 0.02162289145069197, + "learning_rate": 9.354121429545848e-07, + "loss": 0.0002, + "step": 27664 + }, + { + "epoch": 11.250508336722245, + "grad_norm": 0.0033186136458979807, + "learning_rate": 9.349843258868463e-07, + "loss": 0.0, + "step": 27665 + }, + { + "epoch": 11.25091500610004, + "grad_norm": 0.013690568815440526, + "learning_rate": 9.345566018760555e-07, + "loss": 0.0001, + "step": 27666 + }, + { + "epoch": 11.251321675477836, + "grad_norm": 0.019663865056016742, + "learning_rate": 9.341289709266022e-07, + "loss": 0.0001, + "step": 27667 + }, + { + "epoch": 11.251728344855632, + "grad_norm": 0.008028859380703836, + "learning_rate": 9.337014330428795e-07, + "loss": 0.0, + "step": 27668 + }, + { + "epoch": 11.252135014233428, + "grad_norm": 0.030492063417548006, + "learning_rate": 9.332739882292752e-07, + "loss": 0.0002, + "step": 27669 + }, + { + "epoch": 11.252541683611224, + "grad_norm": 0.039009817697083296, + "learning_rate": 9.328466364901756e-07, + "loss": 0.0003, + "step": 27670 + }, + { + "epoch": 11.25294835298902, + "grad_norm": 0.00832906376841972, + "learning_rate": 9.324193778299695e-07, + "loss": 0.0001, + "step": 27671 + }, + { + "epoch": 11.253355022366815, + "grad_norm": 0.0009356655702048099, + "learning_rate": 9.319922122530412e-07, + "loss": 0.0, + "step": 27672 + }, + { + "epoch": 11.25376169174461, + "grad_norm": 0.030174141223461882, + "learning_rate": 9.315651397637759e-07, + "loss": 0.0003, + "step": 27673 + }, + { + "epoch": 11.254168361122407, + "grad_norm": 0.008205569599956698, + "learning_rate": 9.311381603665592e-07, + "loss": 0.0001, + "step": 27674 + }, + { + "epoch": 11.254575030500204, + "grad_norm": 0.002769742030639827, + "learning_rate": 9.307112740657742e-07, + "loss": 0.0, + "step": 27675 + }, + { + "epoch": 11.254981699878, + "grad_norm": 0.25855871492031196, + "learning_rate": 9.302844808658018e-07, + "loss": 0.0013, + "step": 27676 + }, + { + "epoch": 11.255388369255796, + "grad_norm": 0.011257749351626838, + "learning_rate": 9.298577807710252e-07, + "loss": 0.0, + "step": 27677 + }, + { + "epoch": 11.255795038633591, + "grad_norm": 0.00023507241246363642, + "learning_rate": 9.29431173785822e-07, + "loss": 0.0, + "step": 27678 + }, + { + "epoch": 11.256201708011387, + "grad_norm": 0.23221736903659848, + "learning_rate": 9.290046599145741e-07, + "loss": 0.0017, + "step": 27679 + }, + { + "epoch": 11.256608377389183, + "grad_norm": 0.0013014224850249625, + "learning_rate": 9.285782391616561e-07, + "loss": 0.0, + "step": 27680 + }, + { + "epoch": 11.257015046766979, + "grad_norm": 
0.026330145141802836, + "learning_rate": 9.281519115314497e-07, + "loss": 0.0001, + "step": 27681 + }, + { + "epoch": 11.257421716144774, + "grad_norm": 0.009808972384003774, + "learning_rate": 9.277256770283305e-07, + "loss": 0.0001, + "step": 27682 + }, + { + "epoch": 11.25782838552257, + "grad_norm": 0.002522884674304381, + "learning_rate": 9.272995356566728e-07, + "loss": 0.0, + "step": 27683 + }, + { + "epoch": 11.258235054900366, + "grad_norm": 0.007363458682418853, + "learning_rate": 9.268734874208519e-07, + "loss": 0.0, + "step": 27684 + }, + { + "epoch": 11.258641724278162, + "grad_norm": 0.012945781366877568, + "learning_rate": 9.264475323252408e-07, + "loss": 0.0002, + "step": 27685 + }, + { + "epoch": 11.259048393655958, + "grad_norm": 0.02215028017074442, + "learning_rate": 9.260216703742098e-07, + "loss": 0.0001, + "step": 27686 + }, + { + "epoch": 11.259455063033753, + "grad_norm": 0.0008101886100959513, + "learning_rate": 9.255959015721361e-07, + "loss": 0.0, + "step": 27687 + }, + { + "epoch": 11.259861732411549, + "grad_norm": 0.024751171491214512, + "learning_rate": 9.251702259233875e-07, + "loss": 0.0003, + "step": 27688 + }, + { + "epoch": 11.260268401789345, + "grad_norm": 0.015393143891502339, + "learning_rate": 9.247446434323326e-07, + "loss": 0.0001, + "step": 27689 + }, + { + "epoch": 11.26067507116714, + "grad_norm": 0.08584220707744393, + "learning_rate": 9.243191541033425e-07, + "loss": 0.0007, + "step": 27690 + }, + { + "epoch": 11.261081740544936, + "grad_norm": 0.016449421772906724, + "learning_rate": 9.238937579407836e-07, + "loss": 0.0001, + "step": 27691 + }, + { + "epoch": 11.261488409922732, + "grad_norm": 0.002653524092446039, + "learning_rate": 9.234684549490236e-07, + "loss": 0.0, + "step": 27692 + }, + { + "epoch": 11.261895079300528, + "grad_norm": 0.0014846324325397525, + "learning_rate": 9.230432451324256e-07, + "loss": 0.0, + "step": 27693 + }, + { + "epoch": 11.262301748678324, + "grad_norm": 0.11026211266111878, + "learning_rate": 9.226181284953595e-07, + "loss": 0.0013, + "step": 27694 + }, + { + "epoch": 11.262708418056121, + "grad_norm": 0.001561246373284544, + "learning_rate": 9.221931050421862e-07, + "loss": 0.0, + "step": 27695 + }, + { + "epoch": 11.263115087433917, + "grad_norm": 0.0018025181661244446, + "learning_rate": 9.2176817477727e-07, + "loss": 0.0, + "step": 27696 + }, + { + "epoch": 11.263521756811713, + "grad_norm": 0.00041516394944223444, + "learning_rate": 9.213433377049719e-07, + "loss": 0.0, + "step": 27697 + }, + { + "epoch": 11.263928426189509, + "grad_norm": 0.16484107862344075, + "learning_rate": 9.209185938296539e-07, + "loss": 0.0012, + "step": 27698 + }, + { + "epoch": 11.264335095567304, + "grad_norm": 0.0017432820907459874, + "learning_rate": 9.204939431556748e-07, + "loss": 0.0, + "step": 27699 + }, + { + "epoch": 11.2647417649451, + "grad_norm": 0.01331768193054799, + "learning_rate": 9.200693856873955e-07, + "loss": 0.0001, + "step": 27700 + }, + { + "epoch": 11.265148434322896, + "grad_norm": 0.009232444520393476, + "learning_rate": 9.196449214291747e-07, + "loss": 0.0, + "step": 27701 + }, + { + "epoch": 11.265555103700692, + "grad_norm": 0.009180908230830639, + "learning_rate": 9.192205503853691e-07, + "loss": 0.0001, + "step": 27702 + }, + { + "epoch": 11.265961773078487, + "grad_norm": 0.18650895094619974, + "learning_rate": 9.187962725603339e-07, + "loss": 0.0013, + "step": 27703 + }, + { + "epoch": 11.266368442456283, + "grad_norm": 0.00016124824604886042, + "learning_rate": 9.183720879584257e-07, + 
"loss": 0.0, + "step": 27704 + }, + { + "epoch": 11.266775111834079, + "grad_norm": 0.4415746526900174, + "learning_rate": 9.179479965839988e-07, + "loss": 0.0038, + "step": 27705 + }, + { + "epoch": 11.267181781211875, + "grad_norm": 0.33035366382822884, + "learning_rate": 9.175239984414053e-07, + "loss": 0.0033, + "step": 27706 + }, + { + "epoch": 11.26758845058967, + "grad_norm": 0.08459161097980594, + "learning_rate": 9.171000935350005e-07, + "loss": 0.0002, + "step": 27707 + }, + { + "epoch": 11.267995119967466, + "grad_norm": 0.0028934646532405915, + "learning_rate": 9.166762818691343e-07, + "loss": 0.0, + "step": 27708 + }, + { + "epoch": 11.268401789345262, + "grad_norm": 0.001036239163819472, + "learning_rate": 9.162525634481578e-07, + "loss": 0.0, + "step": 27709 + }, + { + "epoch": 11.268808458723058, + "grad_norm": 0.009486678774465844, + "learning_rate": 9.158289382764208e-07, + "loss": 0.0001, + "step": 27710 + }, + { + "epoch": 11.269215128100853, + "grad_norm": 0.09134141606010618, + "learning_rate": 9.154054063582707e-07, + "loss": 0.0007, + "step": 27711 + }, + { + "epoch": 11.26962179747865, + "grad_norm": 0.013958807311042471, + "learning_rate": 9.149819676980576e-07, + "loss": 0.0001, + "step": 27712 + }, + { + "epoch": 11.270028466856445, + "grad_norm": 0.13402153781060117, + "learning_rate": 9.145586223001246e-07, + "loss": 0.0011, + "step": 27713 + }, + { + "epoch": 11.27043513623424, + "grad_norm": 0.0003800222058280333, + "learning_rate": 9.141353701688215e-07, + "loss": 0.0, + "step": 27714 + }, + { + "epoch": 11.270841805612037, + "grad_norm": 0.021406114623127925, + "learning_rate": 9.137122113084917e-07, + "loss": 0.0002, + "step": 27715 + }, + { + "epoch": 11.271248474989834, + "grad_norm": 0.01706589817049141, + "learning_rate": 9.132891457234793e-07, + "loss": 0.0002, + "step": 27716 + }, + { + "epoch": 11.27165514436763, + "grad_norm": 0.00020007190308908053, + "learning_rate": 9.128661734181276e-07, + "loss": 0.0, + "step": 27717 + }, + { + "epoch": 11.272061813745426, + "grad_norm": 0.036567064840536005, + "learning_rate": 9.124432943967776e-07, + "loss": 0.0002, + "step": 27718 + }, + { + "epoch": 11.272468483123221, + "grad_norm": 0.012657491187964758, + "learning_rate": 9.12020508663769e-07, + "loss": 0.0002, + "step": 27719 + }, + { + "epoch": 11.272875152501017, + "grad_norm": 0.0016968271941601696, + "learning_rate": 9.115978162234451e-07, + "loss": 0.0, + "step": 27720 + }, + { + "epoch": 11.273281821878813, + "grad_norm": 0.00547386175419438, + "learning_rate": 9.111752170801447e-07, + "loss": 0.0001, + "step": 27721 + }, + { + "epoch": 11.273688491256609, + "grad_norm": 0.008572729781139113, + "learning_rate": 9.107527112382042e-07, + "loss": 0.0001, + "step": 27722 + }, + { + "epoch": 11.274095160634404, + "grad_norm": 0.02635006629549642, + "learning_rate": 9.103302987019613e-07, + "loss": 0.0002, + "step": 27723 + }, + { + "epoch": 11.2745018300122, + "grad_norm": 0.0037191121737089883, + "learning_rate": 9.099079794757537e-07, + "loss": 0.0, + "step": 27724 + }, + { + "epoch": 11.274908499389996, + "grad_norm": 0.014259244721631382, + "learning_rate": 9.094857535639157e-07, + "loss": 0.0001, + "step": 27725 + }, + { + "epoch": 11.275315168767792, + "grad_norm": 0.20255886277532234, + "learning_rate": 9.090636209707782e-07, + "loss": 0.0021, + "step": 27726 + }, + { + "epoch": 11.275721838145587, + "grad_norm": 0.00021118565936555962, + "learning_rate": 9.08641581700681e-07, + "loss": 0.0, + "step": 27727 + }, + { + "epoch": 
11.276128507523383, + "grad_norm": 0.03078013907045041, + "learning_rate": 9.082196357579542e-07, + "loss": 0.0001, + "step": 27728 + }, + { + "epoch": 11.276535176901179, + "grad_norm": 0.013986477445284886, + "learning_rate": 9.077977831469275e-07, + "loss": 0.0001, + "step": 27729 + }, + { + "epoch": 11.276941846278975, + "grad_norm": 0.00195797013551985, + "learning_rate": 9.073760238719343e-07, + "loss": 0.0, + "step": 27730 + }, + { + "epoch": 11.27734851565677, + "grad_norm": 0.0001358955284006445, + "learning_rate": 9.069543579372996e-07, + "loss": 0.0, + "step": 27731 + }, + { + "epoch": 11.277755185034566, + "grad_norm": 0.005449778510121397, + "learning_rate": 9.06532785347356e-07, + "loss": 0.0, + "step": 27732 + }, + { + "epoch": 11.278161854412362, + "grad_norm": 0.004421398472992234, + "learning_rate": 9.061113061064308e-07, + "loss": 0.0, + "step": 27733 + }, + { + "epoch": 11.278568523790158, + "grad_norm": 0.04072325437033397, + "learning_rate": 9.056899202188496e-07, + "loss": 0.0003, + "step": 27734 + }, + { + "epoch": 11.278975193167954, + "grad_norm": 0.0036078604199372873, + "learning_rate": 9.052686276889378e-07, + "loss": 0.0, + "step": 27735 + }, + { + "epoch": 11.279381862545751, + "grad_norm": 0.0002728950247437111, + "learning_rate": 9.048474285210218e-07, + "loss": 0.0, + "step": 27736 + }, + { + "epoch": 11.279788531923547, + "grad_norm": 0.6711791655549113, + "learning_rate": 9.044263227194217e-07, + "loss": 0.0061, + "step": 27737 + }, + { + "epoch": 11.280195201301343, + "grad_norm": 0.006602314869956479, + "learning_rate": 9.040053102884649e-07, + "loss": 0.0001, + "step": 27738 + }, + { + "epoch": 11.280601870679138, + "grad_norm": 0.09833152621950421, + "learning_rate": 9.035843912324727e-07, + "loss": 0.0006, + "step": 27739 + }, + { + "epoch": 11.281008540056934, + "grad_norm": 0.002167588305834036, + "learning_rate": 9.031635655557636e-07, + "loss": 0.0, + "step": 27740 + }, + { + "epoch": 11.28141520943473, + "grad_norm": 0.0074396104376469475, + "learning_rate": 9.027428332626597e-07, + "loss": 0.0001, + "step": 27741 + }, + { + "epoch": 11.281821878812526, + "grad_norm": 0.014490919250625229, + "learning_rate": 9.023221943574789e-07, + "loss": 0.0001, + "step": 27742 + }, + { + "epoch": 11.282228548190322, + "grad_norm": 0.14587673583511648, + "learning_rate": 9.019016488445398e-07, + "loss": 0.0016, + "step": 27743 + }, + { + "epoch": 11.282635217568117, + "grad_norm": 4.43154217991193e-05, + "learning_rate": 9.014811967281567e-07, + "loss": 0.0, + "step": 27744 + }, + { + "epoch": 11.283041886945913, + "grad_norm": 0.0009854721543008141, + "learning_rate": 9.010608380126506e-07, + "loss": 0.0, + "step": 27745 + }, + { + "epoch": 11.283448556323709, + "grad_norm": 0.04386963899966676, + "learning_rate": 9.006405727023338e-07, + "loss": 0.0001, + "step": 27746 + }, + { + "epoch": 11.283855225701505, + "grad_norm": 0.03273653652158751, + "learning_rate": 9.002204008015214e-07, + "loss": 0.0004, + "step": 27747 + }, + { + "epoch": 11.2842618950793, + "grad_norm": 0.0932802842219224, + "learning_rate": 8.998003223145269e-07, + "loss": 0.0008, + "step": 27748 + }, + { + "epoch": 11.284668564457096, + "grad_norm": 0.04084392568470348, + "learning_rate": 8.993803372456622e-07, + "loss": 0.0002, + "step": 27749 + }, + { + "epoch": 11.285075233834892, + "grad_norm": 5.529664905188212e-05, + "learning_rate": 8.989604455992362e-07, + "loss": 0.0, + "step": 27750 + }, + { + "epoch": 11.285481903212688, + "grad_norm": 0.37378810689050346, + "learning_rate": 
8.985406473795644e-07, + "loss": 0.0036, + "step": 27751 + }, + { + "epoch": 11.285888572590483, + "grad_norm": 0.004198329998695451, + "learning_rate": 8.981209425909521e-07, + "loss": 0.0, + "step": 27752 + }, + { + "epoch": 11.28629524196828, + "grad_norm": 0.0045913303730353815, + "learning_rate": 8.977013312377103e-07, + "loss": 0.0001, + "step": 27753 + }, + { + "epoch": 11.286701911346075, + "grad_norm": 0.005019667879494652, + "learning_rate": 8.972818133241445e-07, + "loss": 0.0001, + "step": 27754 + }, + { + "epoch": 11.28710858072387, + "grad_norm": 0.02553089622441609, + "learning_rate": 8.968623888545636e-07, + "loss": 0.0003, + "step": 27755 + }, + { + "epoch": 11.287515250101666, + "grad_norm": 0.001172517493838312, + "learning_rate": 8.964430578332706e-07, + "loss": 0.0, + "step": 27756 + }, + { + "epoch": 11.287921919479464, + "grad_norm": 0.05580083881341808, + "learning_rate": 8.960238202645699e-07, + "loss": 0.0002, + "step": 27757 + }, + { + "epoch": 11.28832858885726, + "grad_norm": 0.10214222630595414, + "learning_rate": 8.956046761527682e-07, + "loss": 0.0008, + "step": 27758 + }, + { + "epoch": 11.288735258235056, + "grad_norm": 0.010531403408352995, + "learning_rate": 8.951856255021674e-07, + "loss": 0.0001, + "step": 27759 + }, + { + "epoch": 11.289141927612851, + "grad_norm": 0.0020624755458609656, + "learning_rate": 8.947666683170675e-07, + "loss": 0.0, + "step": 27760 + }, + { + "epoch": 11.289548596990647, + "grad_norm": 0.00012566865769768343, + "learning_rate": 8.943478046017706e-07, + "loss": 0.0, + "step": 27761 + }, + { + "epoch": 11.289955266368443, + "grad_norm": 0.005533682014368114, + "learning_rate": 8.939290343605766e-07, + "loss": 0.0001, + "step": 27762 + }, + { + "epoch": 11.290361935746239, + "grad_norm": 0.4540517275265066, + "learning_rate": 8.935103575977822e-07, + "loss": 0.0038, + "step": 27763 + }, + { + "epoch": 11.290768605124034, + "grad_norm": 0.0656352233769601, + "learning_rate": 8.930917743176892e-07, + "loss": 0.0005, + "step": 27764 + }, + { + "epoch": 11.29117527450183, + "grad_norm": 0.0010415089216777183, + "learning_rate": 8.926732845245922e-07, + "loss": 0.0, + "step": 27765 + }, + { + "epoch": 11.291581943879626, + "grad_norm": 0.016983064293985082, + "learning_rate": 8.922548882227877e-07, + "loss": 0.0001, + "step": 27766 + }, + { + "epoch": 11.291988613257422, + "grad_norm": 6.589566460230887e-05, + "learning_rate": 8.9183658541657e-07, + "loss": 0.0, + "step": 27767 + }, + { + "epoch": 11.292395282635217, + "grad_norm": 0.0001766839645248642, + "learning_rate": 8.914183761102347e-07, + "loss": 0.0, + "step": 27768 + }, + { + "epoch": 11.292801952013013, + "grad_norm": 0.003224289256117835, + "learning_rate": 8.910002603080736e-07, + "loss": 0.0, + "step": 27769 + }, + { + "epoch": 11.293208621390809, + "grad_norm": 0.00011557528807956533, + "learning_rate": 8.90582238014378e-07, + "loss": 0.0, + "step": 27770 + }, + { + "epoch": 11.293615290768605, + "grad_norm": 0.002045212901590454, + "learning_rate": 8.901643092334411e-07, + "loss": 0.0, + "step": 27771 + }, + { + "epoch": 11.2940219601464, + "grad_norm": 0.0034051884241948775, + "learning_rate": 8.897464739695538e-07, + "loss": 0.0, + "step": 27772 + }, + { + "epoch": 11.294428629524196, + "grad_norm": 0.010784926565315312, + "learning_rate": 8.893287322270028e-07, + "loss": 0.0001, + "step": 27773 + }, + { + "epoch": 11.294835298901992, + "grad_norm": 0.010413133395987362, + "learning_rate": 8.889110840100778e-07, + "loss": 0.0001, + "step": 27774 + }, + { + 
"epoch": 11.295241968279788, + "grad_norm": 0.03953481706999216, + "learning_rate": 8.884935293230668e-07, + "loss": 0.0004, + "step": 27775 + }, + { + "epoch": 11.295648637657584, + "grad_norm": 0.0021093486928481513, + "learning_rate": 8.880760681702539e-07, + "loss": 0.0, + "step": 27776 + }, + { + "epoch": 11.296055307035381, + "grad_norm": 9.057964735320602e-05, + "learning_rate": 8.876587005559268e-07, + "loss": 0.0, + "step": 27777 + }, + { + "epoch": 11.296461976413177, + "grad_norm": 0.001990361779350175, + "learning_rate": 8.872414264843698e-07, + "loss": 0.0, + "step": 27778 + }, + { + "epoch": 11.296868645790973, + "grad_norm": 0.001976676500284939, + "learning_rate": 8.868242459598652e-07, + "loss": 0.0, + "step": 27779 + }, + { + "epoch": 11.297275315168768, + "grad_norm": 0.008358682923336606, + "learning_rate": 8.864071589866974e-07, + "loss": 0.0001, + "step": 27780 + }, + { + "epoch": 11.297681984546564, + "grad_norm": 0.0008102404891123655, + "learning_rate": 8.859901655691461e-07, + "loss": 0.0, + "step": 27781 + }, + { + "epoch": 11.29808865392436, + "grad_norm": 0.001202080518581142, + "learning_rate": 8.855732657114935e-07, + "loss": 0.0, + "step": 27782 + }, + { + "epoch": 11.298495323302156, + "grad_norm": 0.06027298944165144, + "learning_rate": 8.851564594180162e-07, + "loss": 0.0002, + "step": 27783 + }, + { + "epoch": 11.298901992679951, + "grad_norm": 0.0014422593990205893, + "learning_rate": 8.847397466929974e-07, + "loss": 0.0, + "step": 27784 + }, + { + "epoch": 11.299308662057747, + "grad_norm": 0.014910577550201696, + "learning_rate": 8.843231275407127e-07, + "loss": 0.0001, + "step": 27785 + }, + { + "epoch": 11.299715331435543, + "grad_norm": 0.0023500405125793343, + "learning_rate": 8.839066019654386e-07, + "loss": 0.0, + "step": 27786 + }, + { + "epoch": 11.300122000813339, + "grad_norm": 0.015428383581806192, + "learning_rate": 8.834901699714505e-07, + "loss": 0.0001, + "step": 27787 + }, + { + "epoch": 11.300528670191134, + "grad_norm": 0.018198805769051554, + "learning_rate": 8.830738315630249e-07, + "loss": 0.0001, + "step": 27788 + }, + { + "epoch": 11.30093533956893, + "grad_norm": 0.026362501523737952, + "learning_rate": 8.826575867444331e-07, + "loss": 0.0002, + "step": 27789 + }, + { + "epoch": 11.301342008946726, + "grad_norm": 0.005039833728172953, + "learning_rate": 8.822414355199516e-07, + "loss": 0.0, + "step": 27790 + }, + { + "epoch": 11.301748678324522, + "grad_norm": 0.0004814442607773025, + "learning_rate": 8.818253778938512e-07, + "loss": 0.0, + "step": 27791 + }, + { + "epoch": 11.302155347702318, + "grad_norm": 0.5064144843957398, + "learning_rate": 8.81409413870401e-07, + "loss": 0.0042, + "step": 27792 + }, + { + "epoch": 11.302562017080113, + "grad_norm": 0.03708271424429012, + "learning_rate": 8.809935434538741e-07, + "loss": 0.0003, + "step": 27793 + }, + { + "epoch": 11.302968686457909, + "grad_norm": 0.016793971300863946, + "learning_rate": 8.805777666485371e-07, + "loss": 0.0001, + "step": 27794 + }, + { + "epoch": 11.303375355835705, + "grad_norm": 0.0026824720681692285, + "learning_rate": 8.801620834586589e-07, + "loss": 0.0, + "step": 27795 + }, + { + "epoch": 11.3037820252135, + "grad_norm": 0.013714832653067664, + "learning_rate": 8.797464938885047e-07, + "loss": 0.0001, + "step": 27796 + }, + { + "epoch": 11.304188694591296, + "grad_norm": 0.0424172032533958, + "learning_rate": 8.793309979423458e-07, + "loss": 0.0005, + "step": 27797 + }, + { + "epoch": 11.304595363969094, + "grad_norm": 0.031561746648805046, + 
"learning_rate": 8.789155956244433e-07, + "loss": 0.0003, + "step": 27798 + }, + { + "epoch": 11.30500203334689, + "grad_norm": 0.014703039337664215, + "learning_rate": 8.785002869390635e-07, + "loss": 0.0001, + "step": 27799 + }, + { + "epoch": 11.305408702724685, + "grad_norm": 0.0019428168975035548, + "learning_rate": 8.780850718904688e-07, + "loss": 0.0, + "step": 27800 + }, + { + "epoch": 11.305815372102481, + "grad_norm": 0.5955523948532947, + "learning_rate": 8.776699504829222e-07, + "loss": 0.0052, + "step": 27801 + }, + { + "epoch": 11.306222041480277, + "grad_norm": 0.051211118592116835, + "learning_rate": 8.772549227206827e-07, + "loss": 0.0005, + "step": 27802 + }, + { + "epoch": 11.306628710858073, + "grad_norm": 0.003271106794628965, + "learning_rate": 8.768399886080148e-07, + "loss": 0.0, + "step": 27803 + }, + { + "epoch": 11.307035380235869, + "grad_norm": 0.002773094296085965, + "learning_rate": 8.764251481491759e-07, + "loss": 0.0, + "step": 27804 + }, + { + "epoch": 11.307442049613664, + "grad_norm": 0.05675668707064373, + "learning_rate": 8.760104013484238e-07, + "loss": 0.0005, + "step": 27805 + }, + { + "epoch": 11.30784871899146, + "grad_norm": 6.95363519717574e-05, + "learning_rate": 8.755957482100174e-07, + "loss": 0.0, + "step": 27806 + }, + { + "epoch": 11.308255388369256, + "grad_norm": 0.5125750997155485, + "learning_rate": 8.751811887382133e-07, + "loss": 0.0071, + "step": 27807 + }, + { + "epoch": 11.308662057747052, + "grad_norm": 0.042885807882840174, + "learning_rate": 8.747667229372669e-07, + "loss": 0.0003, + "step": 27808 + }, + { + "epoch": 11.309068727124847, + "grad_norm": 0.16446329684453956, + "learning_rate": 8.743523508114305e-07, + "loss": 0.0018, + "step": 27809 + }, + { + "epoch": 11.309475396502643, + "grad_norm": 0.018777288491963775, + "learning_rate": 8.739380723649616e-07, + "loss": 0.0002, + "step": 27810 + }, + { + "epoch": 11.309882065880439, + "grad_norm": 0.0008961691773773985, + "learning_rate": 8.735238876021113e-07, + "loss": 0.0, + "step": 27811 + }, + { + "epoch": 11.310288735258235, + "grad_norm": 0.000513825447831881, + "learning_rate": 8.731097965271329e-07, + "loss": 0.0, + "step": 27812 + }, + { + "epoch": 11.31069540463603, + "grad_norm": 0.013878757196770428, + "learning_rate": 8.726957991442742e-07, + "loss": 0.0001, + "step": 27813 + }, + { + "epoch": 11.311102074013826, + "grad_norm": 0.0045952053377615616, + "learning_rate": 8.722818954577883e-07, + "loss": 0.0, + "step": 27814 + }, + { + "epoch": 11.311508743391622, + "grad_norm": 0.007835741124229575, + "learning_rate": 8.718680854719197e-07, + "loss": 0.0001, + "step": 27815 + }, + { + "epoch": 11.311915412769418, + "grad_norm": 0.0008552862132364164, + "learning_rate": 8.714543691909217e-07, + "loss": 0.0, + "step": 27816 + }, + { + "epoch": 11.312322082147213, + "grad_norm": 0.06558743580503304, + "learning_rate": 8.710407466190385e-07, + "loss": 0.0006, + "step": 27817 + }, + { + "epoch": 11.312728751525011, + "grad_norm": 0.0006773295639430448, + "learning_rate": 8.706272177605168e-07, + "loss": 0.0, + "step": 27818 + }, + { + "epoch": 11.313135420902807, + "grad_norm": 0.006266858120974683, + "learning_rate": 8.702137826196011e-07, + "loss": 0.0, + "step": 27819 + }, + { + "epoch": 11.313542090280603, + "grad_norm": 0.023831423656552466, + "learning_rate": 8.698004412005368e-07, + "loss": 0.0002, + "step": 27820 + }, + { + "epoch": 11.313948759658398, + "grad_norm": 0.001225287106161581, + "learning_rate": 8.693871935075648e-07, + "loss": 0.0, + "step": 27821 
+ }, + { + "epoch": 11.314355429036194, + "grad_norm": 1.1097013257376147, + "learning_rate": 8.689740395449275e-07, + "loss": 0.0089, + "step": 27822 + }, + { + "epoch": 11.31476209841399, + "grad_norm": 0.04615668154361113, + "learning_rate": 8.685609793168692e-07, + "loss": 0.0005, + "step": 27823 + }, + { + "epoch": 11.315168767791786, + "grad_norm": 0.031527313179289126, + "learning_rate": 8.681480128276276e-07, + "loss": 0.0003, + "step": 27824 + }, + { + "epoch": 11.315575437169581, + "grad_norm": 0.01101055442357582, + "learning_rate": 8.677351400814438e-07, + "loss": 0.0001, + "step": 27825 + }, + { + "epoch": 11.315982106547377, + "grad_norm": 0.004987334934400474, + "learning_rate": 8.673223610825532e-07, + "loss": 0.0, + "step": 27826 + }, + { + "epoch": 11.316388775925173, + "grad_norm": 0.00034228856881696066, + "learning_rate": 8.669096758351958e-07, + "loss": 0.0, + "step": 27827 + }, + { + "epoch": 11.316795445302969, + "grad_norm": 0.0009589882840482012, + "learning_rate": 8.664970843436082e-07, + "loss": 0.0, + "step": 27828 + }, + { + "epoch": 11.317202114680764, + "grad_norm": 0.0006621597657750558, + "learning_rate": 8.660845866120216e-07, + "loss": 0.0, + "step": 27829 + }, + { + "epoch": 11.31760878405856, + "grad_norm": 0.0035506767893043363, + "learning_rate": 8.656721826446779e-07, + "loss": 0.0, + "step": 27830 + }, + { + "epoch": 11.318015453436356, + "grad_norm": 0.8025063623920448, + "learning_rate": 8.65259872445805e-07, + "loss": 0.0093, + "step": 27831 + }, + { + "epoch": 11.318422122814152, + "grad_norm": 0.00166556079833165, + "learning_rate": 8.648476560196362e-07, + "loss": 0.0, + "step": 27832 + }, + { + "epoch": 11.318828792191947, + "grad_norm": 5.1548814189746766e-05, + "learning_rate": 8.644355333704036e-07, + "loss": 0.0, + "step": 27833 + }, + { + "epoch": 11.319235461569743, + "grad_norm": 0.000299129829706309, + "learning_rate": 8.640235045023371e-07, + "loss": 0.0, + "step": 27834 + }, + { + "epoch": 11.319642130947539, + "grad_norm": 0.48236594456275567, + "learning_rate": 8.63611569419669e-07, + "loss": 0.005, + "step": 27835 + }, + { + "epoch": 11.320048800325335, + "grad_norm": 0.031660192996049136, + "learning_rate": 8.631997281266257e-07, + "loss": 0.0002, + "step": 27836 + }, + { + "epoch": 11.32045546970313, + "grad_norm": 0.040609693868508726, + "learning_rate": 8.627879806274364e-07, + "loss": 0.0004, + "step": 27837 + }, + { + "epoch": 11.320862139080926, + "grad_norm": 0.0007226692219297521, + "learning_rate": 8.623763269263275e-07, + "loss": 0.0, + "step": 27838 + }, + { + "epoch": 11.321268808458724, + "grad_norm": 0.0013446486776606307, + "learning_rate": 8.619647670275233e-07, + "loss": 0.0, + "step": 27839 + }, + { + "epoch": 11.32167547783652, + "grad_norm": 0.003912757465104926, + "learning_rate": 8.615533009352495e-07, + "loss": 0.0, + "step": 27840 + }, + { + "epoch": 11.322082147214315, + "grad_norm": 0.00826043638837465, + "learning_rate": 8.611419286537303e-07, + "loss": 0.0001, + "step": 27841 + }, + { + "epoch": 11.322488816592111, + "grad_norm": 0.011637801504317048, + "learning_rate": 8.607306501871903e-07, + "loss": 0.0001, + "step": 27842 + }, + { + "epoch": 11.322895485969907, + "grad_norm": 0.17416156225099258, + "learning_rate": 8.603194655398495e-07, + "loss": 0.0017, + "step": 27843 + }, + { + "epoch": 11.323302155347703, + "grad_norm": 0.0350583114684934, + "learning_rate": 8.599083747159287e-07, + "loss": 0.0004, + "step": 27844 + }, + { + "epoch": 11.323708824725498, + "grad_norm": 0.012540832973241935, + 
"learning_rate": 8.594973777196491e-07, + "loss": 0.0001, + "step": 27845 + }, + { + "epoch": 11.324115494103294, + "grad_norm": 0.04416969242970995, + "learning_rate": 8.590864745552285e-07, + "loss": 0.0003, + "step": 27846 + }, + { + "epoch": 11.32452216348109, + "grad_norm": 0.0321216904996893, + "learning_rate": 8.586756652268846e-07, + "loss": 0.0002, + "step": 27847 + }, + { + "epoch": 11.324928832858886, + "grad_norm": 0.010404175765697476, + "learning_rate": 8.582649497388373e-07, + "loss": 0.0, + "step": 27848 + }, + { + "epoch": 11.325335502236682, + "grad_norm": 0.00204152954998663, + "learning_rate": 8.578543280953022e-07, + "loss": 0.0, + "step": 27849 + }, + { + "epoch": 11.325742171614477, + "grad_norm": 0.028896796050449577, + "learning_rate": 8.574438003004926e-07, + "loss": 0.0002, + "step": 27850 + }, + { + "epoch": 11.326148840992273, + "grad_norm": 0.001037752551618247, + "learning_rate": 8.570333663586239e-07, + "loss": 0.0, + "step": 27851 + }, + { + "epoch": 11.326555510370069, + "grad_norm": 0.0020677477024363226, + "learning_rate": 8.566230262739095e-07, + "loss": 0.0, + "step": 27852 + }, + { + "epoch": 11.326962179747865, + "grad_norm": 0.20715459081040863, + "learning_rate": 8.562127800505593e-07, + "loss": 0.0021, + "step": 27853 + }, + { + "epoch": 11.32736884912566, + "grad_norm": 0.021214297444479103, + "learning_rate": 8.558026276927889e-07, + "loss": 0.0002, + "step": 27854 + }, + { + "epoch": 11.327775518503456, + "grad_norm": 0.013033112728950934, + "learning_rate": 8.553925692048059e-07, + "loss": 0.0001, + "step": 27855 + }, + { + "epoch": 11.328182187881252, + "grad_norm": 9.647025151662655e-05, + "learning_rate": 8.549826045908216e-07, + "loss": 0.0, + "step": 27856 + }, + { + "epoch": 11.328588857259048, + "grad_norm": 0.001170461966169685, + "learning_rate": 8.545727338550436e-07, + "loss": 0.0, + "step": 27857 + }, + { + "epoch": 11.328995526636843, + "grad_norm": 0.6281593015661273, + "learning_rate": 8.541629570016785e-07, + "loss": 0.0056, + "step": 27858 + }, + { + "epoch": 11.329402196014641, + "grad_norm": 0.0032483469904444643, + "learning_rate": 8.537532740349342e-07, + "loss": 0.0, + "step": 27859 + }, + { + "epoch": 11.329808865392437, + "grad_norm": 0.0008663739975411113, + "learning_rate": 8.533436849590138e-07, + "loss": 0.0, + "step": 27860 + }, + { + "epoch": 11.330215534770232, + "grad_norm": 0.008825670225770885, + "learning_rate": 8.529341897781252e-07, + "loss": 0.0001, + "step": 27861 + }, + { + "epoch": 11.330622204148028, + "grad_norm": 0.12323384824467377, + "learning_rate": 8.525247884964715e-07, + "loss": 0.0008, + "step": 27862 + }, + { + "epoch": 11.331028873525824, + "grad_norm": 0.0003691073520088933, + "learning_rate": 8.521154811182542e-07, + "loss": 0.0, + "step": 27863 + }, + { + "epoch": 11.33143554290362, + "grad_norm": 0.015180251954890978, + "learning_rate": 8.517062676476761e-07, + "loss": 0.0002, + "step": 27864 + }, + { + "epoch": 11.331842212281416, + "grad_norm": 0.004151806464187262, + "learning_rate": 8.512971480889365e-07, + "loss": 0.0, + "step": 27865 + }, + { + "epoch": 11.332248881659211, + "grad_norm": 0.00588379172638749, + "learning_rate": 8.50888122446235e-07, + "loss": 0.0, + "step": 27866 + }, + { + "epoch": 11.332655551037007, + "grad_norm": 0.0027971988218802074, + "learning_rate": 8.504791907237731e-07, + "loss": 0.0, + "step": 27867 + }, + { + "epoch": 11.333062220414803, + "grad_norm": 0.025550094293702783, + "learning_rate": 8.50070352925747e-07, + "loss": 0.0002, + "step": 27868 + }, 
+ { + "epoch": 11.333468889792599, + "grad_norm": 0.137395218351027, + "learning_rate": 8.496616090563548e-07, + "loss": 0.0014, + "step": 27869 + }, + { + "epoch": 11.333875559170394, + "grad_norm": 0.0108742358838297, + "learning_rate": 8.492529591197907e-07, + "loss": 0.0001, + "step": 27870 + }, + { + "epoch": 11.33428222854819, + "grad_norm": 0.004756959921359973, + "learning_rate": 8.488444031202503e-07, + "loss": 0.0, + "step": 27871 + }, + { + "epoch": 11.334688897925986, + "grad_norm": 0.056133307986938635, + "learning_rate": 8.484359410619281e-07, + "loss": 0.0004, + "step": 27872 + }, + { + "epoch": 11.335095567303782, + "grad_norm": 0.10146867014545574, + "learning_rate": 8.48027572949014e-07, + "loss": 0.0009, + "step": 27873 + }, + { + "epoch": 11.335502236681577, + "grad_norm": 0.004560844163651955, + "learning_rate": 8.476192987857057e-07, + "loss": 0.0, + "step": 27874 + }, + { + "epoch": 11.335908906059373, + "grad_norm": 0.0007243120950486828, + "learning_rate": 8.472111185761922e-07, + "loss": 0.0, + "step": 27875 + }, + { + "epoch": 11.336315575437169, + "grad_norm": 0.1333656279256014, + "learning_rate": 8.468030323246623e-07, + "loss": 0.0014, + "step": 27876 + }, + { + "epoch": 11.336722244814965, + "grad_norm": 0.0015173342790326988, + "learning_rate": 8.46395040035306e-07, + "loss": 0.0, + "step": 27877 + }, + { + "epoch": 11.33712891419276, + "grad_norm": 0.018563322950393552, + "learning_rate": 8.459871417123111e-07, + "loss": 0.0002, + "step": 27878 + }, + { + "epoch": 11.337535583570556, + "grad_norm": 0.004937686198278418, + "learning_rate": 8.455793373598642e-07, + "loss": 0.0, + "step": 27879 + }, + { + "epoch": 11.337942252948354, + "grad_norm": 0.023084936215152063, + "learning_rate": 8.451716269821541e-07, + "loss": 0.0002, + "step": 27880 + }, + { + "epoch": 11.33834892232615, + "grad_norm": 0.0006442786143797245, + "learning_rate": 8.447640105833654e-07, + "loss": 0.0, + "step": 27881 + }, + { + "epoch": 11.338755591703945, + "grad_norm": 0.003482312224317873, + "learning_rate": 8.443564881676813e-07, + "loss": 0.0, + "step": 27882 + }, + { + "epoch": 11.339162261081741, + "grad_norm": 0.00011961808045193309, + "learning_rate": 8.439490597392863e-07, + "loss": 0.0, + "step": 27883 + }, + { + "epoch": 11.339568930459537, + "grad_norm": 0.0019173824304987174, + "learning_rate": 8.435417253023614e-07, + "loss": 0.0, + "step": 27884 + }, + { + "epoch": 11.339975599837333, + "grad_norm": 0.005996262681886204, + "learning_rate": 8.431344848610901e-07, + "loss": 0.0001, + "step": 27885 + }, + { + "epoch": 11.340382269215128, + "grad_norm": 0.09827345361448246, + "learning_rate": 8.427273384196499e-07, + "loss": 0.0008, + "step": 27886 + }, + { + "epoch": 11.340788938592924, + "grad_norm": 0.08548468568985015, + "learning_rate": 8.423202859822233e-07, + "loss": 0.0007, + "step": 27887 + }, + { + "epoch": 11.34119560797072, + "grad_norm": 0.03283445704987814, + "learning_rate": 8.41913327552989e-07, + "loss": 0.0003, + "step": 27888 + }, + { + "epoch": 11.341602277348516, + "grad_norm": 0.03270421841596413, + "learning_rate": 8.415064631361225e-07, + "loss": 0.0003, + "step": 27889 + }, + { + "epoch": 11.342008946726311, + "grad_norm": 0.0008865481844665957, + "learning_rate": 8.410996927358028e-07, + "loss": 0.0, + "step": 27890 + }, + { + "epoch": 11.342415616104107, + "grad_norm": 0.0432266884877905, + "learning_rate": 8.406930163562032e-07, + "loss": 0.0004, + "step": 27891 + }, + { + "epoch": 11.342822285481903, + "grad_norm": 0.003792571989340772, + 
"learning_rate": 8.402864340014982e-07, + "loss": 0.0, + "step": 27892 + }, + { + "epoch": 11.343228954859699, + "grad_norm": 0.11097446455577427, + "learning_rate": 8.398799456758644e-07, + "loss": 0.0011, + "step": 27893 + }, + { + "epoch": 11.343635624237494, + "grad_norm": 0.0015665510322057455, + "learning_rate": 8.39473551383474e-07, + "loss": 0.0, + "step": 27894 + }, + { + "epoch": 11.34404229361529, + "grad_norm": 0.001289048617552251, + "learning_rate": 8.39067251128497e-07, + "loss": 0.0, + "step": 27895 + }, + { + "epoch": 11.344448962993086, + "grad_norm": 0.40588671493192746, + "learning_rate": 8.386610449151055e-07, + "loss": 0.0036, + "step": 27896 + }, + { + "epoch": 11.344855632370882, + "grad_norm": 0.024701065522049495, + "learning_rate": 8.382549327474698e-07, + "loss": 0.0001, + "step": 27897 + }, + { + "epoch": 11.345262301748678, + "grad_norm": 0.01330042645632544, + "learning_rate": 8.378489146297575e-07, + "loss": 0.0001, + "step": 27898 + }, + { + "epoch": 11.345668971126473, + "grad_norm": 0.001826879188604023, + "learning_rate": 8.374429905661363e-07, + "loss": 0.0, + "step": 27899 + }, + { + "epoch": 11.34607564050427, + "grad_norm": 0.0020272175411363757, + "learning_rate": 8.370371605607752e-07, + "loss": 0.0, + "step": 27900 + }, + { + "epoch": 11.346482309882067, + "grad_norm": 7.353878295672128e-05, + "learning_rate": 8.366314246178397e-07, + "loss": 0.0, + "step": 27901 + }, + { + "epoch": 11.346888979259862, + "grad_norm": 0.07529484095893837, + "learning_rate": 8.362257827414943e-07, + "loss": 0.0009, + "step": 27902 + }, + { + "epoch": 11.347295648637658, + "grad_norm": 0.001279737258011903, + "learning_rate": 8.358202349359035e-07, + "loss": 0.0, + "step": 27903 + }, + { + "epoch": 11.347702318015454, + "grad_norm": 0.0012966694837562279, + "learning_rate": 8.354147812052305e-07, + "loss": 0.0, + "step": 27904 + }, + { + "epoch": 11.34810898739325, + "grad_norm": 0.06197513873304677, + "learning_rate": 8.350094215536353e-07, + "loss": 0.0005, + "step": 27905 + }, + { + "epoch": 11.348515656771045, + "grad_norm": 0.059093794853534096, + "learning_rate": 8.346041559852824e-07, + "loss": 0.0005, + "step": 27906 + }, + { + "epoch": 11.348922326148841, + "grad_norm": 0.004053561449975298, + "learning_rate": 8.341989845043308e-07, + "loss": 0.0001, + "step": 27907 + }, + { + "epoch": 11.349328995526637, + "grad_norm": 0.009102451294210652, + "learning_rate": 8.337939071149393e-07, + "loss": 0.0001, + "step": 27908 + }, + { + "epoch": 11.349735664904433, + "grad_norm": 0.015964911480454237, + "learning_rate": 8.333889238212667e-07, + "loss": 0.0002, + "step": 27909 + }, + { + "epoch": 11.350142334282229, + "grad_norm": 0.00398542420150848, + "learning_rate": 8.329840346274709e-07, + "loss": 0.0, + "step": 27910 + }, + { + "epoch": 11.350549003660024, + "grad_norm": 0.0028750511302784443, + "learning_rate": 8.325792395377075e-07, + "loss": 0.0, + "step": 27911 + }, + { + "epoch": 11.35095567303782, + "grad_norm": 0.00012487701261946856, + "learning_rate": 8.321745385561298e-07, + "loss": 0.0, + "step": 27912 + }, + { + "epoch": 11.351362342415616, + "grad_norm": 0.007436536698433159, + "learning_rate": 8.317699316868966e-07, + "loss": 0.0001, + "step": 27913 + }, + { + "epoch": 11.351769011793412, + "grad_norm": 0.7175459172293215, + "learning_rate": 8.313654189341591e-07, + "loss": 0.0063, + "step": 27914 + }, + { + "epoch": 11.352175681171207, + "grad_norm": 0.07640995669405526, + "learning_rate": 8.309610003020707e-07, + "loss": 0.0008, + "step": 27915 + 
}, + { + "epoch": 11.352582350549003, + "grad_norm": 0.011670781927036277, + "learning_rate": 8.305566757947814e-07, + "loss": 0.0001, + "step": 27916 + }, + { + "epoch": 11.352989019926799, + "grad_norm": 0.009337684327967331, + "learning_rate": 8.301524454164433e-07, + "loss": 0.0001, + "step": 27917 + }, + { + "epoch": 11.353395689304595, + "grad_norm": 0.010671222831241174, + "learning_rate": 8.297483091712033e-07, + "loss": 0.0, + "step": 27918 + }, + { + "epoch": 11.35380235868239, + "grad_norm": 0.01063706215173629, + "learning_rate": 8.293442670632146e-07, + "loss": 0.0001, + "step": 27919 + }, + { + "epoch": 11.354209028060186, + "grad_norm": 0.04136679903278343, + "learning_rate": 8.289403190966216e-07, + "loss": 0.0004, + "step": 27920 + }, + { + "epoch": 11.354615697437984, + "grad_norm": 0.012811861548026054, + "learning_rate": 8.285364652755734e-07, + "loss": 0.0001, + "step": 27921 + }, + { + "epoch": 11.35502236681578, + "grad_norm": 0.018758625443390773, + "learning_rate": 8.281327056042132e-07, + "loss": 0.0002, + "step": 27922 + }, + { + "epoch": 11.355429036193575, + "grad_norm": 0.04307687528515635, + "learning_rate": 8.277290400866866e-07, + "loss": 0.0005, + "step": 27923 + }, + { + "epoch": 11.355835705571371, + "grad_norm": 0.004352088558241807, + "learning_rate": 8.273254687271381e-07, + "loss": 0.0, + "step": 27924 + }, + { + "epoch": 11.356242374949167, + "grad_norm": 0.011788272960230618, + "learning_rate": 8.269219915297089e-07, + "loss": 0.0001, + "step": 27925 + }, + { + "epoch": 11.356649044326963, + "grad_norm": 0.00015348480457914984, + "learning_rate": 8.265186084985433e-07, + "loss": 0.0, + "step": 27926 + }, + { + "epoch": 11.357055713704758, + "grad_norm": 0.002636495476362485, + "learning_rate": 8.261153196377814e-07, + "loss": 0.0, + "step": 27927 + }, + { + "epoch": 11.357462383082554, + "grad_norm": 9.487031209114965e-05, + "learning_rate": 8.25712124951562e-07, + "loss": 0.0, + "step": 27928 + }, + { + "epoch": 11.35786905246035, + "grad_norm": 0.12967537330510312, + "learning_rate": 8.253090244440265e-07, + "loss": 0.0007, + "step": 27929 + }, + { + "epoch": 11.358275721838146, + "grad_norm": 0.006762911274671768, + "learning_rate": 8.249060181193103e-07, + "loss": 0.0001, + "step": 27930 + }, + { + "epoch": 11.358682391215941, + "grad_norm": 0.033506343719772, + "learning_rate": 8.245031059815511e-07, + "loss": 0.0004, + "step": 27931 + }, + { + "epoch": 11.359089060593737, + "grad_norm": 0.6391599455903484, + "learning_rate": 8.241002880348869e-07, + "loss": 0.0063, + "step": 27932 + }, + { + "epoch": 11.359495729971533, + "grad_norm": 0.00216260768967418, + "learning_rate": 8.23697564283451e-07, + "loss": 0.0, + "step": 27933 + }, + { + "epoch": 11.359902399349329, + "grad_norm": 0.07544797394545621, + "learning_rate": 8.232949347313768e-07, + "loss": 0.0005, + "step": 27934 + }, + { + "epoch": 11.360309068727124, + "grad_norm": 0.04751660738956945, + "learning_rate": 8.228923993827998e-07, + "loss": 0.0005, + "step": 27935 + }, + { + "epoch": 11.36071573810492, + "grad_norm": 0.0002761385523211734, + "learning_rate": 8.22489958241851e-07, + "loss": 0.0, + "step": 27936 + }, + { + "epoch": 11.361122407482716, + "grad_norm": 0.005057880176207064, + "learning_rate": 8.220876113126596e-07, + "loss": 0.0, + "step": 27937 + }, + { + "epoch": 11.361529076860512, + "grad_norm": 0.008853647708693471, + "learning_rate": 8.21685358599359e-07, + "loss": 0.0001, + "step": 27938 + }, + { + "epoch": 11.361935746238307, + "grad_norm": 0.0003529979763046704, 
+ "learning_rate": 8.212832001060788e-07, + "loss": 0.0, + "step": 27939 + }, + { + "epoch": 11.362342415616103, + "grad_norm": 0.00031177324283763303, + "learning_rate": 8.208811358369462e-07, + "loss": 0.0, + "step": 27940 + }, + { + "epoch": 11.3627490849939, + "grad_norm": 0.011668854904087034, + "learning_rate": 8.204791657960876e-07, + "loss": 0.0001, + "step": 27941 + }, + { + "epoch": 11.363155754371697, + "grad_norm": 4.3845831445569e-05, + "learning_rate": 8.200772899876308e-07, + "loss": 0.0, + "step": 27942 + }, + { + "epoch": 11.363562423749492, + "grad_norm": 0.0008938384181702839, + "learning_rate": 8.196755084156982e-07, + "loss": 0.0, + "step": 27943 + }, + { + "epoch": 11.363969093127288, + "grad_norm": 0.014248253727582871, + "learning_rate": 8.192738210844198e-07, + "loss": 0.0001, + "step": 27944 + }, + { + "epoch": 11.364375762505084, + "grad_norm": 0.01842497864633757, + "learning_rate": 8.188722279979155e-07, + "loss": 0.0002, + "step": 27945 + }, + { + "epoch": 11.36478243188288, + "grad_norm": 0.14820441113206226, + "learning_rate": 8.184707291603089e-07, + "loss": 0.0011, + "step": 27946 + }, + { + "epoch": 11.365189101260675, + "grad_norm": 0.016654146021175324, + "learning_rate": 8.180693245757209e-07, + "loss": 0.0002, + "step": 27947 + }, + { + "epoch": 11.365595770638471, + "grad_norm": 0.09433311072221648, + "learning_rate": 8.176680142482729e-07, + "loss": 0.0003, + "step": 27948 + }, + { + "epoch": 11.366002440016267, + "grad_norm": 0.00018270892154593457, + "learning_rate": 8.172667981820836e-07, + "loss": 0.0, + "step": 27949 + }, + { + "epoch": 11.366409109394063, + "grad_norm": 0.02956452485747363, + "learning_rate": 8.16865676381271e-07, + "loss": 0.0002, + "step": 27950 + }, + { + "epoch": 11.366815778771858, + "grad_norm": 0.01332851171297584, + "learning_rate": 8.16464648849955e-07, + "loss": 0.0001, + "step": 27951 + }, + { + "epoch": 11.367222448149654, + "grad_norm": 0.00809610113057215, + "learning_rate": 8.160637155922524e-07, + "loss": 0.0001, + "step": 27952 + }, + { + "epoch": 11.36762911752745, + "grad_norm": 0.144244306705411, + "learning_rate": 8.156628766122776e-07, + "loss": 0.0016, + "step": 27953 + }, + { + "epoch": 11.368035786905246, + "grad_norm": 4.323304275890141e-05, + "learning_rate": 8.152621319141452e-07, + "loss": 0.0, + "step": 27954 + }, + { + "epoch": 11.368442456283042, + "grad_norm": 0.038582428115642387, + "learning_rate": 8.148614815019696e-07, + "loss": 0.0006, + "step": 27955 + }, + { + "epoch": 11.368849125660837, + "grad_norm": 0.00029874978998949424, + "learning_rate": 8.144609253798619e-07, + "loss": 0.0, + "step": 27956 + }, + { + "epoch": 11.369255795038633, + "grad_norm": 0.013707557630233251, + "learning_rate": 8.140604635519378e-07, + "loss": 0.0002, + "step": 27957 + }, + { + "epoch": 11.369662464416429, + "grad_norm": 0.01569623450542522, + "learning_rate": 8.136600960223074e-07, + "loss": 0.0001, + "step": 27958 + }, + { + "epoch": 11.370069133794225, + "grad_norm": 0.140711675907966, + "learning_rate": 8.132598227950783e-07, + "loss": 0.0006, + "step": 27959 + }, + { + "epoch": 11.37047580317202, + "grad_norm": 1.1470400711362932e-05, + "learning_rate": 8.128596438743608e-07, + "loss": 0.0, + "step": 27960 + }, + { + "epoch": 11.370882472549816, + "grad_norm": 0.011279589518559474, + "learning_rate": 8.124595592642625e-07, + "loss": 0.0001, + "step": 27961 + }, + { + "epoch": 11.371289141927614, + "grad_norm": 0.003130170697513267, + "learning_rate": 8.120595689688914e-07, + "loss": 0.0, + "step": 
27962 + }, + { + "epoch": 11.37169581130541, + "grad_norm": 0.013317052012581123, + "learning_rate": 8.116596729923509e-07, + "loss": 0.0002, + "step": 27963 + }, + { + "epoch": 11.372102480683205, + "grad_norm": 0.02230722988055319, + "learning_rate": 8.112598713387498e-07, + "loss": 0.0002, + "step": 27964 + }, + { + "epoch": 11.372509150061001, + "grad_norm": 0.03535028371241457, + "learning_rate": 8.108601640121916e-07, + "loss": 0.0003, + "step": 27965 + }, + { + "epoch": 11.372915819438797, + "grad_norm": 0.00023123299272569188, + "learning_rate": 8.104605510167773e-07, + "loss": 0.0, + "step": 27966 + }, + { + "epoch": 11.373322488816592, + "grad_norm": 0.00813015110868441, + "learning_rate": 8.100610323566116e-07, + "loss": 0.0001, + "step": 27967 + }, + { + "epoch": 11.373729158194388, + "grad_norm": 0.00013422571063022564, + "learning_rate": 8.096616080357944e-07, + "loss": 0.0, + "step": 27968 + }, + { + "epoch": 11.374135827572184, + "grad_norm": 0.026186138905537247, + "learning_rate": 8.092622780584237e-07, + "loss": 0.0002, + "step": 27969 + }, + { + "epoch": 11.37454249694998, + "grad_norm": 0.00025782024303752584, + "learning_rate": 8.088630424286037e-07, + "loss": 0.0, + "step": 27970 + }, + { + "epoch": 11.374949166327776, + "grad_norm": 0.0005643134037709681, + "learning_rate": 8.084639011504303e-07, + "loss": 0.0, + "step": 27971 + }, + { + "epoch": 11.375355835705571, + "grad_norm": 0.0028368546430263446, + "learning_rate": 8.080648542280012e-07, + "loss": 0.0, + "step": 27972 + }, + { + "epoch": 11.375762505083367, + "grad_norm": 0.01225396344184908, + "learning_rate": 8.07665901665412e-07, + "loss": 0.0001, + "step": 27973 + }, + { + "epoch": 11.376169174461163, + "grad_norm": 0.0011295752628842125, + "learning_rate": 8.072670434667596e-07, + "loss": 0.0, + "step": 27974 + }, + { + "epoch": 11.376575843838959, + "grad_norm": 0.0302172093012494, + "learning_rate": 8.068682796361371e-07, + "loss": 0.0002, + "step": 27975 + }, + { + "epoch": 11.376982513216754, + "grad_norm": 0.005182926999499434, + "learning_rate": 8.06469610177636e-07, + "loss": 0.0, + "step": 27976 + }, + { + "epoch": 11.37738918259455, + "grad_norm": 0.003497701211340814, + "learning_rate": 8.060710350953537e-07, + "loss": 0.0, + "step": 27977 + }, + { + "epoch": 11.377795851972346, + "grad_norm": 0.008777177954327774, + "learning_rate": 8.056725543933796e-07, + "loss": 0.0001, + "step": 27978 + }, + { + "epoch": 11.378202521350142, + "grad_norm": 0.011841812951542734, + "learning_rate": 8.052741680758047e-07, + "loss": 0.0001, + "step": 27979 + }, + { + "epoch": 11.378609190727937, + "grad_norm": 0.0395487439653278, + "learning_rate": 8.048758761467168e-07, + "loss": 0.0002, + "step": 27980 + }, + { + "epoch": 11.379015860105733, + "grad_norm": 0.02285128454402995, + "learning_rate": 8.044776786102071e-07, + "loss": 0.0002, + "step": 27981 + }, + { + "epoch": 11.37942252948353, + "grad_norm": 0.05197513151920828, + "learning_rate": 8.040795754703601e-07, + "loss": 0.0005, + "step": 27982 + }, + { + "epoch": 11.379829198861326, + "grad_norm": 0.033319548992973014, + "learning_rate": 8.036815667312659e-07, + "loss": 0.0003, + "step": 27983 + }, + { + "epoch": 11.380235868239122, + "grad_norm": 0.07148217513967112, + "learning_rate": 8.032836523970078e-07, + "loss": 0.0007, + "step": 27984 + }, + { + "epoch": 11.380642537616918, + "grad_norm": 0.05725059663723029, + "learning_rate": 8.028858324716727e-07, + "loss": 0.0005, + "step": 27985 + }, + { + "epoch": 11.381049206994714, + "grad_norm": 
0.01806687513513796, + "learning_rate": 8.024881069593438e-07, + "loss": 0.0001, + "step": 27986 + }, + { + "epoch": 11.38145587637251, + "grad_norm": 0.06635398440290474, + "learning_rate": 8.020904758641023e-07, + "loss": 0.0006, + "step": 27987 + }, + { + "epoch": 11.381862545750305, + "grad_norm": 0.014966300638108384, + "learning_rate": 8.016929391900318e-07, + "loss": 0.0001, + "step": 27988 + }, + { + "epoch": 11.382269215128101, + "grad_norm": 0.0039826569804211715, + "learning_rate": 8.012954969412112e-07, + "loss": 0.0, + "step": 27989 + }, + { + "epoch": 11.382675884505897, + "grad_norm": 0.00014369758811101855, + "learning_rate": 8.008981491217238e-07, + "loss": 0.0, + "step": 27990 + }, + { + "epoch": 11.383082553883693, + "grad_norm": 0.008609977553708672, + "learning_rate": 8.005008957356464e-07, + "loss": 0.0001, + "step": 27991 + }, + { + "epoch": 11.383489223261488, + "grad_norm": 0.033364292266659316, + "learning_rate": 8.001037367870567e-07, + "loss": 0.0002, + "step": 27992 + }, + { + "epoch": 11.383895892639284, + "grad_norm": 0.00019967857552691425, + "learning_rate": 7.99706672280034e-07, + "loss": 0.0, + "step": 27993 + }, + { + "epoch": 11.38430256201708, + "grad_norm": 0.00030368831512420845, + "learning_rate": 7.993097022186513e-07, + "loss": 0.0, + "step": 27994 + }, + { + "epoch": 11.384709231394876, + "grad_norm": 0.1505235418068424, + "learning_rate": 7.989128266069834e-07, + "loss": 0.0009, + "step": 27995 + }, + { + "epoch": 11.385115900772671, + "grad_norm": 0.002058583008700809, + "learning_rate": 7.985160454491081e-07, + "loss": 0.0, + "step": 27996 + }, + { + "epoch": 11.385522570150467, + "grad_norm": 0.0038729961566453582, + "learning_rate": 7.981193587490977e-07, + "loss": 0.0, + "step": 27997 + }, + { + "epoch": 11.385929239528263, + "grad_norm": 0.011837216580277709, + "learning_rate": 7.977227665110221e-07, + "loss": 0.0001, + "step": 27998 + }, + { + "epoch": 11.386335908906059, + "grad_norm": 0.007455002449603412, + "learning_rate": 7.973262687389538e-07, + "loss": 0.0001, + "step": 27999 + }, + { + "epoch": 11.386742578283854, + "grad_norm": 0.0034500986112135233, + "learning_rate": 7.969298654369629e-07, + "loss": 0.0, + "step": 28000 + }, + { + "epoch": 11.38714924766165, + "grad_norm": 0.0037657090107802694, + "learning_rate": 7.965335566091193e-07, + "loss": 0.0, + "step": 28001 + }, + { + "epoch": 11.387555917039446, + "grad_norm": 0.0029877248060844737, + "learning_rate": 7.961373422594887e-07, + "loss": 0.0, + "step": 28002 + }, + { + "epoch": 11.387962586417244, + "grad_norm": 0.009524586238819955, + "learning_rate": 7.957412223921424e-07, + "loss": 0.0001, + "step": 28003 + }, + { + "epoch": 11.38836925579504, + "grad_norm": 1.3327009342528182e-05, + "learning_rate": 7.953451970111448e-07, + "loss": 0.0, + "step": 28004 + }, + { + "epoch": 11.388775925172835, + "grad_norm": 0.0024426476616776797, + "learning_rate": 7.949492661205604e-07, + "loss": 0.0, + "step": 28005 + }, + { + "epoch": 11.38918259455063, + "grad_norm": 0.00023113115395024253, + "learning_rate": 7.945534297244562e-07, + "loss": 0.0, + "step": 28006 + }, + { + "epoch": 11.389589263928427, + "grad_norm": 0.00024542972899086265, + "learning_rate": 7.94157687826893e-07, + "loss": 0.0, + "step": 28007 + }, + { + "epoch": 11.389995933306222, + "grad_norm": 0.009149830247502554, + "learning_rate": 7.937620404319324e-07, + "loss": 0.0001, + "step": 28008 + }, + { + "epoch": 11.390402602684018, + "grad_norm": 0.0009670320992114222, + "learning_rate": 7.933664875436409e-07, + 
"loss": 0.0, + "step": 28009 + }, + { + "epoch": 11.390809272061814, + "grad_norm": 0.00889404748120102, + "learning_rate": 7.929710291660742e-07, + "loss": 0.0001, + "step": 28010 + }, + { + "epoch": 11.39121594143961, + "grad_norm": 0.0012674692634992624, + "learning_rate": 7.925756653032957e-07, + "loss": 0.0, + "step": 28011 + }, + { + "epoch": 11.391622610817405, + "grad_norm": 0.010256288475462675, + "learning_rate": 7.921803959593599e-07, + "loss": 0.0001, + "step": 28012 + }, + { + "epoch": 11.392029280195201, + "grad_norm": 0.007764658931392658, + "learning_rate": 7.917852211383282e-07, + "loss": 0.0001, + "step": 28013 + }, + { + "epoch": 11.392435949572997, + "grad_norm": 0.0016116612410006263, + "learning_rate": 7.913901408442548e-07, + "loss": 0.0, + "step": 28014 + }, + { + "epoch": 11.392842618950793, + "grad_norm": 0.004013832984859312, + "learning_rate": 7.909951550811945e-07, + "loss": 0.0, + "step": 28015 + }, + { + "epoch": 11.393249288328589, + "grad_norm": 0.0029908397841039588, + "learning_rate": 7.906002638532062e-07, + "loss": 0.0, + "step": 28016 + }, + { + "epoch": 11.393655957706384, + "grad_norm": 0.0031711928273170605, + "learning_rate": 7.9020546716434e-07, + "loss": 0.0, + "step": 28017 + }, + { + "epoch": 11.39406262708418, + "grad_norm": 0.0001987770392444759, + "learning_rate": 7.898107650186504e-07, + "loss": 0.0, + "step": 28018 + }, + { + "epoch": 11.394469296461976, + "grad_norm": 0.0020255319992123566, + "learning_rate": 7.894161574201887e-07, + "loss": 0.0, + "step": 28019 + }, + { + "epoch": 11.394875965839772, + "grad_norm": 0.008201478342881733, + "learning_rate": 7.890216443730059e-07, + "loss": 0.0001, + "step": 28020 + }, + { + "epoch": 11.395282635217567, + "grad_norm": 0.035344272012912833, + "learning_rate": 7.8862722588115e-07, + "loss": 0.0003, + "step": 28021 + }, + { + "epoch": 11.395689304595363, + "grad_norm": 0.00025468383973482704, + "learning_rate": 7.882329019486735e-07, + "loss": 0.0, + "step": 28022 + }, + { + "epoch": 11.39609597397316, + "grad_norm": 0.0013488135926918196, + "learning_rate": 7.878386725796217e-07, + "loss": 0.0, + "step": 28023 + }, + { + "epoch": 11.396502643350956, + "grad_norm": 0.0005627375075652109, + "learning_rate": 7.874445377780426e-07, + "loss": 0.0, + "step": 28024 + }, + { + "epoch": 11.396909312728752, + "grad_norm": 0.0030366913970267845, + "learning_rate": 7.870504975479831e-07, + "loss": 0.0, + "step": 28025 + }, + { + "epoch": 11.397315982106548, + "grad_norm": 0.00034218960932067433, + "learning_rate": 7.866565518934855e-07, + "loss": 0.0, + "step": 28026 + }, + { + "epoch": 11.397722651484344, + "grad_norm": 0.028950133572114675, + "learning_rate": 7.862627008185964e-07, + "loss": 0.0003, + "step": 28027 + }, + { + "epoch": 11.39812932086214, + "grad_norm": 4.3006027015350834e-05, + "learning_rate": 7.858689443273548e-07, + "loss": 0.0, + "step": 28028 + }, + { + "epoch": 11.398535990239935, + "grad_norm": 0.005883333612506664, + "learning_rate": 7.85475282423811e-07, + "loss": 0.0001, + "step": 28029 + }, + { + "epoch": 11.398942659617731, + "grad_norm": 0.0011004662042257921, + "learning_rate": 7.850817151119983e-07, + "loss": 0.0, + "step": 28030 + }, + { + "epoch": 11.399349328995527, + "grad_norm": 0.01950964139126283, + "learning_rate": 7.84688242395959e-07, + "loss": 0.0001, + "step": 28031 + }, + { + "epoch": 11.399755998373323, + "grad_norm": 0.002220627586304192, + "learning_rate": 7.842948642797332e-07, + "loss": 0.0, + "step": 28032 + }, + { + "epoch": 11.400162667751118, + 
"grad_norm": 0.09518078922310552, + "learning_rate": 7.839015807673578e-07, + "loss": 0.0007, + "step": 28033 + }, + { + "epoch": 11.400569337128914, + "grad_norm": 0.06667555122849542, + "learning_rate": 7.835083918628727e-07, + "loss": 0.0002, + "step": 28034 + }, + { + "epoch": 11.40097600650671, + "grad_norm": 0.07294938091097088, + "learning_rate": 7.831152975703116e-07, + "loss": 0.0007, + "step": 28035 + }, + { + "epoch": 11.401382675884506, + "grad_norm": 0.017915922467932537, + "learning_rate": 7.827222978937121e-07, + "loss": 0.0001, + "step": 28036 + }, + { + "epoch": 11.401789345262301, + "grad_norm": 0.0032050163987890168, + "learning_rate": 7.823293928371056e-07, + "loss": 0.0, + "step": 28037 + }, + { + "epoch": 11.402196014640097, + "grad_norm": 0.22387565709397253, + "learning_rate": 7.819365824045277e-07, + "loss": 0.0021, + "step": 28038 + }, + { + "epoch": 11.402602684017893, + "grad_norm": 0.00390794402107948, + "learning_rate": 7.815438666000108e-07, + "loss": 0.0, + "step": 28039 + }, + { + "epoch": 11.403009353395689, + "grad_norm": 0.08442088689868549, + "learning_rate": 7.811512454275826e-07, + "loss": 0.001, + "step": 28040 + }, + { + "epoch": 11.403416022773484, + "grad_norm": 0.1789564510340528, + "learning_rate": 7.807587188912802e-07, + "loss": 0.0023, + "step": 28041 + }, + { + "epoch": 11.40382269215128, + "grad_norm": 0.005290856954003498, + "learning_rate": 7.803662869951278e-07, + "loss": 0.0, + "step": 28042 + }, + { + "epoch": 11.404229361529076, + "grad_norm": 0.0009565189690651913, + "learning_rate": 7.799739497431568e-07, + "loss": 0.0, + "step": 28043 + }, + { + "epoch": 11.404636030906874, + "grad_norm": 0.0011467307490984249, + "learning_rate": 7.79581707139393e-07, + "loss": 0.0, + "step": 28044 + }, + { + "epoch": 11.40504270028467, + "grad_norm": 0.13564285218046568, + "learning_rate": 7.791895591878629e-07, + "loss": 0.0013, + "step": 28045 + }, + { + "epoch": 11.405449369662465, + "grad_norm": 0.0012071726287348904, + "learning_rate": 7.787975058925912e-07, + "loss": 0.0, + "step": 28046 + }, + { + "epoch": 11.40585603904026, + "grad_norm": 0.2903435193032478, + "learning_rate": 7.784055472576069e-07, + "loss": 0.0033, + "step": 28047 + }, + { + "epoch": 11.406262708418057, + "grad_norm": 0.003589317928309434, + "learning_rate": 7.780136832869301e-07, + "loss": 0.0, + "step": 28048 + }, + { + "epoch": 11.406669377795852, + "grad_norm": 0.1463347238051059, + "learning_rate": 7.776219139845842e-07, + "loss": 0.0016, + "step": 28049 + }, + { + "epoch": 11.407076047173648, + "grad_norm": 0.003028458514751987, + "learning_rate": 7.772302393545905e-07, + "loss": 0.0, + "step": 28050 + }, + { + "epoch": 11.407482716551444, + "grad_norm": 0.00872674583162325, + "learning_rate": 7.768386594009702e-07, + "loss": 0.0001, + "step": 28051 + }, + { + "epoch": 11.40788938592924, + "grad_norm": 0.0011264986583213032, + "learning_rate": 7.764471741277435e-07, + "loss": 0.0, + "step": 28052 + }, + { + "epoch": 11.408296055307035, + "grad_norm": 0.00023320167415707483, + "learning_rate": 7.76055783538926e-07, + "loss": 0.0, + "step": 28053 + }, + { + "epoch": 11.408702724684831, + "grad_norm": 0.0031734040244715046, + "learning_rate": 7.75664487638541e-07, + "loss": 0.0, + "step": 28054 + }, + { + "epoch": 11.409109394062627, + "grad_norm": 0.027772402168977608, + "learning_rate": 7.752732864306023e-07, + "loss": 0.0001, + "step": 28055 + }, + { + "epoch": 11.409516063440423, + "grad_norm": 0.011191462474216552, + "learning_rate": 7.748821799191253e-07, + 
"loss": 0.0001, + "step": 28056 + }, + { + "epoch": 11.409922732818218, + "grad_norm": 1.104845844920719, + "learning_rate": 7.744911681081268e-07, + "loss": 0.0102, + "step": 28057 + }, + { + "epoch": 11.410329402196014, + "grad_norm": 0.02579253168985173, + "learning_rate": 7.741002510016182e-07, + "loss": 0.0003, + "step": 28058 + }, + { + "epoch": 11.41073607157381, + "grad_norm": 0.0029131537834994987, + "learning_rate": 7.737094286036128e-07, + "loss": 0.0, + "step": 28059 + }, + { + "epoch": 11.411142740951606, + "grad_norm": 0.01127195732081821, + "learning_rate": 7.733187009181253e-07, + "loss": 0.0001, + "step": 28060 + }, + { + "epoch": 11.411549410329402, + "grad_norm": 0.007704647106181623, + "learning_rate": 7.729280679491657e-07, + "loss": 0.0, + "step": 28061 + }, + { + "epoch": 11.411956079707197, + "grad_norm": 0.0002235257558133611, + "learning_rate": 7.72537529700742e-07, + "loss": 0.0, + "step": 28062 + }, + { + "epoch": 11.412362749084993, + "grad_norm": 0.048092705832276615, + "learning_rate": 7.721470861768654e-07, + "loss": 0.0003, + "step": 28063 + }, + { + "epoch": 11.41276941846279, + "grad_norm": 0.0063261428381160955, + "learning_rate": 7.717567373815438e-07, + "loss": 0.0, + "step": 28064 + }, + { + "epoch": 11.413176087840586, + "grad_norm": 0.0065188421985981375, + "learning_rate": 7.713664833187829e-07, + "loss": 0.0001, + "step": 28065 + }, + { + "epoch": 11.413582757218382, + "grad_norm": 0.0002248599289608929, + "learning_rate": 7.709763239925882e-07, + "loss": 0.0, + "step": 28066 + }, + { + "epoch": 11.413989426596178, + "grad_norm": 0.0987546948808655, + "learning_rate": 7.70586259406968e-07, + "loss": 0.0013, + "step": 28067 + }, + { + "epoch": 11.414396095973974, + "grad_norm": 0.015906498089440547, + "learning_rate": 7.701962895659254e-07, + "loss": 0.0001, + "step": 28068 + }, + { + "epoch": 11.41480276535177, + "grad_norm": 0.007953974278589722, + "learning_rate": 7.698064144734629e-07, + "loss": 0.0001, + "step": 28069 + }, + { + "epoch": 11.415209434729565, + "grad_norm": 0.026251680875666393, + "learning_rate": 7.694166341335829e-07, + "loss": 0.0001, + "step": 28070 + }, + { + "epoch": 11.415616104107361, + "grad_norm": 0.06376111182807272, + "learning_rate": 7.690269485502866e-07, + "loss": 0.0007, + "step": 28071 + }, + { + "epoch": 11.416022773485157, + "grad_norm": 0.07187989889690027, + "learning_rate": 7.686373577275741e-07, + "loss": 0.0006, + "step": 28072 + }, + { + "epoch": 11.416429442862952, + "grad_norm": 1.057617739461982, + "learning_rate": 7.682478616694455e-07, + "loss": 0.0058, + "step": 28073 + }, + { + "epoch": 11.416836112240748, + "grad_norm": 0.004295273457084995, + "learning_rate": 7.678584603798989e-07, + "loss": 0.0, + "step": 28074 + }, + { + "epoch": 11.417242781618544, + "grad_norm": 0.010029544004577142, + "learning_rate": 7.674691538629319e-07, + "loss": 0.0001, + "step": 28075 + }, + { + "epoch": 11.41764945099634, + "grad_norm": 0.0015146473675230559, + "learning_rate": 7.670799421225406e-07, + "loss": 0.0, + "step": 28076 + }, + { + "epoch": 11.418056120374136, + "grad_norm": 0.005082722058587741, + "learning_rate": 7.666908251627215e-07, + "loss": 0.0, + "step": 28077 + }, + { + "epoch": 11.418462789751931, + "grad_norm": 0.025937256344833105, + "learning_rate": 7.663018029874669e-07, + "loss": 0.0003, + "step": 28078 + }, + { + "epoch": 11.418869459129727, + "grad_norm": 0.014700191276206802, + "learning_rate": 7.659128756007706e-07, + "loss": 0.0001, + "step": 28079 + }, + { + "epoch": 11.419276128507523, 
+ "grad_norm": 0.0025318266781922663, + "learning_rate": 7.655240430066269e-07, + "loss": 0.0, + "step": 28080 + }, + { + "epoch": 11.419682797885319, + "grad_norm": 0.030615765782974813, + "learning_rate": 7.651353052090271e-07, + "loss": 0.0001, + "step": 28081 + }, + { + "epoch": 11.420089467263114, + "grad_norm": 0.008000293049994365, + "learning_rate": 7.647466622119614e-07, + "loss": 0.0001, + "step": 28082 + }, + { + "epoch": 11.42049613664091, + "grad_norm": 0.03626428213367087, + "learning_rate": 7.643581140194189e-07, + "loss": 0.0005, + "step": 28083 + }, + { + "epoch": 11.420902806018706, + "grad_norm": 1.5283325086168772e-05, + "learning_rate": 7.639696606353886e-07, + "loss": 0.0, + "step": 28084 + }, + { + "epoch": 11.421309475396503, + "grad_norm": 0.006544194355835116, + "learning_rate": 7.63581302063856e-07, + "loss": 0.0, + "step": 28085 + }, + { + "epoch": 11.4217161447743, + "grad_norm": 0.004277305651102915, + "learning_rate": 7.631930383088115e-07, + "loss": 0.0, + "step": 28086 + }, + { + "epoch": 11.422122814152095, + "grad_norm": 0.10803459895285042, + "learning_rate": 7.628048693742407e-07, + "loss": 0.0006, + "step": 28087 + }, + { + "epoch": 11.42252948352989, + "grad_norm": 1.1932882391064967e-05, + "learning_rate": 7.624167952641259e-07, + "loss": 0.0, + "step": 28088 + }, + { + "epoch": 11.422936152907686, + "grad_norm": 0.23232094303138698, + "learning_rate": 7.620288159824518e-07, + "loss": 0.0023, + "step": 28089 + }, + { + "epoch": 11.423342822285482, + "grad_norm": 0.043715161442483816, + "learning_rate": 7.616409315332008e-07, + "loss": 0.0003, + "step": 28090 + }, + { + "epoch": 11.423749491663278, + "grad_norm": 0.057284710291723946, + "learning_rate": 7.612531419203561e-07, + "loss": 0.0007, + "step": 28091 + }, + { + "epoch": 11.424156161041074, + "grad_norm": 0.10069693295857343, + "learning_rate": 7.60865447147896e-07, + "loss": 0.0006, + "step": 28092 + }, + { + "epoch": 11.42456283041887, + "grad_norm": 0.014983322800497531, + "learning_rate": 7.604778472198027e-07, + "loss": 0.0001, + "step": 28093 + }, + { + "epoch": 11.424969499796665, + "grad_norm": 0.01497419876022916, + "learning_rate": 7.600903421400551e-07, + "loss": 0.0002, + "step": 28094 + }, + { + "epoch": 11.425376169174461, + "grad_norm": 0.0015556597522622484, + "learning_rate": 7.597029319126315e-07, + "loss": 0.0, + "step": 28095 + }, + { + "epoch": 11.425782838552257, + "grad_norm": 0.009097417576437365, + "learning_rate": 7.593156165415061e-07, + "loss": 0.0001, + "step": 28096 + }, + { + "epoch": 11.426189507930053, + "grad_norm": 0.002088450686125667, + "learning_rate": 7.589283960306582e-07, + "loss": 0.0, + "step": 28097 + }, + { + "epoch": 11.426596177307848, + "grad_norm": 0.0017295208365490867, + "learning_rate": 7.585412703840589e-07, + "loss": 0.0, + "step": 28098 + }, + { + "epoch": 11.427002846685644, + "grad_norm": 0.011363621121342423, + "learning_rate": 7.581542396056863e-07, + "loss": 0.0001, + "step": 28099 + }, + { + "epoch": 11.42740951606344, + "grad_norm": 4.224177764154443e-05, + "learning_rate": 7.577673036995114e-07, + "loss": 0.0, + "step": 28100 + }, + { + "epoch": 11.427816185441236, + "grad_norm": 0.06831287221980871, + "learning_rate": 7.573804626695069e-07, + "loss": 0.0007, + "step": 28101 + }, + { + "epoch": 11.428222854819031, + "grad_norm": 0.002647910689916682, + "learning_rate": 7.569937165196428e-07, + "loss": 0.0, + "step": 28102 + }, + { + "epoch": 11.428629524196827, + "grad_norm": 0.09473042333906936, + "learning_rate": 
7.566070652538914e-07, + "loss": 0.0012, + "step": 28103 + }, + { + "epoch": 11.429036193574623, + "grad_norm": 0.04379178663743144, + "learning_rate": 7.562205088762198e-07, + "loss": 0.0002, + "step": 28104 + }, + { + "epoch": 11.42944286295242, + "grad_norm": 0.005549131830549263, + "learning_rate": 7.558340473905957e-07, + "loss": 0.0001, + "step": 28105 + }, + { + "epoch": 11.429849532330216, + "grad_norm": 0.017357289183101185, + "learning_rate": 7.554476808009891e-07, + "loss": 0.0002, + "step": 28106 + }, + { + "epoch": 11.430256201708012, + "grad_norm": 0.03761292801939226, + "learning_rate": 7.550614091113651e-07, + "loss": 0.0002, + "step": 28107 + }, + { + "epoch": 11.430662871085808, + "grad_norm": 5.345541353371313e-06, + "learning_rate": 7.546752323256889e-07, + "loss": 0.0, + "step": 28108 + }, + { + "epoch": 11.431069540463604, + "grad_norm": 0.3804407150339145, + "learning_rate": 7.542891504479233e-07, + "loss": 0.0031, + "step": 28109 + }, + { + "epoch": 11.4314762098414, + "grad_norm": 0.0001484578821196885, + "learning_rate": 7.539031634820338e-07, + "loss": 0.0, + "step": 28110 + }, + { + "epoch": 11.431882879219195, + "grad_norm": 0.03546112855746146, + "learning_rate": 7.535172714319794e-07, + "loss": 0.0003, + "step": 28111 + }, + { + "epoch": 11.43228954859699, + "grad_norm": 0.007247126388355828, + "learning_rate": 7.531314743017259e-07, + "loss": 0.0001, + "step": 28112 + }, + { + "epoch": 11.432696217974787, + "grad_norm": 0.0008661615988902643, + "learning_rate": 7.527457720952325e-07, + "loss": 0.0, + "step": 28113 + }, + { + "epoch": 11.433102887352582, + "grad_norm": 0.0010444647376406679, + "learning_rate": 7.523601648164569e-07, + "loss": 0.0, + "step": 28114 + }, + { + "epoch": 11.433509556730378, + "grad_norm": 0.04309529328071427, + "learning_rate": 7.519746524693583e-07, + "loss": 0.0004, + "step": 28115 + }, + { + "epoch": 11.433916226108174, + "grad_norm": 0.004088440256767064, + "learning_rate": 7.515892350578946e-07, + "loss": 0.0, + "step": 28116 + }, + { + "epoch": 11.43432289548597, + "grad_norm": 0.011766507926098542, + "learning_rate": 7.512039125860226e-07, + "loss": 0.0001, + "step": 28117 + }, + { + "epoch": 11.434729564863765, + "grad_norm": 0.0005874546920100822, + "learning_rate": 7.508186850576948e-07, + "loss": 0.0, + "step": 28118 + }, + { + "epoch": 11.435136234241561, + "grad_norm": 0.002951893068911184, + "learning_rate": 7.504335524768702e-07, + "loss": 0.0, + "step": 28119 + }, + { + "epoch": 11.435542903619357, + "grad_norm": 0.00037999449474985717, + "learning_rate": 7.500485148475001e-07, + "loss": 0.0, + "step": 28120 + }, + { + "epoch": 11.435949572997153, + "grad_norm": 0.0019473966395577983, + "learning_rate": 7.496635721735379e-07, + "loss": 0.0, + "step": 28121 + }, + { + "epoch": 11.436356242374949, + "grad_norm": 0.07615417098398243, + "learning_rate": 7.492787244589339e-07, + "loss": 0.0007, + "step": 28122 + }, + { + "epoch": 11.436762911752744, + "grad_norm": 0.022002071124253102, + "learning_rate": 7.488939717076404e-07, + "loss": 0.0002, + "step": 28123 + }, + { + "epoch": 11.43716958113054, + "grad_norm": 0.00596951260426298, + "learning_rate": 7.485093139236044e-07, + "loss": 0.0001, + "step": 28124 + }, + { + "epoch": 11.437576250508336, + "grad_norm": 0.004406489475338647, + "learning_rate": 7.481247511107781e-07, + "loss": 0.0, + "step": 28125 + }, + { + "epoch": 11.437982919886133, + "grad_norm": 0.38907325244209456, + "learning_rate": 7.477402832731073e-07, + "loss": 0.0027, + "step": 28126 + }, + { + 
"epoch": 11.43838958926393, + "grad_norm": 0.001804820662180627, + "learning_rate": 7.47355910414539e-07, + "loss": 0.0, + "step": 28127 + }, + { + "epoch": 11.438796258641725, + "grad_norm": 0.10217205532644189, + "learning_rate": 7.469716325390197e-07, + "loss": 0.001, + "step": 28128 + }, + { + "epoch": 11.43920292801952, + "grad_norm": 0.02003248348061231, + "learning_rate": 7.465874496504944e-07, + "loss": 0.0002, + "step": 28129 + }, + { + "epoch": 11.439609597397316, + "grad_norm": 0.0017608482760322413, + "learning_rate": 7.46203361752904e-07, + "loss": 0.0, + "step": 28130 + }, + { + "epoch": 11.440016266775112, + "grad_norm": 0.238599052208538, + "learning_rate": 7.458193688501935e-07, + "loss": 0.0013, + "step": 28131 + }, + { + "epoch": 11.440422936152908, + "grad_norm": 0.012625575460121913, + "learning_rate": 7.454354709463063e-07, + "loss": 0.0001, + "step": 28132 + }, + { + "epoch": 11.440829605530704, + "grad_norm": 0.0005337788452916709, + "learning_rate": 7.450516680451803e-07, + "loss": 0.0, + "step": 28133 + }, + { + "epoch": 11.4412362749085, + "grad_norm": 0.02890006534216712, + "learning_rate": 7.446679601507579e-07, + "loss": 0.0002, + "step": 28134 + }, + { + "epoch": 11.441642944286295, + "grad_norm": 0.003908605975005849, + "learning_rate": 7.442843472669769e-07, + "loss": 0.0, + "step": 28135 + }, + { + "epoch": 11.442049613664091, + "grad_norm": 0.1606154615497145, + "learning_rate": 7.439008293977734e-07, + "loss": 0.0014, + "step": 28136 + }, + { + "epoch": 11.442456283041887, + "grad_norm": 0.167917437014758, + "learning_rate": 7.435174065470873e-07, + "loss": 0.0014, + "step": 28137 + }, + { + "epoch": 11.442862952419683, + "grad_norm": 0.06665676346645998, + "learning_rate": 7.431340787188546e-07, + "loss": 0.0002, + "step": 28138 + }, + { + "epoch": 11.443269621797478, + "grad_norm": 0.0006281310428351181, + "learning_rate": 7.427508459170096e-07, + "loss": 0.0, + "step": 28139 + }, + { + "epoch": 11.443676291175274, + "grad_norm": 0.0018340969595736032, + "learning_rate": 7.423677081454861e-07, + "loss": 0.0, + "step": 28140 + }, + { + "epoch": 11.44408296055307, + "grad_norm": 0.04282723450272181, + "learning_rate": 7.419846654082163e-07, + "loss": 0.0004, + "step": 28141 + }, + { + "epoch": 11.444489629930866, + "grad_norm": 0.00014184414336358215, + "learning_rate": 7.41601717709135e-07, + "loss": 0.0, + "step": 28142 + }, + { + "epoch": 11.444896299308661, + "grad_norm": 0.07872258934556481, + "learning_rate": 7.41218865052169e-07, + "loss": 0.0007, + "step": 28143 + }, + { + "epoch": 11.445302968686457, + "grad_norm": 6.205017636324887e-05, + "learning_rate": 7.408361074412529e-07, + "loss": 0.0, + "step": 28144 + }, + { + "epoch": 11.445709638064255, + "grad_norm": 0.00012603937918320896, + "learning_rate": 7.404534448803147e-07, + "loss": 0.0, + "step": 28145 + }, + { + "epoch": 11.44611630744205, + "grad_norm": 0.06270368541473341, + "learning_rate": 7.400708773732823e-07, + "loss": 0.0006, + "step": 28146 + }, + { + "epoch": 11.446522976819846, + "grad_norm": 0.024339857629285578, + "learning_rate": 7.396884049240826e-07, + "loss": 0.0002, + "step": 28147 + }, + { + "epoch": 11.446929646197642, + "grad_norm": 0.00431891223736226, + "learning_rate": 7.393060275366415e-07, + "loss": 0.0, + "step": 28148 + }, + { + "epoch": 11.447336315575438, + "grad_norm": 0.17169877561504884, + "learning_rate": 7.389237452148835e-07, + "loss": 0.0013, + "step": 28149 + }, + { + "epoch": 11.447742984953234, + "grad_norm": 0.01756153533413226, + "learning_rate": 
7.385415579627353e-07, + "loss": 0.0001, + "step": 28150 + }, + { + "epoch": 11.44814965433103, + "grad_norm": 0.009744381362280256, + "learning_rate": 7.381594657841206e-07, + "loss": 0.0001, + "step": 28151 + }, + { + "epoch": 11.448556323708825, + "grad_norm": 0.02734786106227527, + "learning_rate": 7.377774686829598e-07, + "loss": 0.0003, + "step": 28152 + }, + { + "epoch": 11.44896299308662, + "grad_norm": 0.08303991912581722, + "learning_rate": 7.373955666631738e-07, + "loss": 0.001, + "step": 28153 + }, + { + "epoch": 11.449369662464417, + "grad_norm": 0.006637842082942048, + "learning_rate": 7.370137597286852e-07, + "loss": 0.0001, + "step": 28154 + }, + { + "epoch": 11.449776331842212, + "grad_norm": 0.02711182526027063, + "learning_rate": 7.366320478834121e-07, + "loss": 0.0002, + "step": 28155 + }, + { + "epoch": 11.450183001220008, + "grad_norm": 0.05479576039012088, + "learning_rate": 7.362504311312713e-07, + "loss": 0.0003, + "step": 28156 + }, + { + "epoch": 11.450589670597804, + "grad_norm": 4.890185496295571e-05, + "learning_rate": 7.358689094761839e-07, + "loss": 0.0, + "step": 28157 + }, + { + "epoch": 11.4509963399756, + "grad_norm": 0.0019230950652774613, + "learning_rate": 7.354874829220648e-07, + "loss": 0.0, + "step": 28158 + }, + { + "epoch": 11.451403009353395, + "grad_norm": 0.009989982425036946, + "learning_rate": 7.351061514728297e-07, + "loss": 0.0001, + "step": 28159 + }, + { + "epoch": 11.451809678731191, + "grad_norm": 0.003968750661878995, + "learning_rate": 7.347249151323921e-07, + "loss": 0.0, + "step": 28160 + }, + { + "epoch": 11.452216348108987, + "grad_norm": 0.0018121266219692305, + "learning_rate": 7.343437739046677e-07, + "loss": 0.0, + "step": 28161 + }, + { + "epoch": 11.452623017486783, + "grad_norm": 0.015422567729090468, + "learning_rate": 7.339627277935646e-07, + "loss": 0.0001, + "step": 28162 + }, + { + "epoch": 11.453029686864578, + "grad_norm": 0.08129325790641255, + "learning_rate": 7.335817768030007e-07, + "loss": 0.0007, + "step": 28163 + }, + { + "epoch": 11.453436356242374, + "grad_norm": 0.002012604883895373, + "learning_rate": 7.332009209368817e-07, + "loss": 0.0, + "step": 28164 + }, + { + "epoch": 11.45384302562017, + "grad_norm": 0.020691295243673302, + "learning_rate": 7.328201601991203e-07, + "loss": 0.0002, + "step": 28165 + }, + { + "epoch": 11.454249694997966, + "grad_norm": 0.001563783534681846, + "learning_rate": 7.324394945936242e-07, + "loss": 0.0, + "step": 28166 + }, + { + "epoch": 11.454656364375763, + "grad_norm": 0.009475716057525538, + "learning_rate": 7.320589241243015e-07, + "loss": 0.0001, + "step": 28167 + }, + { + "epoch": 11.455063033753559, + "grad_norm": 2.2455229167540722e-05, + "learning_rate": 7.316784487950568e-07, + "loss": 0.0, + "step": 28168 + }, + { + "epoch": 11.455469703131355, + "grad_norm": 0.005160814874502971, + "learning_rate": 7.31298068609797e-07, + "loss": 0.0, + "step": 28169 + }, + { + "epoch": 11.45587637250915, + "grad_norm": 0.0007802488866284464, + "learning_rate": 7.309177835724279e-07, + "loss": 0.0, + "step": 28170 + }, + { + "epoch": 11.456283041886946, + "grad_norm": 0.09371325801082103, + "learning_rate": 7.305375936868542e-07, + "loss": 0.0007, + "step": 28171 + }, + { + "epoch": 11.456689711264742, + "grad_norm": 0.00980902940018169, + "learning_rate": 7.301574989569759e-07, + "loss": 0.0001, + "step": 28172 + }, + { + "epoch": 11.457096380642538, + "grad_norm": 0.00836434342483553, + "learning_rate": 7.29777499386698e-07, + "loss": 0.0, + "step": 28173 + }, + { + "epoch": 
11.457503050020334, + "grad_norm": 0.006629779450421286, + "learning_rate": 7.293975949799181e-07, + "loss": 0.0001, + "step": 28174 + }, + { + "epoch": 11.45790971939813, + "grad_norm": 0.002572609599593138, + "learning_rate": 7.290177857405367e-07, + "loss": 0.0, + "step": 28175 + }, + { + "epoch": 11.458316388775925, + "grad_norm": 0.05364612363050374, + "learning_rate": 7.28638071672455e-07, + "loss": 0.0002, + "step": 28176 + }, + { + "epoch": 11.458723058153721, + "grad_norm": 0.003589792077487234, + "learning_rate": 7.2825845277957e-07, + "loss": 0.0, + "step": 28177 + }, + { + "epoch": 11.459129727531517, + "grad_norm": 0.04946831487604276, + "learning_rate": 7.278789290657784e-07, + "loss": 0.0005, + "step": 28178 + }, + { + "epoch": 11.459536396909312, + "grad_norm": 0.015417829414225549, + "learning_rate": 7.274995005349761e-07, + "loss": 0.0002, + "step": 28179 + }, + { + "epoch": 11.459943066287108, + "grad_norm": 0.010539222569698142, + "learning_rate": 7.271201671910589e-07, + "loss": 0.0001, + "step": 28180 + }, + { + "epoch": 11.460349735664904, + "grad_norm": 0.09709216950886448, + "learning_rate": 7.26740929037919e-07, + "loss": 0.0012, + "step": 28181 + }, + { + "epoch": 11.4607564050427, + "grad_norm": 0.04664787274279412, + "learning_rate": 7.263617860794492e-07, + "loss": 0.0003, + "step": 28182 + }, + { + "epoch": 11.461163074420496, + "grad_norm": 0.00046263777066749906, + "learning_rate": 7.259827383195451e-07, + "loss": 0.0, + "step": 28183 + }, + { + "epoch": 11.461569743798291, + "grad_norm": 0.0005657050906761522, + "learning_rate": 7.256037857620946e-07, + "loss": 0.0, + "step": 28184 + }, + { + "epoch": 11.461976413176087, + "grad_norm": 0.017786890964586667, + "learning_rate": 7.252249284109902e-07, + "loss": 0.0002, + "step": 28185 + }, + { + "epoch": 11.462383082553885, + "grad_norm": 0.005707620497736914, + "learning_rate": 7.248461662701189e-07, + "loss": 0.0001, + "step": 28186 + }, + { + "epoch": 11.46278975193168, + "grad_norm": 0.0059827616570040515, + "learning_rate": 7.244674993433709e-07, + "loss": 0.0001, + "step": 28187 + }, + { + "epoch": 11.463196421309476, + "grad_norm": 0.05900375117132058, + "learning_rate": 7.240889276346296e-07, + "loss": 0.0006, + "step": 28188 + }, + { + "epoch": 11.463603090687272, + "grad_norm": 0.00016522419124969997, + "learning_rate": 7.237104511477855e-07, + "loss": 0.0, + "step": 28189 + }, + { + "epoch": 11.464009760065068, + "grad_norm": 0.014066348377167349, + "learning_rate": 7.23332069886723e-07, + "loss": 0.0001, + "step": 28190 + }, + { + "epoch": 11.464416429442863, + "grad_norm": 0.030310304468044988, + "learning_rate": 7.229537838553258e-07, + "loss": 0.0005, + "step": 28191 + }, + { + "epoch": 11.46482309882066, + "grad_norm": 0.006392397273293139, + "learning_rate": 7.225755930574762e-07, + "loss": 0.0, + "step": 28192 + }, + { + "epoch": 11.465229768198455, + "grad_norm": 0.0010690767401633902, + "learning_rate": 7.221974974970581e-07, + "loss": 0.0, + "step": 28193 + }, + { + "epoch": 11.46563643757625, + "grad_norm": 0.024526544186958255, + "learning_rate": 7.218194971779513e-07, + "loss": 0.0002, + "step": 28194 + }, + { + "epoch": 11.466043106954046, + "grad_norm": 0.03578470893668826, + "learning_rate": 7.214415921040363e-07, + "loss": 0.0003, + "step": 28195 + }, + { + "epoch": 11.466449776331842, + "grad_norm": 0.0026740068233706772, + "learning_rate": 7.210637822791944e-07, + "loss": 0.0, + "step": 28196 + }, + { + "epoch": 11.466856445709638, + "grad_norm": 0.05605062732796346, + 
"learning_rate": 7.206860677073025e-07, + "loss": 0.0004, + "step": 28197 + }, + { + "epoch": 11.467263115087434, + "grad_norm": 0.0043465607699278395, + "learning_rate": 7.203084483922385e-07, + "loss": 0.0, + "step": 28198 + }, + { + "epoch": 11.46766978446523, + "grad_norm": 0.022949489693915994, + "learning_rate": 7.199309243378783e-07, + "loss": 0.0001, + "step": 28199 + }, + { + "epoch": 11.468076453843025, + "grad_norm": 0.02652041609771797, + "learning_rate": 7.195534955480987e-07, + "loss": 0.0003, + "step": 28200 + }, + { + "epoch": 11.468483123220821, + "grad_norm": 0.007924661144524418, + "learning_rate": 7.191761620267701e-07, + "loss": 0.0001, + "step": 28201 + }, + { + "epoch": 11.468889792598617, + "grad_norm": 0.018674043534325068, + "learning_rate": 7.187989237777726e-07, + "loss": 0.0001, + "step": 28202 + }, + { + "epoch": 11.469296461976413, + "grad_norm": 0.014350122916967011, + "learning_rate": 7.184217808049743e-07, + "loss": 0.0001, + "step": 28203 + }, + { + "epoch": 11.469703131354208, + "grad_norm": 0.00011782215585386199, + "learning_rate": 7.180447331122475e-07, + "loss": 0.0, + "step": 28204 + }, + { + "epoch": 11.470109800732004, + "grad_norm": 0.0016131218440420946, + "learning_rate": 7.176677807034627e-07, + "loss": 0.0, + "step": 28205 + }, + { + "epoch": 11.4705164701098, + "grad_norm": 0.007555684088705581, + "learning_rate": 7.17290923582491e-07, + "loss": 0.0001, + "step": 28206 + }, + { + "epoch": 11.470923139487596, + "grad_norm": 0.01967546399884852, + "learning_rate": 7.169141617531994e-07, + "loss": 0.0001, + "step": 28207 + }, + { + "epoch": 11.471329808865393, + "grad_norm": 0.0006090855476093338, + "learning_rate": 7.165374952194537e-07, + "loss": 0.0, + "step": 28208 + }, + { + "epoch": 11.471736478243189, + "grad_norm": 0.022970395916169662, + "learning_rate": 7.161609239851253e-07, + "loss": 0.0001, + "step": 28209 + }, + { + "epoch": 11.472143147620985, + "grad_norm": 0.004500974427588951, + "learning_rate": 7.157844480540765e-07, + "loss": 0.0, + "step": 28210 + }, + { + "epoch": 11.47254981699878, + "grad_norm": 0.00034560295826750184, + "learning_rate": 7.154080674301745e-07, + "loss": 0.0, + "step": 28211 + }, + { + "epoch": 11.472956486376576, + "grad_norm": 0.0015016781818863278, + "learning_rate": 7.150317821172803e-07, + "loss": 0.0, + "step": 28212 + }, + { + "epoch": 11.473363155754372, + "grad_norm": 0.0648402955465763, + "learning_rate": 7.146555921192588e-07, + "loss": 0.0005, + "step": 28213 + }, + { + "epoch": 11.473769825132168, + "grad_norm": 0.02828290898017355, + "learning_rate": 7.14279497439968e-07, + "loss": 0.0003, + "step": 28214 + }, + { + "epoch": 11.474176494509964, + "grad_norm": 0.00764132135185693, + "learning_rate": 7.139034980832748e-07, + "loss": 0.0001, + "step": 28215 + }, + { + "epoch": 11.47458316388776, + "grad_norm": 0.0010552003078637077, + "learning_rate": 7.13527594053034e-07, + "loss": 0.0, + "step": 28216 + }, + { + "epoch": 11.474989833265555, + "grad_norm": 0.0012836716213295372, + "learning_rate": 7.131517853531078e-07, + "loss": 0.0, + "step": 28217 + }, + { + "epoch": 11.47539650264335, + "grad_norm": 0.011381239898532984, + "learning_rate": 7.127760719873512e-07, + "loss": 0.0, + "step": 28218 + }, + { + "epoch": 11.475803172021147, + "grad_norm": 0.061022929760718804, + "learning_rate": 7.12400453959623e-07, + "loss": 0.0005, + "step": 28219 + }, + { + "epoch": 11.476209841398942, + "grad_norm": 0.011653644173855294, + "learning_rate": 7.120249312737793e-07, + "loss": 0.0001, + "step": 
28220 + }, + { + "epoch": 11.476616510776738, + "grad_norm": 0.013452439343228942, + "learning_rate": 7.116495039336713e-07, + "loss": 0.0001, + "step": 28221 + }, + { + "epoch": 11.477023180154534, + "grad_norm": 0.005011128168860481, + "learning_rate": 7.112741719431581e-07, + "loss": 0.0, + "step": 28222 + }, + { + "epoch": 11.47742984953233, + "grad_norm": 0.005501849090448943, + "learning_rate": 7.108989353060902e-07, + "loss": 0.0001, + "step": 28223 + }, + { + "epoch": 11.477836518910125, + "grad_norm": 0.0029063983374995783, + "learning_rate": 7.105237940263199e-07, + "loss": 0.0, + "step": 28224 + }, + { + "epoch": 11.478243188287921, + "grad_norm": 0.003965185068884465, + "learning_rate": 7.101487481076985e-07, + "loss": 0.0, + "step": 28225 + }, + { + "epoch": 11.478649857665717, + "grad_norm": 0.0001401317137007357, + "learning_rate": 7.097737975540753e-07, + "loss": 0.0, + "step": 28226 + }, + { + "epoch": 11.479056527043515, + "grad_norm": 0.09176346378581747, + "learning_rate": 7.093989423692982e-07, + "loss": 0.0008, + "step": 28227 + }, + { + "epoch": 11.47946319642131, + "grad_norm": 0.06925727491240131, + "learning_rate": 7.090241825572186e-07, + "loss": 0.0002, + "step": 28228 + }, + { + "epoch": 11.479869865799106, + "grad_norm": 0.0050999519715278375, + "learning_rate": 7.086495181216835e-07, + "loss": 0.0, + "step": 28229 + }, + { + "epoch": 11.480276535176902, + "grad_norm": 0.12074361100818487, + "learning_rate": 7.082749490665353e-07, + "loss": 0.0012, + "step": 28230 + }, + { + "epoch": 11.480683204554698, + "grad_norm": 0.10788351029924352, + "learning_rate": 7.07900475395622e-07, + "loss": 0.0011, + "step": 28231 + }, + { + "epoch": 11.481089873932493, + "grad_norm": 0.002266014405012785, + "learning_rate": 7.075260971127862e-07, + "loss": 0.0, + "step": 28232 + }, + { + "epoch": 11.48149654331029, + "grad_norm": 0.00031335274654575916, + "learning_rate": 7.071518142218703e-07, + "loss": 0.0, + "step": 28233 + }, + { + "epoch": 11.481903212688085, + "grad_norm": 0.0007523431761833968, + "learning_rate": 7.067776267267202e-07, + "loss": 0.0, + "step": 28234 + }, + { + "epoch": 11.48230988206588, + "grad_norm": 4.2299283175679834e-05, + "learning_rate": 7.064035346311737e-07, + "loss": 0.0, + "step": 28235 + }, + { + "epoch": 11.482716551443676, + "grad_norm": 0.0006841694495623087, + "learning_rate": 7.060295379390736e-07, + "loss": 0.0, + "step": 28236 + }, + { + "epoch": 11.483123220821472, + "grad_norm": 0.0038510627560219618, + "learning_rate": 7.056556366542566e-07, + "loss": 0.0, + "step": 28237 + }, + { + "epoch": 11.483529890199268, + "grad_norm": 0.41940037439821337, + "learning_rate": 7.052818307805631e-07, + "loss": 0.0006, + "step": 28238 + }, + { + "epoch": 11.483936559577064, + "grad_norm": 0.016782042221944876, + "learning_rate": 7.049081203218278e-07, + "loss": 0.0001, + "step": 28239 + }, + { + "epoch": 11.48434322895486, + "grad_norm": 0.01175202913966485, + "learning_rate": 7.045345052818908e-07, + "loss": 0.0002, + "step": 28240 + }, + { + "epoch": 11.484749898332655, + "grad_norm": 0.00890466439627248, + "learning_rate": 7.041609856645848e-07, + "loss": 0.0001, + "step": 28241 + }, + { + "epoch": 11.485156567710451, + "grad_norm": 0.0033549437198683294, + "learning_rate": 7.037875614737444e-07, + "loss": 0.0, + "step": 28242 + }, + { + "epoch": 11.485563237088247, + "grad_norm": 0.007817199264346538, + "learning_rate": 7.034142327132043e-07, + "loss": 0.0001, + "step": 28243 + }, + { + "epoch": 11.485969906466043, + "grad_norm": 
0.07501983307366396, + "learning_rate": 7.030409993867948e-07, + "loss": 0.0006, + "step": 28244 + }, + { + "epoch": 11.486376575843838, + "grad_norm": 0.0006560485438490106, + "learning_rate": 7.026678614983496e-07, + "loss": 0.0, + "step": 28245 + }, + { + "epoch": 11.486783245221634, + "grad_norm": 0.00031290418986988536, + "learning_rate": 7.022948190516965e-07, + "loss": 0.0, + "step": 28246 + }, + { + "epoch": 11.48718991459943, + "grad_norm": 0.0025691940331944332, + "learning_rate": 7.019218720506671e-07, + "loss": 0.0, + "step": 28247 + }, + { + "epoch": 11.487596583977226, + "grad_norm": 0.00754818853115332, + "learning_rate": 7.015490204990905e-07, + "loss": 0.0001, + "step": 28248 + }, + { + "epoch": 11.488003253355023, + "grad_norm": 0.008142654623358453, + "learning_rate": 7.011762644007924e-07, + "loss": 0.0001, + "step": 28249 + }, + { + "epoch": 11.488409922732819, + "grad_norm": 0.007650826156824075, + "learning_rate": 7.008036037596e-07, + "loss": 0.0001, + "step": 28250 + }, + { + "epoch": 11.488816592110615, + "grad_norm": 0.000147304125478878, + "learning_rate": 7.004310385793389e-07, + "loss": 0.0, + "step": 28251 + }, + { + "epoch": 11.48922326148841, + "grad_norm": 0.0068823728139417815, + "learning_rate": 7.000585688638318e-07, + "loss": 0.0001, + "step": 28252 + }, + { + "epoch": 11.489629930866206, + "grad_norm": 0.7467322093834787, + "learning_rate": 6.996861946169054e-07, + "loss": 0.0087, + "step": 28253 + }, + { + "epoch": 11.490036600244002, + "grad_norm": 0.07440244799841352, + "learning_rate": 6.993139158423811e-07, + "loss": 0.0004, + "step": 28254 + }, + { + "epoch": 11.490443269621798, + "grad_norm": 5.517625945130092e-05, + "learning_rate": 6.989417325440806e-07, + "loss": 0.0, + "step": 28255 + }, + { + "epoch": 11.490849938999594, + "grad_norm": 0.00010233750610877355, + "learning_rate": 6.98569644725825e-07, + "loss": 0.0, + "step": 28256 + }, + { + "epoch": 11.49125660837739, + "grad_norm": 0.0001518028803398256, + "learning_rate": 6.981976523914325e-07, + "loss": 0.0, + "step": 28257 + }, + { + "epoch": 11.491663277755185, + "grad_norm": 0.0013017260173767292, + "learning_rate": 6.978257555447221e-07, + "loss": 0.0, + "step": 28258 + }, + { + "epoch": 11.49206994713298, + "grad_norm": 0.0037662982021555063, + "learning_rate": 6.974539541895108e-07, + "loss": 0.0, + "step": 28259 + }, + { + "epoch": 11.492476616510777, + "grad_norm": 0.04303060103464741, + "learning_rate": 6.970822483296191e-07, + "loss": 0.0003, + "step": 28260 + }, + { + "epoch": 11.492883285888572, + "grad_norm": 10.053606265849238, + "learning_rate": 6.967106379688593e-07, + "loss": 0.3442, + "step": 28261 + }, + { + "epoch": 11.493289955266368, + "grad_norm": 0.004284953079099519, + "learning_rate": 6.963391231110472e-07, + "loss": 0.0, + "step": 28262 + }, + { + "epoch": 11.493696624644164, + "grad_norm": 0.0005722998570371533, + "learning_rate": 6.959677037599965e-07, + "loss": 0.0, + "step": 28263 + }, + { + "epoch": 11.49410329402196, + "grad_norm": 0.18271315261727453, + "learning_rate": 6.955963799195198e-07, + "loss": 0.0013, + "step": 28264 + }, + { + "epoch": 11.494509963399755, + "grad_norm": 0.0065140736955174834, + "learning_rate": 6.952251515934272e-07, + "loss": 0.0001, + "step": 28265 + }, + { + "epoch": 11.494916632777551, + "grad_norm": 0.04056897162973569, + "learning_rate": 6.948540187855335e-07, + "loss": 0.0003, + "step": 28266 + }, + { + "epoch": 11.495323302155347, + "grad_norm": 0.004542906252373459, + "learning_rate": 6.944829814996468e-07, + "loss": 
0.0, + "step": 28267 + }, + { + "epoch": 11.495729971533144, + "grad_norm": 0.048623312109469005, + "learning_rate": 6.941120397395751e-07, + "loss": 0.0004, + "step": 28268 + }, + { + "epoch": 11.49613664091094, + "grad_norm": 0.0015003111383140898, + "learning_rate": 6.937411935091265e-07, + "loss": 0.0, + "step": 28269 + }, + { + "epoch": 11.496543310288736, + "grad_norm": 0.0027761059518863944, + "learning_rate": 6.933704428121091e-07, + "loss": 0.0, + "step": 28270 + }, + { + "epoch": 11.496949979666532, + "grad_norm": 0.0397164403297489, + "learning_rate": 6.929997876523275e-07, + "loss": 0.0003, + "step": 28271 + }, + { + "epoch": 11.497356649044328, + "grad_norm": 0.01665071525011685, + "learning_rate": 6.926292280335856e-07, + "loss": 0.0002, + "step": 28272 + }, + { + "epoch": 11.497763318422123, + "grad_norm": 0.001820943379638832, + "learning_rate": 6.922587639596911e-07, + "loss": 0.0, + "step": 28273 + }, + { + "epoch": 11.498169987799919, + "grad_norm": 0.0064200734178756465, + "learning_rate": 6.918883954344446e-07, + "loss": 0.0, + "step": 28274 + }, + { + "epoch": 11.498576657177715, + "grad_norm": 0.007967359012863726, + "learning_rate": 6.915181224616485e-07, + "loss": 0.0001, + "step": 28275 + }, + { + "epoch": 11.49898332655551, + "grad_norm": 0.006052375452605448, + "learning_rate": 6.91147945045103e-07, + "loss": 0.0, + "step": 28276 + }, + { + "epoch": 11.499389995933306, + "grad_norm": 0.5537462589584279, + "learning_rate": 6.907778631886098e-07, + "loss": 0.0047, + "step": 28277 + }, + { + "epoch": 11.499796665311102, + "grad_norm": 0.007400713065061702, + "learning_rate": 6.904078768959643e-07, + "loss": 0.0001, + "step": 28278 + }, + { + "epoch": 11.500203334688898, + "grad_norm": 0.007308725521023254, + "learning_rate": 6.900379861709705e-07, + "loss": 0.0001, + "step": 28279 + }, + { + "epoch": 11.500610004066694, + "grad_norm": 0.032071367855395716, + "learning_rate": 6.896681910174218e-07, + "loss": 0.0003, + "step": 28280 + }, + { + "epoch": 11.50101667344449, + "grad_norm": 0.009972153921683876, + "learning_rate": 6.892984914391143e-07, + "loss": 0.0001, + "step": 28281 + }, + { + "epoch": 11.501423342822285, + "grad_norm": 0.0218845772146794, + "learning_rate": 6.889288874398448e-07, + "loss": 0.0001, + "step": 28282 + }, + { + "epoch": 11.501830012200081, + "grad_norm": 0.006469269977165579, + "learning_rate": 6.885593790234057e-07, + "loss": 0.0, + "step": 28283 + }, + { + "epoch": 11.502236681577877, + "grad_norm": 0.033038360634850994, + "learning_rate": 6.881899661935909e-07, + "loss": 0.0003, + "step": 28284 + }, + { + "epoch": 11.502643350955672, + "grad_norm": 0.0026410478522154355, + "learning_rate": 6.878206489541917e-07, + "loss": 0.0, + "step": 28285 + }, + { + "epoch": 11.503050020333468, + "grad_norm": 0.151919128481555, + "learning_rate": 6.874514273090004e-07, + "loss": 0.0008, + "step": 28286 + }, + { + "epoch": 11.503456689711264, + "grad_norm": 0.10094562896445987, + "learning_rate": 6.870823012618089e-07, + "loss": 0.0007, + "step": 28287 + }, + { + "epoch": 11.50386335908906, + "grad_norm": 0.0003745275050253791, + "learning_rate": 6.867132708164037e-07, + "loss": 0.0, + "step": 28288 + }, + { + "epoch": 11.504270028466856, + "grad_norm": 9.955537214920062e-05, + "learning_rate": 6.863443359765742e-07, + "loss": 0.0, + "step": 28289 + }, + { + "epoch": 11.504676697844653, + "grad_norm": 0.1314860860373007, + "learning_rate": 6.859754967461074e-07, + "loss": 0.0011, + "step": 28290 + }, + { + "epoch": 11.505083367222449, + 
"grad_norm": 0.00047928101974621743, + "learning_rate": 6.85606753128788e-07, + "loss": 0.0, + "step": 28291 + }, + { + "epoch": 11.505490036600245, + "grad_norm": 0.0043466904913135075, + "learning_rate": 6.852381051284052e-07, + "loss": 0.0, + "step": 28292 + }, + { + "epoch": 11.50589670597804, + "grad_norm": 0.0026771079645023267, + "learning_rate": 6.848695527487414e-07, + "loss": 0.0, + "step": 28293 + }, + { + "epoch": 11.506303375355836, + "grad_norm": 0.15860523192645043, + "learning_rate": 6.845010959935794e-07, + "loss": 0.0009, + "step": 28294 + }, + { + "epoch": 11.506710044733632, + "grad_norm": 0.0002690022039863989, + "learning_rate": 6.841327348667026e-07, + "loss": 0.0, + "step": 28295 + }, + { + "epoch": 11.507116714111428, + "grad_norm": 0.14846333489477584, + "learning_rate": 6.837644693718915e-07, + "loss": 0.0006, + "step": 28296 + }, + { + "epoch": 11.507523383489223, + "grad_norm": 0.02458243671652531, + "learning_rate": 6.833962995129262e-07, + "loss": 0.0002, + "step": 28297 + }, + { + "epoch": 11.50793005286702, + "grad_norm": 0.00014052700618532115, + "learning_rate": 6.83028225293586e-07, + "loss": 0.0, + "step": 28298 + }, + { + "epoch": 11.508336722244815, + "grad_norm": 0.0002529098384576845, + "learning_rate": 6.826602467176524e-07, + "loss": 0.0, + "step": 28299 + }, + { + "epoch": 11.50874339162261, + "grad_norm": 0.0017257720924118867, + "learning_rate": 6.822923637889001e-07, + "loss": 0.0, + "step": 28300 + }, + { + "epoch": 11.509150061000406, + "grad_norm": 0.0048531829535190816, + "learning_rate": 6.81924576511106e-07, + "loss": 0.0, + "step": 28301 + }, + { + "epoch": 11.509556730378202, + "grad_norm": 0.03558706208179572, + "learning_rate": 6.81556884888046e-07, + "loss": 0.0003, + "step": 28302 + }, + { + "epoch": 11.509963399755998, + "grad_norm": 0.4845877909309358, + "learning_rate": 6.811892889234939e-07, + "loss": 0.0042, + "step": 28303 + }, + { + "epoch": 11.510370069133794, + "grad_norm": 0.011989142662288687, + "learning_rate": 6.808217886212231e-07, + "loss": 0.0001, + "step": 28304 + }, + { + "epoch": 11.51077673851159, + "grad_norm": 0.0009594032249546856, + "learning_rate": 6.804543839850075e-07, + "loss": 0.0, + "step": 28305 + }, + { + "epoch": 11.511183407889385, + "grad_norm": 0.0006777988672483586, + "learning_rate": 6.800870750186184e-07, + "loss": 0.0, + "step": 28306 + }, + { + "epoch": 11.511590077267181, + "grad_norm": 0.000269667433593366, + "learning_rate": 6.797198617258272e-07, + "loss": 0.0, + "step": 28307 + }, + { + "epoch": 11.511996746644977, + "grad_norm": 0.05418931293553199, + "learning_rate": 6.793527441104009e-07, + "loss": 0.0004, + "step": 28308 + }, + { + "epoch": 11.512403416022774, + "grad_norm": 0.000556836486335409, + "learning_rate": 6.789857221761098e-07, + "loss": 0.0, + "step": 28309 + }, + { + "epoch": 11.51281008540057, + "grad_norm": 0.04072537122147375, + "learning_rate": 6.786187959267221e-07, + "loss": 0.0002, + "step": 28310 + }, + { + "epoch": 11.513216754778366, + "grad_norm": 0.0226822718337399, + "learning_rate": 6.782519653660025e-07, + "loss": 0.0002, + "step": 28311 + }, + { + "epoch": 11.513623424156162, + "grad_norm": 1.5320739734695934, + "learning_rate": 6.77885230497719e-07, + "loss": 0.0114, + "step": 28312 + }, + { + "epoch": 11.514030093533957, + "grad_norm": 0.09973755861605635, + "learning_rate": 6.775185913256354e-07, + "loss": 0.0006, + "step": 28313 + }, + { + "epoch": 11.514436762911753, + "grad_norm": 0.0038788305319936356, + "learning_rate": 6.771520478535154e-07, + 
"loss": 0.0, + "step": 28314 + }, + { + "epoch": 11.514843432289549, + "grad_norm": 0.0028489505359573183, + "learning_rate": 6.767856000851225e-07, + "loss": 0.0, + "step": 28315 + }, + { + "epoch": 11.515250101667345, + "grad_norm": 0.00797104024861134, + "learning_rate": 6.76419248024216e-07, + "loss": 0.0001, + "step": 28316 + }, + { + "epoch": 11.51565677104514, + "grad_norm": 0.0036590127298050243, + "learning_rate": 6.760529916745584e-07, + "loss": 0.0, + "step": 28317 + }, + { + "epoch": 11.516063440422936, + "grad_norm": 0.18910439992057423, + "learning_rate": 6.756868310399101e-07, + "loss": 0.0016, + "step": 28318 + }, + { + "epoch": 11.516470109800732, + "grad_norm": 0.006871055271861399, + "learning_rate": 6.753207661240301e-07, + "loss": 0.0, + "step": 28319 + }, + { + "epoch": 11.516876779178528, + "grad_norm": 0.15809751885380505, + "learning_rate": 6.749547969306747e-07, + "loss": 0.0005, + "step": 28320 + }, + { + "epoch": 11.517283448556324, + "grad_norm": 7.531495938639334e-05, + "learning_rate": 6.745889234636016e-07, + "loss": 0.0, + "step": 28321 + }, + { + "epoch": 11.51769011793412, + "grad_norm": 0.0014044661909231573, + "learning_rate": 6.742231457265669e-07, + "loss": 0.0, + "step": 28322 + }, + { + "epoch": 11.518096787311915, + "grad_norm": 0.009387898996398206, + "learning_rate": 6.738574637233252e-07, + "loss": 0.0001, + "step": 28323 + }, + { + "epoch": 11.51850345668971, + "grad_norm": 0.007351735124388617, + "learning_rate": 6.734918774576282e-07, + "loss": 0.0001, + "step": 28324 + }, + { + "epoch": 11.518910126067507, + "grad_norm": 0.0012973781656204974, + "learning_rate": 6.731263869332327e-07, + "loss": 0.0, + "step": 28325 + }, + { + "epoch": 11.519316795445302, + "grad_norm": 0.045188391344615775, + "learning_rate": 6.727609921538902e-07, + "loss": 0.0002, + "step": 28326 + }, + { + "epoch": 11.519723464823098, + "grad_norm": 0.006787847903551645, + "learning_rate": 6.723956931233488e-07, + "loss": 0.0001, + "step": 28327 + }, + { + "epoch": 11.520130134200894, + "grad_norm": 0.0002713329251646548, + "learning_rate": 6.720304898453622e-07, + "loss": 0.0, + "step": 28328 + }, + { + "epoch": 11.52053680357869, + "grad_norm": 8.093876612341794e-05, + "learning_rate": 6.716653823236752e-07, + "loss": 0.0, + "step": 28329 + }, + { + "epoch": 11.520943472956485, + "grad_norm": 0.4505154228532288, + "learning_rate": 6.71300370562038e-07, + "loss": 0.0022, + "step": 28330 + }, + { + "epoch": 11.521350142334283, + "grad_norm": 0.05479021986286013, + "learning_rate": 6.709354545641989e-07, + "loss": 0.0004, + "step": 28331 + }, + { + "epoch": 11.521756811712079, + "grad_norm": 0.022164889937961865, + "learning_rate": 6.705706343339013e-07, + "loss": 0.0003, + "step": 28332 + }, + { + "epoch": 11.522163481089875, + "grad_norm": 0.0001932270392815276, + "learning_rate": 6.702059098748925e-07, + "loss": 0.0, + "step": 28333 + }, + { + "epoch": 11.52257015046767, + "grad_norm": 0.005881237564281673, + "learning_rate": 6.698412811909149e-07, + "loss": 0.0, + "step": 28334 + }, + { + "epoch": 11.522976819845466, + "grad_norm": 0.0004990345562327853, + "learning_rate": 6.694767482857111e-07, + "loss": 0.0, + "step": 28335 + }, + { + "epoch": 11.523383489223262, + "grad_norm": 0.0009556351800241865, + "learning_rate": 6.69112311163026e-07, + "loss": 0.0, + "step": 28336 + }, + { + "epoch": 11.523790158601058, + "grad_norm": 0.10726575882519805, + "learning_rate": 6.687479698265986e-07, + "loss": 0.001, + "step": 28337 + }, + { + "epoch": 11.524196827978853, + 
"grad_norm": 0.03809552951797907, + "learning_rate": 6.683837242801705e-07, + "loss": 0.0004, + "step": 28338 + }, + { + "epoch": 11.52460349735665, + "grad_norm": 0.011869268419220412, + "learning_rate": 6.680195745274798e-07, + "loss": 0.0001, + "step": 28339 + }, + { + "epoch": 11.525010166734445, + "grad_norm": 0.0040947165187359465, + "learning_rate": 6.676555205722646e-07, + "loss": 0.0, + "step": 28340 + }, + { + "epoch": 11.52541683611224, + "grad_norm": 0.02784405437241263, + "learning_rate": 6.672915624182618e-07, + "loss": 0.0002, + "step": 28341 + }, + { + "epoch": 11.525823505490036, + "grad_norm": 0.06076451525328732, + "learning_rate": 6.669277000692076e-07, + "loss": 0.0005, + "step": 28342 + }, + { + "epoch": 11.526230174867832, + "grad_norm": 0.04282498860287354, + "learning_rate": 6.665639335288387e-07, + "loss": 0.0003, + "step": 28343 + }, + { + "epoch": 11.526636844245628, + "grad_norm": 0.002601509943754769, + "learning_rate": 6.66200262800889e-07, + "loss": 0.0, + "step": 28344 + }, + { + "epoch": 11.527043513623424, + "grad_norm": 0.12219747897499807, + "learning_rate": 6.658366878890909e-07, + "loss": 0.001, + "step": 28345 + }, + { + "epoch": 11.52745018300122, + "grad_norm": 0.00018701528741287894, + "learning_rate": 6.654732087971771e-07, + "loss": 0.0, + "step": 28346 + }, + { + "epoch": 11.527856852379015, + "grad_norm": 0.00395117435386327, + "learning_rate": 6.651098255288779e-07, + "loss": 0.0, + "step": 28347 + }, + { + "epoch": 11.528263521756811, + "grad_norm": 0.0011141459638694077, + "learning_rate": 6.647465380879258e-07, + "loss": 0.0, + "step": 28348 + }, + { + "epoch": 11.528670191134607, + "grad_norm": 0.0035229596089693515, + "learning_rate": 6.64383346478047e-07, + "loss": 0.0, + "step": 28349 + }, + { + "epoch": 11.529076860512404, + "grad_norm": 0.009026637107522046, + "learning_rate": 6.640202507029725e-07, + "loss": 0.0001, + "step": 28350 + }, + { + "epoch": 11.5294835298902, + "grad_norm": 0.05972830284513718, + "learning_rate": 6.636572507664296e-07, + "loss": 0.0005, + "step": 28351 + }, + { + "epoch": 11.529890199267996, + "grad_norm": 0.0032548272900117608, + "learning_rate": 6.632943466721431e-07, + "loss": 0.0, + "step": 28352 + }, + { + "epoch": 11.530296868645792, + "grad_norm": 0.015332654362230582, + "learning_rate": 6.629315384238399e-07, + "loss": 0.0001, + "step": 28353 + }, + { + "epoch": 11.530703538023587, + "grad_norm": 0.0016344223124024732, + "learning_rate": 6.625688260252438e-07, + "loss": 0.0, + "step": 28354 + }, + { + "epoch": 11.531110207401383, + "grad_norm": 0.0022170646871050794, + "learning_rate": 6.622062094800763e-07, + "loss": 0.0, + "step": 28355 + }, + { + "epoch": 11.531516876779179, + "grad_norm": 0.022584395176855537, + "learning_rate": 6.618436887920632e-07, + "loss": 0.0002, + "step": 28356 + }, + { + "epoch": 11.531923546156975, + "grad_norm": 0.07554128140154585, + "learning_rate": 6.614812639649238e-07, + "loss": 0.0006, + "step": 28357 + }, + { + "epoch": 11.53233021553477, + "grad_norm": 0.12805447380087534, + "learning_rate": 6.611189350023794e-07, + "loss": 0.0011, + "step": 28358 + }, + { + "epoch": 11.532736884912566, + "grad_norm": 0.0027729326695389157, + "learning_rate": 6.607567019081507e-07, + "loss": 0.0, + "step": 28359 + }, + { + "epoch": 11.533143554290362, + "grad_norm": 0.017313948459972354, + "learning_rate": 6.603945646859533e-07, + "loss": 0.0003, + "step": 28360 + }, + { + "epoch": 11.533550223668158, + "grad_norm": 0.003021659098241072, + "learning_rate": 
6.600325233395066e-07, + "loss": 0.0, + "step": 28361 + }, + { + "epoch": 11.533956893045954, + "grad_norm": 0.010853697594946298, + "learning_rate": 6.596705778725255e-07, + "loss": 0.0001, + "step": 28362 + }, + { + "epoch": 11.53436356242375, + "grad_norm": 0.008404160383213656, + "learning_rate": 6.593087282887278e-07, + "loss": 0.0001, + "step": 28363 + }, + { + "epoch": 11.534770231801545, + "grad_norm": 0.2689638048732394, + "learning_rate": 6.589469745918275e-07, + "loss": 0.0026, + "step": 28364 + }, + { + "epoch": 11.53517690117934, + "grad_norm": 0.06311676438518372, + "learning_rate": 6.585853167855383e-07, + "loss": 0.0003, + "step": 28365 + }, + { + "epoch": 11.535583570557137, + "grad_norm": 0.020801625936631048, + "learning_rate": 6.582237548735715e-07, + "loss": 0.0002, + "step": 28366 + }, + { + "epoch": 11.535990239934932, + "grad_norm": 0.00038943991019286774, + "learning_rate": 6.578622888596397e-07, + "loss": 0.0, + "step": 28367 + }, + { + "epoch": 11.536396909312728, + "grad_norm": 0.012535552456979508, + "learning_rate": 6.575009187474513e-07, + "loss": 0.0001, + "step": 28368 + }, + { + "epoch": 11.536803578690524, + "grad_norm": 0.0001552206751859038, + "learning_rate": 6.571396445407196e-07, + "loss": 0.0, + "step": 28369 + }, + { + "epoch": 11.53721024806832, + "grad_norm": 0.001454110385818284, + "learning_rate": 6.56778466243152e-07, + "loss": 0.0, + "step": 28370 + }, + { + "epoch": 11.537616917446115, + "grad_norm": 0.001425976169956225, + "learning_rate": 6.564173838584564e-07, + "loss": 0.0, + "step": 28371 + }, + { + "epoch": 11.538023586823913, + "grad_norm": 0.002505143164431449, + "learning_rate": 6.560563973903377e-07, + "loss": 0.0, + "step": 28372 + }, + { + "epoch": 11.538430256201709, + "grad_norm": 0.00910422669997506, + "learning_rate": 6.55695506842503e-07, + "loss": 0.0001, + "step": 28373 + }, + { + "epoch": 11.538836925579504, + "grad_norm": 0.00109635488317636, + "learning_rate": 6.553347122186582e-07, + "loss": 0.0, + "step": 28374 + }, + { + "epoch": 11.5392435949573, + "grad_norm": 0.01984421709849421, + "learning_rate": 6.549740135225025e-07, + "loss": 0.0002, + "step": 28375 + }, + { + "epoch": 11.539650264335096, + "grad_norm": 0.0036212066189104932, + "learning_rate": 6.54613410757744e-07, + "loss": 0.0, + "step": 28376 + }, + { + "epoch": 11.540056933712892, + "grad_norm": 0.017246689331099845, + "learning_rate": 6.54252903928082e-07, + "loss": 0.0002, + "step": 28377 + }, + { + "epoch": 11.540463603090688, + "grad_norm": 0.019808207806414232, + "learning_rate": 6.538924930372182e-07, + "loss": 0.0002, + "step": 28378 + }, + { + "epoch": 11.540870272468483, + "grad_norm": 0.031011551195144354, + "learning_rate": 6.535321780888504e-07, + "loss": 0.0002, + "step": 28379 + }, + { + "epoch": 11.541276941846279, + "grad_norm": 3.6901539747831104e-05, + "learning_rate": 6.531719590866802e-07, + "loss": 0.0, + "step": 28380 + }, + { + "epoch": 11.541683611224075, + "grad_norm": 0.012474535268990013, + "learning_rate": 6.528118360344016e-07, + "loss": 0.0001, + "step": 28381 + }, + { + "epoch": 11.54209028060187, + "grad_norm": 0.007104301275818698, + "learning_rate": 6.524518089357157e-07, + "loss": 0.0001, + "step": 28382 + }, + { + "epoch": 11.542496949979666, + "grad_norm": 0.009795945094223516, + "learning_rate": 6.520918777943164e-07, + "loss": 0.0001, + "step": 28383 + }, + { + "epoch": 11.542903619357462, + "grad_norm": 0.04276282389997673, + "learning_rate": 6.517320426138985e-07, + "loss": 0.0004, + "step": 28384 + }, + { + 
"epoch": 11.543310288735258, + "grad_norm": 0.03575474065705624, + "learning_rate": 6.513723033981568e-07, + "loss": 0.0004, + "step": 28385 + }, + { + "epoch": 11.543716958113054, + "grad_norm": 0.01545754773820535, + "learning_rate": 6.510126601507827e-07, + "loss": 0.0001, + "step": 28386 + }, + { + "epoch": 11.54412362749085, + "grad_norm": 0.0927326676112725, + "learning_rate": 6.506531128754689e-07, + "loss": 0.0004, + "step": 28387 + }, + { + "epoch": 11.544530296868645, + "grad_norm": 0.3802375252215221, + "learning_rate": 6.502936615759048e-07, + "loss": 0.004, + "step": 28388 + }, + { + "epoch": 11.544936966246441, + "grad_norm": 8.475175361621044e-05, + "learning_rate": 6.499343062557839e-07, + "loss": 0.0, + "step": 28389 + }, + { + "epoch": 11.545343635624237, + "grad_norm": 0.00011298449186531977, + "learning_rate": 6.495750469187934e-07, + "loss": 0.0, + "step": 28390 + }, + { + "epoch": 11.545750305002034, + "grad_norm": 0.06345400383599874, + "learning_rate": 6.492158835686202e-07, + "loss": 0.0002, + "step": 28391 + }, + { + "epoch": 11.54615697437983, + "grad_norm": 0.011553218144984462, + "learning_rate": 6.488568162089525e-07, + "loss": 0.0001, + "step": 28392 + }, + { + "epoch": 11.546563643757626, + "grad_norm": 0.0018327946313718363, + "learning_rate": 6.484978448434753e-07, + "loss": 0.0, + "step": 28393 + }, + { + "epoch": 11.546970313135422, + "grad_norm": 0.0142718849026908, + "learning_rate": 6.481389694758733e-07, + "loss": 0.0, + "step": 28394 + }, + { + "epoch": 11.547376982513217, + "grad_norm": 0.0026106738874760588, + "learning_rate": 6.477801901098335e-07, + "loss": 0.0, + "step": 28395 + }, + { + "epoch": 11.547783651891013, + "grad_norm": 0.002792668236530568, + "learning_rate": 6.474215067490363e-07, + "loss": 0.0, + "step": 28396 + }, + { + "epoch": 11.548190321268809, + "grad_norm": 5.2818529567472625e-05, + "learning_rate": 6.470629193971645e-07, + "loss": 0.0, + "step": 28397 + }, + { + "epoch": 11.548596990646605, + "grad_norm": 0.10985488629387319, + "learning_rate": 6.467044280578993e-07, + "loss": 0.0009, + "step": 28398 + }, + { + "epoch": 11.5490036600244, + "grad_norm": 0.027954430507561516, + "learning_rate": 6.463460327349203e-07, + "loss": 0.0004, + "step": 28399 + }, + { + "epoch": 11.549410329402196, + "grad_norm": 0.32297269188632266, + "learning_rate": 6.459877334319065e-07, + "loss": 0.003, + "step": 28400 + }, + { + "epoch": 11.549816998779992, + "grad_norm": 0.022695394625148392, + "learning_rate": 6.45629530152535e-07, + "loss": 0.0001, + "step": 28401 + }, + { + "epoch": 11.550223668157788, + "grad_norm": 0.008130090794237363, + "learning_rate": 6.452714229004864e-07, + "loss": 0.0, + "step": 28402 + }, + { + "epoch": 11.550630337535583, + "grad_norm": 0.00020421432997103848, + "learning_rate": 6.449134116794343e-07, + "loss": 0.0, + "step": 28403 + }, + { + "epoch": 11.55103700691338, + "grad_norm": 0.024907795121378654, + "learning_rate": 6.445554964930545e-07, + "loss": 0.0002, + "step": 28404 + }, + { + "epoch": 11.551443676291175, + "grad_norm": 0.018029183209020347, + "learning_rate": 6.441976773450209e-07, + "loss": 0.0001, + "step": 28405 + }, + { + "epoch": 11.55185034566897, + "grad_norm": 0.0010420913577674344, + "learning_rate": 6.438399542390073e-07, + "loss": 0.0, + "step": 28406 + }, + { + "epoch": 11.552257015046766, + "grad_norm": 0.07195960603432125, + "learning_rate": 6.434823271786828e-07, + "loss": 0.0006, + "step": 28407 + }, + { + "epoch": 11.552663684424562, + "grad_norm": 1.3547080436524374e-05, + 
"learning_rate": 6.431247961677245e-07, + "loss": 0.0, + "step": 28408 + }, + { + "epoch": 11.553070353802358, + "grad_norm": 0.01577646690410623, + "learning_rate": 6.427673612097984e-07, + "loss": 0.0002, + "step": 28409 + }, + { + "epoch": 11.553477023180154, + "grad_norm": 0.0005089563493646486, + "learning_rate": 6.424100223085738e-07, + "loss": 0.0, + "step": 28410 + }, + { + "epoch": 11.55388369255795, + "grad_norm": 0.002891308979055614, + "learning_rate": 6.420527794677212e-07, + "loss": 0.0, + "step": 28411 + }, + { + "epoch": 11.554290361935745, + "grad_norm": 0.050933070660305106, + "learning_rate": 6.416956326909063e-07, + "loss": 0.0005, + "step": 28412 + }, + { + "epoch": 11.554697031313543, + "grad_norm": 0.028559370196716425, + "learning_rate": 6.413385819817952e-07, + "loss": 0.0001, + "step": 28413 + }, + { + "epoch": 11.555103700691339, + "grad_norm": 0.0003215819876455458, + "learning_rate": 6.409816273440528e-07, + "loss": 0.0, + "step": 28414 + }, + { + "epoch": 11.555510370069134, + "grad_norm": 0.04484281137870106, + "learning_rate": 6.406247687813461e-07, + "loss": 0.0003, + "step": 28415 + }, + { + "epoch": 11.55591703944693, + "grad_norm": 0.01372626518666352, + "learning_rate": 6.402680062973355e-07, + "loss": 0.0001, + "step": 28416 + }, + { + "epoch": 11.556323708824726, + "grad_norm": 0.06918758815329108, + "learning_rate": 6.399113398956847e-07, + "loss": 0.0004, + "step": 28417 + }, + { + "epoch": 11.556730378202522, + "grad_norm": 0.0019885650938662974, + "learning_rate": 6.395547695800553e-07, + "loss": 0.0, + "step": 28418 + }, + { + "epoch": 11.557137047580317, + "grad_norm": 0.0012850922963547969, + "learning_rate": 6.391982953541076e-07, + "loss": 0.0, + "step": 28419 + }, + { + "epoch": 11.557543716958113, + "grad_norm": 0.0016453303566027438, + "learning_rate": 6.388419172214988e-07, + "loss": 0.0, + "step": 28420 + }, + { + "epoch": 11.557950386335909, + "grad_norm": 0.028567965181242435, + "learning_rate": 6.384856351858904e-07, + "loss": 0.0002, + "step": 28421 + }, + { + "epoch": 11.558357055713705, + "grad_norm": 0.0021937826458262428, + "learning_rate": 6.381294492509382e-07, + "loss": 0.0, + "step": 28422 + }, + { + "epoch": 11.5587637250915, + "grad_norm": 0.0059600673634146156, + "learning_rate": 6.377733594202984e-07, + "loss": 0.0001, + "step": 28423 + }, + { + "epoch": 11.559170394469296, + "grad_norm": 0.2579893742757111, + "learning_rate": 6.374173656976279e-07, + "loss": 0.0024, + "step": 28424 + }, + { + "epoch": 11.559577063847092, + "grad_norm": 0.08060669445528967, + "learning_rate": 6.370614680865805e-07, + "loss": 0.0009, + "step": 28425 + }, + { + "epoch": 11.559983733224888, + "grad_norm": 0.0001154092086756334, + "learning_rate": 6.367056665908089e-07, + "loss": 0.0, + "step": 28426 + }, + { + "epoch": 11.560390402602684, + "grad_norm": 0.023867110895192393, + "learning_rate": 6.363499612139646e-07, + "loss": 0.0002, + "step": 28427 + }, + { + "epoch": 11.56079707198048, + "grad_norm": 0.014471360632837216, + "learning_rate": 6.359943519597033e-07, + "loss": 0.0001, + "step": 28428 + }, + { + "epoch": 11.561203741358275, + "grad_norm": 0.028879518983201, + "learning_rate": 6.356388388316725e-07, + "loss": 0.0002, + "step": 28429 + }, + { + "epoch": 11.56161041073607, + "grad_norm": 0.07234242045989657, + "learning_rate": 6.352834218335213e-07, + "loss": 0.0005, + "step": 28430 + }, + { + "epoch": 11.562017080113867, + "grad_norm": 0.006703379238706259, + "learning_rate": 6.349281009688979e-07, + "loss": 0.0001, + "step": 
28431 + }, + { + "epoch": 11.562423749491664, + "grad_norm": 0.04982092842335928, + "learning_rate": 6.345728762414504e-07, + "loss": 0.0006, + "step": 28432 + }, + { + "epoch": 11.56283041886946, + "grad_norm": 0.009496010033809473, + "learning_rate": 6.342177476548272e-07, + "loss": 0.0001, + "step": 28433 + }, + { + "epoch": 11.563237088247256, + "grad_norm": 0.0009682690261138906, + "learning_rate": 6.338627152126731e-07, + "loss": 0.0, + "step": 28434 + }, + { + "epoch": 11.563643757625051, + "grad_norm": 0.0004955218867822675, + "learning_rate": 6.335077789186317e-07, + "loss": 0.0, + "step": 28435 + }, + { + "epoch": 11.564050427002847, + "grad_norm": 0.002347364692568808, + "learning_rate": 6.331529387763469e-07, + "loss": 0.0, + "step": 28436 + }, + { + "epoch": 11.564457096380643, + "grad_norm": 0.03506335594093068, + "learning_rate": 6.327981947894613e-07, + "loss": 0.0003, + "step": 28437 + }, + { + "epoch": 11.564863765758439, + "grad_norm": 0.004487981761115122, + "learning_rate": 6.324435469616141e-07, + "loss": 0.0, + "step": 28438 + }, + { + "epoch": 11.565270435136235, + "grad_norm": 0.008425461856821229, + "learning_rate": 6.320889952964515e-07, + "loss": 0.0001, + "step": 28439 + }, + { + "epoch": 11.56567710451403, + "grad_norm": 0.0011173523257214215, + "learning_rate": 6.317345397976104e-07, + "loss": 0.0, + "step": 28440 + }, + { + "epoch": 11.566083773891826, + "grad_norm": 0.006584513011772172, + "learning_rate": 6.31380180468728e-07, + "loss": 0.0001, + "step": 28441 + }, + { + "epoch": 11.566490443269622, + "grad_norm": 0.0004103790878846632, + "learning_rate": 6.310259173134437e-07, + "loss": 0.0, + "step": 28442 + }, + { + "epoch": 11.566897112647418, + "grad_norm": 0.26846294418246675, + "learning_rate": 6.306717503353931e-07, + "loss": 0.0016, + "step": 28443 + }, + { + "epoch": 11.567303782025213, + "grad_norm": 0.0005091367365728211, + "learning_rate": 6.303176795382137e-07, + "loss": 0.0, + "step": 28444 + }, + { + "epoch": 11.56771045140301, + "grad_norm": 0.006333225352306966, + "learning_rate": 6.29963704925537e-07, + "loss": 0.0, + "step": 28445 + }, + { + "epoch": 11.568117120780805, + "grad_norm": 0.03770912817485189, + "learning_rate": 6.296098265009998e-07, + "loss": 0.0003, + "step": 28446 + }, + { + "epoch": 11.5685237901586, + "grad_norm": 0.0007009240920836892, + "learning_rate": 6.292560442682338e-07, + "loss": 0.0, + "step": 28447 + }, + { + "epoch": 11.568930459536396, + "grad_norm": 0.008812258502502644, + "learning_rate": 6.289023582308707e-07, + "loss": 0.0001, + "step": 28448 + }, + { + "epoch": 11.569337128914192, + "grad_norm": 0.010280437866295627, + "learning_rate": 6.285487683925417e-07, + "loss": 0.0001, + "step": 28449 + }, + { + "epoch": 11.569743798291988, + "grad_norm": 0.006190272984544654, + "learning_rate": 6.281952747568764e-07, + "loss": 0.0001, + "step": 28450 + }, + { + "epoch": 11.570150467669784, + "grad_norm": 0.007610866671474196, + "learning_rate": 6.278418773275019e-07, + "loss": 0.0001, + "step": 28451 + }, + { + "epoch": 11.57055713704758, + "grad_norm": 0.0011593112858501661, + "learning_rate": 6.274885761080474e-07, + "loss": 0.0, + "step": 28452 + }, + { + "epoch": 11.570963806425375, + "grad_norm": 0.004137149748558203, + "learning_rate": 6.27135371102141e-07, + "loss": 0.0, + "step": 28453 + }, + { + "epoch": 11.571370475803173, + "grad_norm": 1.3833566188399038, + "learning_rate": 6.267822623134067e-07, + "loss": 0.015, + "step": 28454 + }, + { + "epoch": 11.571777145180969, + "grad_norm": 
0.015356849009877716, + "learning_rate": 6.264292497454705e-07, + "loss": 0.0001, + "step": 28455 + }, + { + "epoch": 11.572183814558764, + "grad_norm": 0.01345004140867646, + "learning_rate": 6.260763334019549e-07, + "loss": 0.0001, + "step": 28456 + }, + { + "epoch": 11.57259048393656, + "grad_norm": 0.008696984284687348, + "learning_rate": 6.257235132864836e-07, + "loss": 0.0001, + "step": 28457 + }, + { + "epoch": 11.572997153314356, + "grad_norm": 0.010491547473044106, + "learning_rate": 6.253707894026784e-07, + "loss": 0.0001, + "step": 28458 + }, + { + "epoch": 11.573403822692152, + "grad_norm": 0.00038056066128551723, + "learning_rate": 6.250181617541596e-07, + "loss": 0.0, + "step": 28459 + }, + { + "epoch": 11.573810492069947, + "grad_norm": 0.0015382853962342775, + "learning_rate": 6.246656303445498e-07, + "loss": 0.0, + "step": 28460 + }, + { + "epoch": 11.574217161447743, + "grad_norm": 0.0002378099115960142, + "learning_rate": 6.243131951774639e-07, + "loss": 0.0, + "step": 28461 + }, + { + "epoch": 11.574623830825539, + "grad_norm": 0.0076525386060671, + "learning_rate": 6.239608562565225e-07, + "loss": 0.0001, + "step": 28462 + }, + { + "epoch": 11.575030500203335, + "grad_norm": 0.00027076129584387143, + "learning_rate": 6.236086135853425e-07, + "loss": 0.0, + "step": 28463 + }, + { + "epoch": 11.57543716958113, + "grad_norm": 0.01508451486282063, + "learning_rate": 6.232564671675378e-07, + "loss": 0.0002, + "step": 28464 + }, + { + "epoch": 11.575843838958926, + "grad_norm": 0.016628739721742804, + "learning_rate": 6.229044170067244e-07, + "loss": 0.0001, + "step": 28465 + }, + { + "epoch": 11.576250508336722, + "grad_norm": 0.0004232558825943296, + "learning_rate": 6.225524631065172e-07, + "loss": 0.0, + "step": 28466 + }, + { + "epoch": 11.576657177714518, + "grad_norm": 0.005719231921605169, + "learning_rate": 6.222006054705287e-07, + "loss": 0.0, + "step": 28467 + }, + { + "epoch": 11.577063847092314, + "grad_norm": 0.19072767545507885, + "learning_rate": 6.218488441023707e-07, + "loss": 0.0011, + "step": 28468 + }, + { + "epoch": 11.57747051647011, + "grad_norm": 0.017723344194337364, + "learning_rate": 6.214971790056545e-07, + "loss": 0.0001, + "step": 28469 + }, + { + "epoch": 11.577877185847905, + "grad_norm": 0.00015964200539676733, + "learning_rate": 6.211456101839897e-07, + "loss": 0.0, + "step": 28470 + }, + { + "epoch": 11.5782838552257, + "grad_norm": 0.008188676839613175, + "learning_rate": 6.207941376409843e-07, + "loss": 0.0001, + "step": 28471 + }, + { + "epoch": 11.578690524603497, + "grad_norm": 0.008890243646302266, + "learning_rate": 6.204427613802488e-07, + "loss": 0.0001, + "step": 28472 + }, + { + "epoch": 11.579097193981294, + "grad_norm": 0.005814135869893964, + "learning_rate": 6.200914814053893e-07, + "loss": 0.0, + "step": 28473 + }, + { + "epoch": 11.57950386335909, + "grad_norm": 0.0004209430779235547, + "learning_rate": 6.197402977200107e-07, + "loss": 0.0, + "step": 28474 + }, + { + "epoch": 11.579910532736886, + "grad_norm": 9.400320758874508e-05, + "learning_rate": 6.193892103277199e-07, + "loss": 0.0, + "step": 28475 + }, + { + "epoch": 11.580317202114681, + "grad_norm": 0.0018742132383115888, + "learning_rate": 6.190382192321198e-07, + "loss": 0.0, + "step": 28476 + }, + { + "epoch": 11.580723871492477, + "grad_norm": 0.07379697232348117, + "learning_rate": 6.18687324436813e-07, + "loss": 0.0006, + "step": 28477 + }, + { + "epoch": 11.581130540870273, + "grad_norm": 0.018174342324337407, + "learning_rate": 6.18336525945401e-07, + 
"loss": 0.0002, + "step": 28478 + }, + { + "epoch": 11.581537210248069, + "grad_norm": 0.006548871907223666, + "learning_rate": 6.179858237614888e-07, + "loss": 0.0001, + "step": 28479 + }, + { + "epoch": 11.581943879625864, + "grad_norm": 0.00394560555559393, + "learning_rate": 6.176352178886735e-07, + "loss": 0.0, + "step": 28480 + }, + { + "epoch": 11.58235054900366, + "grad_norm": 0.003506155024327733, + "learning_rate": 6.172847083305544e-07, + "loss": 0.0, + "step": 28481 + }, + { + "epoch": 11.582757218381456, + "grad_norm": 0.0001091524669739156, + "learning_rate": 6.169342950907298e-07, + "loss": 0.0, + "step": 28482 + }, + { + "epoch": 11.583163887759252, + "grad_norm": 0.06288220626845867, + "learning_rate": 6.165839781727979e-07, + "loss": 0.0004, + "step": 28483 + }, + { + "epoch": 11.583570557137048, + "grad_norm": 0.0011782742145362013, + "learning_rate": 6.162337575803512e-07, + "loss": 0.0, + "step": 28484 + }, + { + "epoch": 11.583977226514843, + "grad_norm": 0.0034084765320507118, + "learning_rate": 6.158836333169905e-07, + "loss": 0.0, + "step": 28485 + }, + { + "epoch": 11.584383895892639, + "grad_norm": 0.0014013260348467278, + "learning_rate": 6.155336053863071e-07, + "loss": 0.0, + "step": 28486 + }, + { + "epoch": 11.584790565270435, + "grad_norm": 0.011171306753871539, + "learning_rate": 6.151836737918937e-07, + "loss": 0.0001, + "step": 28487 + }, + { + "epoch": 11.58519723464823, + "grad_norm": 0.009830190228253724, + "learning_rate": 6.148338385373431e-07, + "loss": 0.0, + "step": 28488 + }, + { + "epoch": 11.585603904026026, + "grad_norm": 0.09113514035404575, + "learning_rate": 6.144840996262468e-07, + "loss": 0.0008, + "step": 28489 + }, + { + "epoch": 11.586010573403822, + "grad_norm": 0.0020113846145816674, + "learning_rate": 6.141344570621954e-07, + "loss": 0.0, + "step": 28490 + }, + { + "epoch": 11.586417242781618, + "grad_norm": 4.1111702621670135e-05, + "learning_rate": 6.137849108487759e-07, + "loss": 0.0, + "step": 28491 + }, + { + "epoch": 11.586823912159414, + "grad_norm": 9.914252212867299e-05, + "learning_rate": 6.134354609895798e-07, + "loss": 0.0, + "step": 28492 + }, + { + "epoch": 11.58723058153721, + "grad_norm": 0.006282459133094135, + "learning_rate": 6.130861074881922e-07, + "loss": 0.0001, + "step": 28493 + }, + { + "epoch": 11.587637250915005, + "grad_norm": 0.0023779501522741137, + "learning_rate": 6.127368503482e-07, + "loss": 0.0, + "step": 28494 + }, + { + "epoch": 11.588043920292803, + "grad_norm": 0.0001576994682840719, + "learning_rate": 6.123876895731895e-07, + "loss": 0.0, + "step": 28495 + }, + { + "epoch": 11.588450589670598, + "grad_norm": 0.04546231931941146, + "learning_rate": 6.120386251667443e-07, + "loss": 0.0006, + "step": 28496 + }, + { + "epoch": 11.588857259048394, + "grad_norm": 0.20181934744849098, + "learning_rate": 6.116896571324449e-07, + "loss": 0.0014, + "step": 28497 + }, + { + "epoch": 11.58926392842619, + "grad_norm": 0.0005786419238810363, + "learning_rate": 6.113407854738784e-07, + "loss": 0.0, + "step": 28498 + }, + { + "epoch": 11.589670597803986, + "grad_norm": 0.17060131105615495, + "learning_rate": 6.109920101946232e-07, + "loss": 0.0013, + "step": 28499 + }, + { + "epoch": 11.590077267181782, + "grad_norm": 0.043388206509475044, + "learning_rate": 6.106433312982607e-07, + "loss": 0.0003, + "step": 28500 + }, + { + "epoch": 11.590483936559577, + "grad_norm": 0.019866234395655366, + "learning_rate": 6.102947487883703e-07, + "loss": 0.0001, + "step": 28501 + }, + { + "epoch": 11.590890605937373, + 
"grad_norm": 0.08487152616567728, + "learning_rate": 6.099462626685293e-07, + "loss": 0.0006, + "step": 28502 + }, + { + "epoch": 11.591297275315169, + "grad_norm": 0.01614595260206037, + "learning_rate": 6.095978729423169e-07, + "loss": 0.0002, + "step": 28503 + }, + { + "epoch": 11.591703944692965, + "grad_norm": 3.5696521826537776e-05, + "learning_rate": 6.092495796133057e-07, + "loss": 0.0, + "step": 28504 + }, + { + "epoch": 11.59211061407076, + "grad_norm": 0.0004376781221356267, + "learning_rate": 6.089013826850765e-07, + "loss": 0.0, + "step": 28505 + }, + { + "epoch": 11.592517283448556, + "grad_norm": 0.00015070968506849757, + "learning_rate": 6.085532821612006e-07, + "loss": 0.0, + "step": 28506 + }, + { + "epoch": 11.592923952826352, + "grad_norm": 0.00415397699935422, + "learning_rate": 6.082052780452518e-07, + "loss": 0.0, + "step": 28507 + }, + { + "epoch": 11.593330622204148, + "grad_norm": 0.10771563910819437, + "learning_rate": 6.07857370340803e-07, + "loss": 0.0011, + "step": 28508 + }, + { + "epoch": 11.593737291581943, + "grad_norm": 1.3470653684200276, + "learning_rate": 6.075095590514246e-07, + "loss": 0.0135, + "step": 28509 + }, + { + "epoch": 11.59414396095974, + "grad_norm": 0.01282181553837556, + "learning_rate": 6.07161844180687e-07, + "loss": 0.0001, + "step": 28510 + }, + { + "epoch": 11.594550630337535, + "grad_norm": 0.006114969504681378, + "learning_rate": 6.068142257321608e-07, + "loss": 0.0001, + "step": 28511 + }, + { + "epoch": 11.59495729971533, + "grad_norm": 0.03848128973701953, + "learning_rate": 6.064667037094151e-07, + "loss": 0.0001, + "step": 28512 + }, + { + "epoch": 11.595363969093126, + "grad_norm": 0.0072029569960434525, + "learning_rate": 6.061192781160164e-07, + "loss": 0.0001, + "step": 28513 + }, + { + "epoch": 11.595770638470924, + "grad_norm": 0.002159437069813623, + "learning_rate": 6.057719489555302e-07, + "loss": 0.0, + "step": 28514 + }, + { + "epoch": 11.59617730784872, + "grad_norm": 0.015021528244102305, + "learning_rate": 6.054247162315229e-07, + "loss": 0.0001, + "step": 28515 + }, + { + "epoch": 11.596583977226516, + "grad_norm": 0.1457906546765074, + "learning_rate": 6.050775799475605e-07, + "loss": 0.001, + "step": 28516 + }, + { + "epoch": 11.596990646604311, + "grad_norm": 2.1554259386518894e-05, + "learning_rate": 6.047305401072023e-07, + "loss": 0.0, + "step": 28517 + }, + { + "epoch": 11.597397315982107, + "grad_norm": 0.003416753981719313, + "learning_rate": 6.043835967140155e-07, + "loss": 0.0, + "step": 28518 + }, + { + "epoch": 11.597803985359903, + "grad_norm": 0.04684799323444679, + "learning_rate": 6.040367497715605e-07, + "loss": 0.0004, + "step": 28519 + }, + { + "epoch": 11.598210654737699, + "grad_norm": 0.0018402002532785673, + "learning_rate": 6.036899992833967e-07, + "loss": 0.0, + "step": 28520 + }, + { + "epoch": 11.598617324115494, + "grad_norm": 0.0007245577903402676, + "learning_rate": 6.033433452530835e-07, + "loss": 0.0, + "step": 28521 + }, + { + "epoch": 11.59902399349329, + "grad_norm": 0.0012229253702608453, + "learning_rate": 6.029967876841803e-07, + "loss": 0.0, + "step": 28522 + }, + { + "epoch": 11.599430662871086, + "grad_norm": 0.0003156547756055498, + "learning_rate": 6.026503265802431e-07, + "loss": 0.0, + "step": 28523 + }, + { + "epoch": 11.599837332248882, + "grad_norm": 0.015440208015510517, + "learning_rate": 6.023039619448323e-07, + "loss": 0.0001, + "step": 28524 + }, + { + "epoch": 11.600244001626677, + "grad_norm": 0.0018777556220803428, + "learning_rate": 
6.019576937814997e-07, + "loss": 0.0, + "step": 28525 + }, + { + "epoch": 11.600650671004473, + "grad_norm": 0.0008178957221617745, + "learning_rate": 6.016115220938024e-07, + "loss": 0.0, + "step": 28526 + }, + { + "epoch": 11.601057340382269, + "grad_norm": 0.45770064510537084, + "learning_rate": 6.012654468852918e-07, + "loss": 0.0045, + "step": 28527 + }, + { + "epoch": 11.601464009760065, + "grad_norm": 0.0033427015526174995, + "learning_rate": 6.009194681595231e-07, + "loss": 0.0, + "step": 28528 + }, + { + "epoch": 11.60187067913786, + "grad_norm": 0.2025191572505417, + "learning_rate": 6.005735859200456e-07, + "loss": 0.0018, + "step": 28529 + }, + { + "epoch": 11.602277348515656, + "grad_norm": 0.0024551843535863408, + "learning_rate": 6.002278001704109e-07, + "loss": 0.0, + "step": 28530 + }, + { + "epoch": 11.602684017893452, + "grad_norm": 0.004258496303282125, + "learning_rate": 5.998821109141683e-07, + "loss": 0.0, + "step": 28531 + }, + { + "epoch": 11.603090687271248, + "grad_norm": 0.003032959286184809, + "learning_rate": 5.995365181548673e-07, + "loss": 0.0, + "step": 28532 + }, + { + "epoch": 11.603497356649044, + "grad_norm": 0.0011438841838598292, + "learning_rate": 5.99191021896055e-07, + "loss": 0.0, + "step": 28533 + }, + { + "epoch": 11.60390402602684, + "grad_norm": 0.29443613514072275, + "learning_rate": 5.988456221412786e-07, + "loss": 0.0013, + "step": 28534 + }, + { + "epoch": 11.604310695404635, + "grad_norm": 0.0014631382776366746, + "learning_rate": 5.985003188940819e-07, + "loss": 0.0, + "step": 28535 + }, + { + "epoch": 11.604717364782433, + "grad_norm": 0.2840304966247259, + "learning_rate": 5.98155112158012e-07, + "loss": 0.0012, + "step": 28536 + }, + { + "epoch": 11.605124034160228, + "grad_norm": 0.0002771516856016321, + "learning_rate": 5.978100019366117e-07, + "loss": 0.0, + "step": 28537 + }, + { + "epoch": 11.605530703538024, + "grad_norm": 0.00470804137239316, + "learning_rate": 5.97464988233425e-07, + "loss": 0.0, + "step": 28538 + }, + { + "epoch": 11.60593737291582, + "grad_norm": 0.0028478193432094913, + "learning_rate": 5.971200710519909e-07, + "loss": 0.0, + "step": 28539 + }, + { + "epoch": 11.606344042293616, + "grad_norm": 0.12558914366753796, + "learning_rate": 5.967752503958523e-07, + "loss": 0.0006, + "step": 28540 + }, + { + "epoch": 11.606750711671411, + "grad_norm": 0.00229347387937901, + "learning_rate": 5.964305262685466e-07, + "loss": 0.0, + "step": 28541 + }, + { + "epoch": 11.607157381049207, + "grad_norm": 0.030660416167039215, + "learning_rate": 5.960858986736162e-07, + "loss": 0.0004, + "step": 28542 + }, + { + "epoch": 11.607564050427003, + "grad_norm": 0.20908531894604476, + "learning_rate": 5.957413676145962e-07, + "loss": 0.002, + "step": 28543 + }, + { + "epoch": 11.607970719804799, + "grad_norm": 0.0010384112849035655, + "learning_rate": 5.953969330950249e-07, + "loss": 0.0, + "step": 28544 + }, + { + "epoch": 11.608377389182595, + "grad_norm": 0.0002295160428157754, + "learning_rate": 5.950525951184361e-07, + "loss": 0.0, + "step": 28545 + }, + { + "epoch": 11.60878405856039, + "grad_norm": 0.016728853286899888, + "learning_rate": 5.947083536883669e-07, + "loss": 0.0001, + "step": 28546 + }, + { + "epoch": 11.609190727938186, + "grad_norm": 0.008197418958462844, + "learning_rate": 5.943642088083501e-07, + "loss": 0.0001, + "step": 28547 + }, + { + "epoch": 11.609597397315982, + "grad_norm": 0.009351938475109374, + "learning_rate": 5.940201604819162e-07, + "loss": 0.0001, + "step": 28548 + }, + { + "epoch": 
11.610004066693778, + "grad_norm": 0.008986672559844163, + "learning_rate": 5.936762087126013e-07, + "loss": 0.0001, + "step": 28549 + }, + { + "epoch": 11.610410736071573, + "grad_norm": 0.0004597267811517949, + "learning_rate": 5.933323535039348e-07, + "loss": 0.0, + "step": 28550 + }, + { + "epoch": 11.61081740544937, + "grad_norm": 7.1914763738867125e-06, + "learning_rate": 5.929885948594449e-07, + "loss": 0.0, + "step": 28551 + }, + { + "epoch": 11.611224074827165, + "grad_norm": 0.045794114753530575, + "learning_rate": 5.926449327826622e-07, + "loss": 0.0005, + "step": 28552 + }, + { + "epoch": 11.61163074420496, + "grad_norm": 0.023739501476873773, + "learning_rate": 5.923013672771128e-07, + "loss": 0.0002, + "step": 28553 + }, + { + "epoch": 11.612037413582756, + "grad_norm": 0.0005296057531634768, + "learning_rate": 5.919578983463247e-07, + "loss": 0.0, + "step": 28554 + }, + { + "epoch": 11.612444082960554, + "grad_norm": 0.04996982951161352, + "learning_rate": 5.916145259938244e-07, + "loss": 0.0006, + "step": 28555 + }, + { + "epoch": 11.61285075233835, + "grad_norm": 0.001795911134732134, + "learning_rate": 5.912712502231366e-07, + "loss": 0.0, + "step": 28556 + }, + { + "epoch": 11.613257421716146, + "grad_norm": 0.0020455979468008588, + "learning_rate": 5.90928071037784e-07, + "loss": 0.0, + "step": 28557 + }, + { + "epoch": 11.613664091093941, + "grad_norm": 0.0005379295782999408, + "learning_rate": 5.905849884412907e-07, + "loss": 0.0, + "step": 28558 + }, + { + "epoch": 11.614070760471737, + "grad_norm": 0.027000775810593843, + "learning_rate": 5.90242002437178e-07, + "loss": 0.0002, + "step": 28559 + }, + { + "epoch": 11.614477429849533, + "grad_norm": 0.06743752143229513, + "learning_rate": 5.898991130289666e-07, + "loss": 0.0007, + "step": 28560 + }, + { + "epoch": 11.614884099227329, + "grad_norm": 0.16151169250646052, + "learning_rate": 5.895563202201759e-07, + "loss": 0.0014, + "step": 28561 + }, + { + "epoch": 11.615290768605124, + "grad_norm": 0.0015060126134030633, + "learning_rate": 5.892136240143275e-07, + "loss": 0.0, + "step": 28562 + }, + { + "epoch": 11.61569743798292, + "grad_norm": 0.0006048680091734488, + "learning_rate": 5.888710244149376e-07, + "loss": 0.0, + "step": 28563 + }, + { + "epoch": 11.616104107360716, + "grad_norm": 0.002933946109931358, + "learning_rate": 5.885285214255221e-07, + "loss": 0.0, + "step": 28564 + }, + { + "epoch": 11.616510776738512, + "grad_norm": 0.014526046958192356, + "learning_rate": 5.881861150495994e-07, + "loss": 0.0001, + "step": 28565 + }, + { + "epoch": 11.616917446116307, + "grad_norm": 0.015929921923352075, + "learning_rate": 5.878438052906821e-07, + "loss": 0.0001, + "step": 28566 + }, + { + "epoch": 11.617324115494103, + "grad_norm": 0.007424428888302819, + "learning_rate": 5.875015921522853e-07, + "loss": 0.0001, + "step": 28567 + }, + { + "epoch": 11.617730784871899, + "grad_norm": 0.0033085521635196637, + "learning_rate": 5.871594756379207e-07, + "loss": 0.0, + "step": 28568 + }, + { + "epoch": 11.618137454249695, + "grad_norm": 0.005591054352897819, + "learning_rate": 5.86817455751103e-07, + "loss": 0.0001, + "step": 28569 + }, + { + "epoch": 11.61854412362749, + "grad_norm": 0.003625796305820937, + "learning_rate": 5.864755324953408e-07, + "loss": 0.0, + "step": 28570 + }, + { + "epoch": 11.618950793005286, + "grad_norm": 0.04436689257467045, + "learning_rate": 5.861337058741457e-07, + "loss": 0.0004, + "step": 28571 + }, + { + "epoch": 11.619357462383082, + "grad_norm": 0.027217758733365547, + 
"learning_rate": 5.857919758910258e-07, + "loss": 0.0002, + "step": 28572 + }, + { + "epoch": 11.619764131760878, + "grad_norm": 0.0017536572624909867, + "learning_rate": 5.854503425494896e-07, + "loss": 0.0, + "step": 28573 + }, + { + "epoch": 11.620170801138674, + "grad_norm": 0.016292915393511905, + "learning_rate": 5.851088058530429e-07, + "loss": 0.0001, + "step": 28574 + }, + { + "epoch": 11.62057747051647, + "grad_norm": 0.000983479551853757, + "learning_rate": 5.847673658051933e-07, + "loss": 0.0, + "step": 28575 + }, + { + "epoch": 11.620984139894265, + "grad_norm": 7.915107454242794e-05, + "learning_rate": 5.844260224094456e-07, + "loss": 0.0, + "step": 28576 + }, + { + "epoch": 11.621390809272063, + "grad_norm": 0.0003081907007804461, + "learning_rate": 5.840847756693035e-07, + "loss": 0.0, + "step": 28577 + }, + { + "epoch": 11.621797478649858, + "grad_norm": 0.13708171075070677, + "learning_rate": 5.8374362558827e-07, + "loss": 0.001, + "step": 28578 + }, + { + "epoch": 11.622204148027654, + "grad_norm": 0.0517113884338901, + "learning_rate": 5.834025721698478e-07, + "loss": 0.0005, + "step": 28579 + }, + { + "epoch": 11.62261081740545, + "grad_norm": 0.0013558517068334335, + "learning_rate": 5.830616154175372e-07, + "loss": 0.0, + "step": 28580 + }, + { + "epoch": 11.623017486783246, + "grad_norm": 0.02284036597489386, + "learning_rate": 5.827207553348368e-07, + "loss": 0.0002, + "step": 28581 + }, + { + "epoch": 11.623424156161041, + "grad_norm": 0.013201268395905538, + "learning_rate": 5.823799919252493e-07, + "loss": 0.0001, + "step": 28582 + }, + { + "epoch": 11.623830825538837, + "grad_norm": 0.0006626328149718919, + "learning_rate": 5.820393251922707e-07, + "loss": 0.0, + "step": 28583 + }, + { + "epoch": 11.624237494916633, + "grad_norm": 0.02963547883255961, + "learning_rate": 5.816987551393982e-07, + "loss": 0.0002, + "step": 28584 + }, + { + "epoch": 11.624644164294429, + "grad_norm": 2.132974221056346, + "learning_rate": 5.81358281770128e-07, + "loss": 0.0277, + "step": 28585 + }, + { + "epoch": 11.625050833672224, + "grad_norm": 8.622720971489981e-05, + "learning_rate": 5.810179050879561e-07, + "loss": 0.0, + "step": 28586 + }, + { + "epoch": 11.62545750305002, + "grad_norm": 0.018129417259334202, + "learning_rate": 5.806776250963742e-07, + "loss": 0.0001, + "step": 28587 + }, + { + "epoch": 11.625864172427816, + "grad_norm": 0.01341468108160583, + "learning_rate": 5.803374417988783e-07, + "loss": 0.0001, + "step": 28588 + }, + { + "epoch": 11.626270841805612, + "grad_norm": 8.327099025179641e-05, + "learning_rate": 5.799973551989602e-07, + "loss": 0.0, + "step": 28589 + }, + { + "epoch": 11.626677511183408, + "grad_norm": 0.0006486678944887154, + "learning_rate": 5.796573653001091e-07, + "loss": 0.0, + "step": 28590 + }, + { + "epoch": 11.627084180561203, + "grad_norm": 0.0009015983344985211, + "learning_rate": 5.79317472105817e-07, + "loss": 0.0, + "step": 28591 + }, + { + "epoch": 11.627490849938999, + "grad_norm": 0.007827524921030098, + "learning_rate": 5.78977675619572e-07, + "loss": 0.0001, + "step": 28592 + }, + { + "epoch": 11.627897519316795, + "grad_norm": 0.06786372692699039, + "learning_rate": 5.786379758448624e-07, + "loss": 0.0007, + "step": 28593 + }, + { + "epoch": 11.62830418869459, + "grad_norm": 0.00017585381529474462, + "learning_rate": 5.782983727851743e-07, + "loss": 0.0, + "step": 28594 + }, + { + "epoch": 11.628710858072386, + "grad_norm": 0.005045030058151903, + "learning_rate": 5.779588664439972e-07, + "loss": 0.0, + "step": 28595 + }, + 
{ + "epoch": 11.629117527450184, + "grad_norm": 0.09447912609073202, + "learning_rate": 5.776194568248139e-07, + "loss": 0.0008, + "step": 28596 + }, + { + "epoch": 11.62952419682798, + "grad_norm": 0.007734158098035072, + "learning_rate": 5.772801439311094e-07, + "loss": 0.0, + "step": 28597 + }, + { + "epoch": 11.629930866205775, + "grad_norm": 8.792861867117335e-05, + "learning_rate": 5.769409277663673e-07, + "loss": 0.0, + "step": 28598 + }, + { + "epoch": 11.630337535583571, + "grad_norm": 0.053555702206193594, + "learning_rate": 5.766018083340675e-07, + "loss": 0.0005, + "step": 28599 + }, + { + "epoch": 11.630744204961367, + "grad_norm": 0.020315779037778165, + "learning_rate": 5.762627856376923e-07, + "loss": 0.0001, + "step": 28600 + }, + { + "epoch": 11.631150874339163, + "grad_norm": 0.008166592086151673, + "learning_rate": 5.759238596807238e-07, + "loss": 0.0001, + "step": 28601 + }, + { + "epoch": 11.631557543716958, + "grad_norm": 0.05511542629538253, + "learning_rate": 5.755850304666399e-07, + "loss": 0.0003, + "step": 28602 + }, + { + "epoch": 11.631964213094754, + "grad_norm": 0.004887341126734707, + "learning_rate": 5.752462979989193e-07, + "loss": 0.0, + "step": 28603 + }, + { + "epoch": 11.63237088247255, + "grad_norm": 0.06626446526971473, + "learning_rate": 5.749076622810378e-07, + "loss": 0.0008, + "step": 28604 + }, + { + "epoch": 11.632777551850346, + "grad_norm": 0.0008473495403736669, + "learning_rate": 5.74569123316474e-07, + "loss": 0.0, + "step": 28605 + }, + { + "epoch": 11.633184221228142, + "grad_norm": 0.011223400856758804, + "learning_rate": 5.742306811087006e-07, + "loss": 0.0001, + "step": 28606 + }, + { + "epoch": 11.633590890605937, + "grad_norm": 0.0036460101087974935, + "learning_rate": 5.738923356611925e-07, + "loss": 0.0, + "step": 28607 + }, + { + "epoch": 11.633997559983733, + "grad_norm": 0.02338451593187133, + "learning_rate": 5.735540869774248e-07, + "loss": 0.0003, + "step": 28608 + }, + { + "epoch": 11.634404229361529, + "grad_norm": 0.001645353170413464, + "learning_rate": 5.73215935060869e-07, + "loss": 0.0, + "step": 28609 + }, + { + "epoch": 11.634810898739325, + "grad_norm": 0.005040529633393446, + "learning_rate": 5.728778799149959e-07, + "loss": 0.0001, + "step": 28610 + }, + { + "epoch": 11.63521756811712, + "grad_norm": 0.004381686173324936, + "learning_rate": 5.725399215432759e-07, + "loss": 0.0, + "step": 28611 + }, + { + "epoch": 11.635624237494916, + "grad_norm": 0.2141500605831163, + "learning_rate": 5.722020599491773e-07, + "loss": 0.0022, + "step": 28612 + }, + { + "epoch": 11.636030906872712, + "grad_norm": 0.07344204977828193, + "learning_rate": 5.718642951361697e-07, + "loss": 0.0007, + "step": 28613 + }, + { + "epoch": 11.636437576250508, + "grad_norm": 0.0011886955801931147, + "learning_rate": 5.715266271077202e-07, + "loss": 0.0, + "step": 28614 + }, + { + "epoch": 11.636844245628303, + "grad_norm": 0.009759263022964692, + "learning_rate": 5.711890558672961e-07, + "loss": 0.0001, + "step": 28615 + }, + { + "epoch": 11.6372509150061, + "grad_norm": 0.006018468752677807, + "learning_rate": 5.708515814183612e-07, + "loss": 0.0001, + "step": 28616 + }, + { + "epoch": 11.637657584383895, + "grad_norm": 0.0062397326653312836, + "learning_rate": 5.705142037643807e-07, + "loss": 0.0, + "step": 28617 + }, + { + "epoch": 11.638064253761693, + "grad_norm": 0.01122659113907579, + "learning_rate": 5.701769229088183e-07, + "loss": 0.0001, + "step": 28618 + }, + { + "epoch": 11.638470923139488, + "grad_norm": 0.005716942385847239, + 
"learning_rate": 5.698397388551347e-07, + "loss": 0.0001, + "step": 28619 + }, + { + "epoch": 11.638877592517284, + "grad_norm": 0.0019180789059676713, + "learning_rate": 5.695026516067914e-07, + "loss": 0.0, + "step": 28620 + }, + { + "epoch": 11.63928426189508, + "grad_norm": 0.0007206492447504717, + "learning_rate": 5.691656611672503e-07, + "loss": 0.0, + "step": 28621 + }, + { + "epoch": 11.639690931272876, + "grad_norm": 0.012450447981203138, + "learning_rate": 5.688287675399706e-07, + "loss": 0.0001, + "step": 28622 + }, + { + "epoch": 11.640097600650671, + "grad_norm": 0.0002719910742343679, + "learning_rate": 5.684919707284109e-07, + "loss": 0.0, + "step": 28623 + }, + { + "epoch": 11.640504270028467, + "grad_norm": 0.05080817701423609, + "learning_rate": 5.681552707360271e-07, + "loss": 0.0006, + "step": 28624 + }, + { + "epoch": 11.640910939406263, + "grad_norm": 0.002292869808048092, + "learning_rate": 5.678186675662767e-07, + "loss": 0.0, + "step": 28625 + }, + { + "epoch": 11.641317608784059, + "grad_norm": 6.841523364015299, + "learning_rate": 5.674821612226133e-07, + "loss": 0.0624, + "step": 28626 + }, + { + "epoch": 11.641724278161854, + "grad_norm": 0.0011622011128366043, + "learning_rate": 5.671457517084944e-07, + "loss": 0.0, + "step": 28627 + }, + { + "epoch": 11.64213094753965, + "grad_norm": 0.09851305935544054, + "learning_rate": 5.668094390273727e-07, + "loss": 0.0006, + "step": 28628 + }, + { + "epoch": 11.642537616917446, + "grad_norm": 0.15603491078621845, + "learning_rate": 5.664732231826986e-07, + "loss": 0.0013, + "step": 28629 + }, + { + "epoch": 11.642944286295242, + "grad_norm": 0.30326989924478076, + "learning_rate": 5.661371041779251e-07, + "loss": 0.0011, + "step": 28630 + }, + { + "epoch": 11.643350955673037, + "grad_norm": 0.0009513921651130082, + "learning_rate": 5.658010820164994e-07, + "loss": 0.0, + "step": 28631 + }, + { + "epoch": 11.643757625050833, + "grad_norm": 0.0015056304266901322, + "learning_rate": 5.654651567018765e-07, + "loss": 0.0, + "step": 28632 + }, + { + "epoch": 11.644164294428629, + "grad_norm": 0.002020113981848837, + "learning_rate": 5.651293282375015e-07, + "loss": 0.0, + "step": 28633 + }, + { + "epoch": 11.644570963806425, + "grad_norm": 0.000845085993377217, + "learning_rate": 5.647935966268225e-07, + "loss": 0.0, + "step": 28634 + }, + { + "epoch": 11.64497763318422, + "grad_norm": 0.002539693503832482, + "learning_rate": 5.644579618732859e-07, + "loss": 0.0, + "step": 28635 + }, + { + "epoch": 11.645384302562016, + "grad_norm": 0.041398195017554984, + "learning_rate": 5.641224239803378e-07, + "loss": 0.0004, + "step": 28636 + }, + { + "epoch": 11.645790971939814, + "grad_norm": 0.1028838610848241, + "learning_rate": 5.637869829514209e-07, + "loss": 0.0008, + "step": 28637 + }, + { + "epoch": 11.64619764131761, + "grad_norm": 0.03571482327665763, + "learning_rate": 5.634516387899791e-07, + "loss": 0.0004, + "step": 28638 + }, + { + "epoch": 11.646604310695405, + "grad_norm": 0.00026973422341301643, + "learning_rate": 5.631163914994564e-07, + "loss": 0.0, + "step": 28639 + }, + { + "epoch": 11.647010980073201, + "grad_norm": 0.0008883518998922466, + "learning_rate": 5.627812410832944e-07, + "loss": 0.0, + "step": 28640 + }, + { + "epoch": 11.647417649450997, + "grad_norm": 0.00458854227147413, + "learning_rate": 5.624461875449316e-07, + "loss": 0.0, + "step": 28641 + }, + { + "epoch": 11.647824318828793, + "grad_norm": 0.08675868416351704, + "learning_rate": 5.621112308878085e-07, + "loss": 0.0005, + "step": 28642 + }, + 
{ + "epoch": 11.648230988206588, + "grad_norm": 0.05018204077166309, + "learning_rate": 5.617763711153645e-07, + "loss": 0.0001, + "step": 28643 + }, + { + "epoch": 11.648637657584384, + "grad_norm": 0.08289427534315993, + "learning_rate": 5.614416082310348e-07, + "loss": 0.0007, + "step": 28644 + }, + { + "epoch": 11.64904432696218, + "grad_norm": 0.09128869209134668, + "learning_rate": 5.611069422382576e-07, + "loss": 0.0009, + "step": 28645 + }, + { + "epoch": 11.649450996339976, + "grad_norm": 0.00198891239556115, + "learning_rate": 5.60772373140469e-07, + "loss": 0.0, + "step": 28646 + }, + { + "epoch": 11.649857665717771, + "grad_norm": 0.01020105951414055, + "learning_rate": 5.604379009411021e-07, + "loss": 0.0001, + "step": 28647 + }, + { + "epoch": 11.650264335095567, + "grad_norm": 0.007368120886725682, + "learning_rate": 5.601035256435916e-07, + "loss": 0.0, + "step": 28648 + }, + { + "epoch": 11.650671004473363, + "grad_norm": 0.25831580485252614, + "learning_rate": 5.597692472513683e-07, + "loss": 0.0027, + "step": 28649 + }, + { + "epoch": 11.651077673851159, + "grad_norm": 0.010210775126644431, + "learning_rate": 5.59435065767866e-07, + "loss": 0.0001, + "step": 28650 + }, + { + "epoch": 11.651484343228955, + "grad_norm": 0.02853824505560567, + "learning_rate": 5.59100981196512e-07, + "loss": 0.0003, + "step": 28651 + }, + { + "epoch": 11.65189101260675, + "grad_norm": 0.009765679091138091, + "learning_rate": 5.587669935407403e-07, + "loss": 0.0001, + "step": 28652 + }, + { + "epoch": 11.652297681984546, + "grad_norm": 0.014067733281906815, + "learning_rate": 5.584331028039758e-07, + "loss": 0.0001, + "step": 28653 + }, + { + "epoch": 11.652704351362342, + "grad_norm": 0.0005450787520404599, + "learning_rate": 5.58099308989648e-07, + "loss": 0.0, + "step": 28654 + }, + { + "epoch": 11.653111020740138, + "grad_norm": 0.022224385162136092, + "learning_rate": 5.577656121011832e-07, + "loss": 0.0002, + "step": 28655 + }, + { + "epoch": 11.653517690117933, + "grad_norm": 0.001893934692839406, + "learning_rate": 5.574320121420063e-07, + "loss": 0.0, + "step": 28656 + }, + { + "epoch": 11.65392435949573, + "grad_norm": 0.05320693575583469, + "learning_rate": 5.570985091155401e-07, + "loss": 0.0003, + "step": 28657 + }, + { + "epoch": 11.654331028873525, + "grad_norm": 0.08678764105463316, + "learning_rate": 5.567651030252119e-07, + "loss": 0.0007, + "step": 28658 + }, + { + "epoch": 11.654737698251322, + "grad_norm": 0.4457356041679, + "learning_rate": 5.564317938744435e-07, + "loss": 0.0057, + "step": 28659 + }, + { + "epoch": 11.655144367629118, + "grad_norm": 0.0060784971449056235, + "learning_rate": 5.560985816666542e-07, + "loss": 0.0, + "step": 28660 + }, + { + "epoch": 11.655551037006914, + "grad_norm": 0.009395391184162251, + "learning_rate": 5.55765466405267e-07, + "loss": 0.0001, + "step": 28661 + }, + { + "epoch": 11.65595770638471, + "grad_norm": 5.631097094868941, + "learning_rate": 5.554324480937001e-07, + "loss": 0.13, + "step": 28662 + }, + { + "epoch": 11.656364375762506, + "grad_norm": 0.019402879265641663, + "learning_rate": 5.55099526735372e-07, + "loss": 0.0002, + "step": 28663 + }, + { + "epoch": 11.656771045140301, + "grad_norm": 0.016872922031952165, + "learning_rate": 5.547667023337e-07, + "loss": 0.0001, + "step": 28664 + }, + { + "epoch": 11.657177714518097, + "grad_norm": 7.428655866841631e-05, + "learning_rate": 5.544339748921035e-07, + "loss": 0.0, + "step": 28665 + }, + { + "epoch": 11.657584383895893, + "grad_norm": 2.791368667051661, + 
"learning_rate": 5.541013444139953e-07, + "loss": 0.0215, + "step": 28666 + }, + { + "epoch": 11.657991053273689, + "grad_norm": 0.004433799825736294, + "learning_rate": 5.537688109027905e-07, + "loss": 0.0, + "step": 28667 + }, + { + "epoch": 11.658397722651484, + "grad_norm": 0.047134269730004424, + "learning_rate": 5.534363743619031e-07, + "loss": 0.0003, + "step": 28668 + }, + { + "epoch": 11.65880439202928, + "grad_norm": 6.593016614152896e-05, + "learning_rate": 5.531040347947459e-07, + "loss": 0.0, + "step": 28669 + }, + { + "epoch": 11.659211061407076, + "grad_norm": 0.047759011674943085, + "learning_rate": 5.527717922047293e-07, + "loss": 0.0004, + "step": 28670 + }, + { + "epoch": 11.659617730784872, + "grad_norm": 0.0017355304191690856, + "learning_rate": 5.524396465952664e-07, + "loss": 0.0, + "step": 28671 + }, + { + "epoch": 11.660024400162667, + "grad_norm": 0.007759670812675765, + "learning_rate": 5.521075979697643e-07, + "loss": 0.0, + "step": 28672 + }, + { + "epoch": 11.660431069540463, + "grad_norm": 0.002323348799767359, + "learning_rate": 5.517756463316337e-07, + "loss": 0.0, + "step": 28673 + }, + { + "epoch": 11.660837738918259, + "grad_norm": 0.0082211165471824, + "learning_rate": 5.514437916842807e-07, + "loss": 0.0001, + "step": 28674 + }, + { + "epoch": 11.661244408296055, + "grad_norm": 0.0039038541440496563, + "learning_rate": 5.511120340311138e-07, + "loss": 0.0, + "step": 28675 + }, + { + "epoch": 11.66165107767385, + "grad_norm": 0.006380433617490138, + "learning_rate": 5.507803733755357e-07, + "loss": 0.0001, + "step": 28676 + }, + { + "epoch": 11.662057747051646, + "grad_norm": 0.13629899692902253, + "learning_rate": 5.504488097209526e-07, + "loss": 0.0006, + "step": 28677 + }, + { + "epoch": 11.662464416429444, + "grad_norm": 0.00832112033178735, + "learning_rate": 5.501173430707696e-07, + "loss": 0.0001, + "step": 28678 + }, + { + "epoch": 11.66287108580724, + "grad_norm": 0.00010056939353150264, + "learning_rate": 5.497859734283873e-07, + "loss": 0.0, + "step": 28679 + }, + { + "epoch": 11.663277755185035, + "grad_norm": 0.1649441796133369, + "learning_rate": 5.494547007972084e-07, + "loss": 0.0018, + "step": 28680 + }, + { + "epoch": 11.663684424562831, + "grad_norm": 0.0031021717190181184, + "learning_rate": 5.491235251806338e-07, + "loss": 0.0, + "step": 28681 + }, + { + "epoch": 11.664091093940627, + "grad_norm": 0.03045727016995621, + "learning_rate": 5.487924465820627e-07, + "loss": 0.0003, + "step": 28682 + }, + { + "epoch": 11.664497763318423, + "grad_norm": 9.27670951403554e-05, + "learning_rate": 5.484614650048936e-07, + "loss": 0.0, + "step": 28683 + }, + { + "epoch": 11.664904432696218, + "grad_norm": 0.0006075091838889299, + "learning_rate": 5.481305804525228e-07, + "loss": 0.0, + "step": 28684 + }, + { + "epoch": 11.665311102074014, + "grad_norm": 3.178467995516113e-06, + "learning_rate": 5.477997929283507e-07, + "loss": 0.0, + "step": 28685 + }, + { + "epoch": 11.66571777145181, + "grad_norm": 0.043137142628265596, + "learning_rate": 5.474691024357704e-07, + "loss": 0.0004, + "step": 28686 + }, + { + "epoch": 11.666124440829606, + "grad_norm": 0.0007647385523796113, + "learning_rate": 5.471385089781777e-07, + "loss": 0.0, + "step": 28687 + }, + { + "epoch": 11.666531110207401, + "grad_norm": 0.0013068008566576926, + "learning_rate": 5.468080125589647e-07, + "loss": 0.0, + "step": 28688 + }, + { + "epoch": 11.666937779585197, + "grad_norm": 0.016123045644562628, + "learning_rate": 5.464776131815263e-07, + "loss": 0.0002, + "step": 28689 + 
}, + { + "epoch": 11.667344448962993, + "grad_norm": 0.0007284385019313131, + "learning_rate": 5.461473108492509e-07, + "loss": 0.0, + "step": 28690 + }, + { + "epoch": 11.667751118340789, + "grad_norm": 0.01800568981477033, + "learning_rate": 5.458171055655326e-07, + "loss": 0.0001, + "step": 28691 + }, + { + "epoch": 11.668157787718584, + "grad_norm": 0.0008129273784182361, + "learning_rate": 5.454869973337607e-07, + "loss": 0.0, + "step": 28692 + }, + { + "epoch": 11.66856445709638, + "grad_norm": 0.023063602049128717, + "learning_rate": 5.451569861573225e-07, + "loss": 0.0001, + "step": 28693 + }, + { + "epoch": 11.668971126474176, + "grad_norm": 0.0001909213448986994, + "learning_rate": 5.448270720396065e-07, + "loss": 0.0, + "step": 28694 + }, + { + "epoch": 11.669377795851972, + "grad_norm": 0.0005313154433376072, + "learning_rate": 5.444972549839988e-07, + "loss": 0.0, + "step": 28695 + }, + { + "epoch": 11.669784465229768, + "grad_norm": 2.3561060958625592e-05, + "learning_rate": 5.441675349938868e-07, + "loss": 0.0, + "step": 28696 + }, + { + "epoch": 11.670191134607563, + "grad_norm": 0.2086015835991447, + "learning_rate": 5.438379120726522e-07, + "loss": 0.0014, + "step": 28697 + }, + { + "epoch": 11.670597803985359, + "grad_norm": 6.749748366023312, + "learning_rate": 5.435083862236812e-07, + "loss": 0.018, + "step": 28698 + }, + { + "epoch": 11.671004473363155, + "grad_norm": 0.00896014234169454, + "learning_rate": 5.431789574503576e-07, + "loss": 0.0001, + "step": 28699 + }, + { + "epoch": 11.671411142740952, + "grad_norm": 0.00024504161858462573, + "learning_rate": 5.428496257560601e-07, + "loss": 0.0, + "step": 28700 + }, + { + "epoch": 11.671817812118748, + "grad_norm": 0.0012569548571633113, + "learning_rate": 5.425203911441712e-07, + "loss": 0.0, + "step": 28701 + }, + { + "epoch": 11.672224481496544, + "grad_norm": 0.003674501733310929, + "learning_rate": 5.421912536180706e-07, + "loss": 0.0, + "step": 28702 + }, + { + "epoch": 11.67263115087434, + "grad_norm": 0.000539304301574838, + "learning_rate": 5.418622131811358e-07, + "loss": 0.0, + "step": 28703 + }, + { + "epoch": 11.673037820252135, + "grad_norm": 0.1483639480820886, + "learning_rate": 5.41533269836746e-07, + "loss": 0.0012, + "step": 28704 + }, + { + "epoch": 11.673444489629931, + "grad_norm": 0.0017611415609203032, + "learning_rate": 5.412044235882774e-07, + "loss": 0.0, + "step": 28705 + }, + { + "epoch": 11.673851159007727, + "grad_norm": 0.004925383485551421, + "learning_rate": 5.408756744391075e-07, + "loss": 0.0, + "step": 28706 + }, + { + "epoch": 11.674257828385523, + "grad_norm": 0.007614981420342145, + "learning_rate": 5.405470223926079e-07, + "loss": 0.0001, + "step": 28707 + }, + { + "epoch": 11.674664497763318, + "grad_norm": 0.008768321288206607, + "learning_rate": 5.402184674521549e-07, + "loss": 0.0001, + "step": 28708 + }, + { + "epoch": 11.675071167141114, + "grad_norm": 4.8899467086233536e-05, + "learning_rate": 5.398900096211201e-07, + "loss": 0.0, + "step": 28709 + }, + { + "epoch": 11.67547783651891, + "grad_norm": 0.0004987444781552202, + "learning_rate": 5.395616489028732e-07, + "loss": 0.0, + "step": 28710 + }, + { + "epoch": 11.675884505896706, + "grad_norm": 0.00990652364852676, + "learning_rate": 5.392333853007902e-07, + "loss": 0.0001, + "step": 28711 + }, + { + "epoch": 11.676291175274502, + "grad_norm": 0.03575787783388547, + "learning_rate": 5.389052188182364e-07, + "loss": 0.0004, + "step": 28712 + }, + { + "epoch": 11.676697844652297, + "grad_norm": 6.192192623708473e-05, + 
"learning_rate": 5.385771494585835e-07, + "loss": 0.0, + "step": 28713 + }, + { + "epoch": 11.677104514030093, + "grad_norm": 0.004504138072889191, + "learning_rate": 5.382491772251974e-07, + "loss": 0.0, + "step": 28714 + }, + { + "epoch": 11.677511183407889, + "grad_norm": 0.006200478791978556, + "learning_rate": 5.379213021214457e-07, + "loss": 0.0, + "step": 28715 + }, + { + "epoch": 11.677917852785685, + "grad_norm": 0.026465292636199414, + "learning_rate": 5.375935241506924e-07, + "loss": 0.0003, + "step": 28716 + }, + { + "epoch": 11.67832452216348, + "grad_norm": 0.0018231146275915935, + "learning_rate": 5.372658433163058e-07, + "loss": 0.0, + "step": 28717 + }, + { + "epoch": 11.678731191541276, + "grad_norm": 0.00023375202732856844, + "learning_rate": 5.369382596216477e-07, + "loss": 0.0, + "step": 28718 + }, + { + "epoch": 11.679137860919074, + "grad_norm": 0.0008901726557607836, + "learning_rate": 5.36610773070081e-07, + "loss": 0.0, + "step": 28719 + }, + { + "epoch": 11.67954453029687, + "grad_norm": 0.002390825438275464, + "learning_rate": 5.362833836649673e-07, + "loss": 0.0, + "step": 28720 + }, + { + "epoch": 11.679951199674665, + "grad_norm": 0.0032521700174838967, + "learning_rate": 5.359560914096673e-07, + "loss": 0.0, + "step": 28721 + }, + { + "epoch": 11.680357869052461, + "grad_norm": 0.02603346231794516, + "learning_rate": 5.356288963075418e-07, + "loss": 0.0002, + "step": 28722 + }, + { + "epoch": 11.680764538430257, + "grad_norm": 0.12088339490928471, + "learning_rate": 5.353017983619479e-07, + "loss": 0.001, + "step": 28723 + }, + { + "epoch": 11.681171207808053, + "grad_norm": 0.03346816507627039, + "learning_rate": 5.349747975762454e-07, + "loss": 0.0003, + "step": 28724 + }, + { + "epoch": 11.681577877185848, + "grad_norm": 0.10576310355486926, + "learning_rate": 5.346478939537902e-07, + "loss": 0.001, + "step": 28725 + }, + { + "epoch": 11.681984546563644, + "grad_norm": 0.06872933691132666, + "learning_rate": 5.343210874979377e-07, + "loss": 0.0005, + "step": 28726 + }, + { + "epoch": 11.68239121594144, + "grad_norm": 0.00010493856369980063, + "learning_rate": 5.339943782120449e-07, + "loss": 0.0, + "step": 28727 + }, + { + "epoch": 11.682797885319236, + "grad_norm": 0.00031845861469473156, + "learning_rate": 5.336677660994616e-07, + "loss": 0.0, + "step": 28728 + }, + { + "epoch": 11.683204554697031, + "grad_norm": 0.0021076415706509518, + "learning_rate": 5.33341251163545e-07, + "loss": 0.0, + "step": 28729 + }, + { + "epoch": 11.683611224074827, + "grad_norm": 0.01022868036996517, + "learning_rate": 5.330148334076434e-07, + "loss": 0.0001, + "step": 28730 + }, + { + "epoch": 11.684017893452623, + "grad_norm": 0.01852198843265899, + "learning_rate": 5.3268851283511e-07, + "loss": 0.0001, + "step": 28731 + }, + { + "epoch": 11.684424562830419, + "grad_norm": 0.006677021936226553, + "learning_rate": 5.323622894492941e-07, + "loss": 0.0001, + "step": 28732 + }, + { + "epoch": 11.684831232208214, + "grad_norm": 0.0153345884467, + "learning_rate": 5.320361632535432e-07, + "loss": 0.0001, + "step": 28733 + }, + { + "epoch": 11.68523790158601, + "grad_norm": 0.001081600448318949, + "learning_rate": 5.317101342512055e-07, + "loss": 0.0, + "step": 28734 + }, + { + "epoch": 11.685644570963806, + "grad_norm": 0.20831681123883797, + "learning_rate": 5.313842024456306e-07, + "loss": 0.0013, + "step": 28735 + }, + { + "epoch": 11.686051240341602, + "grad_norm": 0.001185747681487201, + "learning_rate": 5.310583678401615e-07, + "loss": 0.0, + "step": 28736 + }, + { + 
"epoch": 11.686457909719397, + "grad_norm": 0.002904681112481277, + "learning_rate": 5.307326304381444e-07, + "loss": 0.0, + "step": 28737 + }, + { + "epoch": 11.686864579097193, + "grad_norm": 0.03770344396352848, + "learning_rate": 5.30406990242922e-07, + "loss": 0.0004, + "step": 28738 + }, + { + "epoch": 11.687271248474989, + "grad_norm": 0.0003625639502318816, + "learning_rate": 5.300814472578375e-07, + "loss": 0.0, + "step": 28739 + }, + { + "epoch": 11.687677917852785, + "grad_norm": 0.0018491097494849338, + "learning_rate": 5.297560014862335e-07, + "loss": 0.0, + "step": 28740 + }, + { + "epoch": 11.688084587230582, + "grad_norm": 0.17314797645517052, + "learning_rate": 5.294306529314497e-07, + "loss": 0.0016, + "step": 28741 + }, + { + "epoch": 11.688491256608378, + "grad_norm": 0.007200443485115141, + "learning_rate": 5.291054015968277e-07, + "loss": 0.0001, + "step": 28742 + }, + { + "epoch": 11.688897925986174, + "grad_norm": 0.09562238655821881, + "learning_rate": 5.287802474857052e-07, + "loss": 0.0008, + "step": 28743 + }, + { + "epoch": 11.68930459536397, + "grad_norm": 0.05850232348714669, + "learning_rate": 5.284551906014202e-07, + "loss": 0.0003, + "step": 28744 + }, + { + "epoch": 11.689711264741765, + "grad_norm": 0.0006956376827361745, + "learning_rate": 5.281302309473091e-07, + "loss": 0.0, + "step": 28745 + }, + { + "epoch": 11.690117934119561, + "grad_norm": 0.11483287037875703, + "learning_rate": 5.278053685267093e-07, + "loss": 0.0004, + "step": 28746 + }, + { + "epoch": 11.690524603497357, + "grad_norm": 0.0008064244696047608, + "learning_rate": 5.274806033429525e-07, + "loss": 0.0, + "step": 28747 + }, + { + "epoch": 11.690931272875153, + "grad_norm": 0.00032834916402181774, + "learning_rate": 5.271559353993761e-07, + "loss": 0.0, + "step": 28748 + }, + { + "epoch": 11.691337942252948, + "grad_norm": 0.00016729645041099135, + "learning_rate": 5.268313646993117e-07, + "loss": 0.0, + "step": 28749 + }, + { + "epoch": 11.691744611630744, + "grad_norm": 0.004051501152107541, + "learning_rate": 5.265068912460913e-07, + "loss": 0.0, + "step": 28750 + }, + { + "epoch": 11.69215128100854, + "grad_norm": 0.0042444663939773634, + "learning_rate": 5.261825150430455e-07, + "loss": 0.0, + "step": 28751 + }, + { + "epoch": 11.692557950386336, + "grad_norm": 0.017506316408866275, + "learning_rate": 5.25858236093505e-07, + "loss": 0.0001, + "step": 28752 + }, + { + "epoch": 11.692964619764131, + "grad_norm": 0.003914593442538284, + "learning_rate": 5.25534054400797e-07, + "loss": 0.0, + "step": 28753 + }, + { + "epoch": 11.693371289141927, + "grad_norm": 0.008642323273723745, + "learning_rate": 5.252099699682489e-07, + "loss": 0.0001, + "step": 28754 + }, + { + "epoch": 11.693777958519723, + "grad_norm": 0.012596721795556353, + "learning_rate": 5.248859827991903e-07, + "loss": 0.0001, + "step": 28755 + }, + { + "epoch": 11.694184627897519, + "grad_norm": 0.019637078560585053, + "learning_rate": 5.245620928969463e-07, + "loss": 0.0001, + "step": 28756 + }, + { + "epoch": 11.694591297275315, + "grad_norm": 0.0033228153985377616, + "learning_rate": 5.242383002648411e-07, + "loss": 0.0, + "step": 28757 + }, + { + "epoch": 11.69499796665311, + "grad_norm": 0.013890310118230324, + "learning_rate": 5.239146049061994e-07, + "loss": 0.0001, + "step": 28758 + }, + { + "epoch": 11.695404636030906, + "grad_norm": 0.027214107647966515, + "learning_rate": 5.235910068243421e-07, + "loss": 0.0002, + "step": 28759 + }, + { + "epoch": 11.695811305408704, + "grad_norm": 0.009098177438285532, + 
"learning_rate": 5.232675060225923e-07, + "loss": 0.0001, + "step": 28760 + }, + { + "epoch": 11.6962179747865, + "grad_norm": 0.010603327369821428, + "learning_rate": 5.229441025042725e-07, + "loss": 0.0001, + "step": 28761 + }, + { + "epoch": 11.696624644164295, + "grad_norm": 0.0026600984135479433, + "learning_rate": 5.226207962727004e-07, + "loss": 0.0, + "step": 28762 + }, + { + "epoch": 11.697031313542091, + "grad_norm": 0.0035425887786432147, + "learning_rate": 5.222975873311964e-07, + "loss": 0.0, + "step": 28763 + }, + { + "epoch": 11.697437982919887, + "grad_norm": 0.004841946768110585, + "learning_rate": 5.219744756830769e-07, + "loss": 0.0, + "step": 28764 + }, + { + "epoch": 11.697844652297682, + "grad_norm": 0.050068076705626456, + "learning_rate": 5.216514613316603e-07, + "loss": 0.0004, + "step": 28765 + }, + { + "epoch": 11.698251321675478, + "grad_norm": 0.0015230394057684773, + "learning_rate": 5.213285442802618e-07, + "loss": 0.0, + "step": 28766 + }, + { + "epoch": 11.698657991053274, + "grad_norm": 0.003444926069419291, + "learning_rate": 5.210057245321953e-07, + "loss": 0.0, + "step": 28767 + }, + { + "epoch": 11.69906466043107, + "grad_norm": 7.22526484715382e-06, + "learning_rate": 5.20683002090776e-07, + "loss": 0.0, + "step": 28768 + }, + { + "epoch": 11.699471329808866, + "grad_norm": 0.3338512054425538, + "learning_rate": 5.203603769593169e-07, + "loss": 0.0031, + "step": 28769 + }, + { + "epoch": 11.699877999186661, + "grad_norm": 0.010994569300653, + "learning_rate": 5.200378491411295e-07, + "loss": 0.0001, + "step": 28770 + }, + { + "epoch": 11.700284668564457, + "grad_norm": 0.025861902459611136, + "learning_rate": 5.197154186395259e-07, + "loss": 0.0001, + "step": 28771 + }, + { + "epoch": 11.700691337942253, + "grad_norm": 0.004506090794639073, + "learning_rate": 5.193930854578133e-07, + "loss": 0.0, + "step": 28772 + }, + { + "epoch": 11.701098007320049, + "grad_norm": 0.009359170042526288, + "learning_rate": 5.190708495993013e-07, + "loss": 0.0, + "step": 28773 + }, + { + "epoch": 11.701504676697844, + "grad_norm": 4.6246149484308785e-05, + "learning_rate": 5.187487110673007e-07, + "loss": 0.0, + "step": 28774 + }, + { + "epoch": 11.70191134607564, + "grad_norm": 1.4166430630489624, + "learning_rate": 5.184266698651153e-07, + "loss": 0.0143, + "step": 28775 + }, + { + "epoch": 11.702318015453436, + "grad_norm": 0.032017163108576475, + "learning_rate": 5.181047259960526e-07, + "loss": 0.0002, + "step": 28776 + }, + { + "epoch": 11.702724684831232, + "grad_norm": 0.013649585848786049, + "learning_rate": 5.177828794634176e-07, + "loss": 0.0001, + "step": 28777 + }, + { + "epoch": 11.703131354209027, + "grad_norm": 0.15292875384757731, + "learning_rate": 5.174611302705135e-07, + "loss": 0.0017, + "step": 28778 + }, + { + "epoch": 11.703538023586823, + "grad_norm": 0.0006268311140753936, + "learning_rate": 5.17139478420643e-07, + "loss": 0.0, + "step": 28779 + }, + { + "epoch": 11.703944692964619, + "grad_norm": 0.03873234451031615, + "learning_rate": 5.168179239171067e-07, + "loss": 0.0003, + "step": 28780 + }, + { + "epoch": 11.704351362342415, + "grad_norm": 0.03773148349603199, + "learning_rate": 5.164964667632088e-07, + "loss": 0.0004, + "step": 28781 + }, + { + "epoch": 11.704758031720212, + "grad_norm": 0.0030086082328450573, + "learning_rate": 5.161751069622478e-07, + "loss": 0.0, + "step": 28782 + }, + { + "epoch": 11.705164701098008, + "grad_norm": 0.013778512434375565, + "learning_rate": 5.158538445175221e-07, + "loss": 0.0001, + "step": 28783 + 
}, + { + "epoch": 11.705571370475804, + "grad_norm": 0.0032615592090436893, + "learning_rate": 5.155326794323301e-07, + "loss": 0.0, + "step": 28784 + }, + { + "epoch": 11.7059780398536, + "grad_norm": 0.019087154481493923, + "learning_rate": 5.152116117099681e-07, + "loss": 0.0001, + "step": 28785 + }, + { + "epoch": 11.706384709231395, + "grad_norm": 0.0002022115565219239, + "learning_rate": 5.148906413537335e-07, + "loss": 0.0, + "step": 28786 + }, + { + "epoch": 11.706791378609191, + "grad_norm": 0.41060492432144313, + "learning_rate": 5.145697683669182e-07, + "loss": 0.0039, + "step": 28787 + }, + { + "epoch": 11.707198047986987, + "grad_norm": 0.0003226787315228204, + "learning_rate": 5.142489927528205e-07, + "loss": 0.0, + "step": 28788 + }, + { + "epoch": 11.707604717364783, + "grad_norm": 0.0034866120795026636, + "learning_rate": 5.1392831451473e-07, + "loss": 0.0, + "step": 28789 + }, + { + "epoch": 11.708011386742578, + "grad_norm": 0.0020169803400707114, + "learning_rate": 5.136077336559397e-07, + "loss": 0.0, + "step": 28790 + }, + { + "epoch": 11.708418056120374, + "grad_norm": 0.01686528165656437, + "learning_rate": 5.132872501797403e-07, + "loss": 0.0001, + "step": 28791 + }, + { + "epoch": 11.70882472549817, + "grad_norm": 0.018197521388897762, + "learning_rate": 5.129668640894215e-07, + "loss": 0.0001, + "step": 28792 + }, + { + "epoch": 11.709231394875966, + "grad_norm": 0.002081887225260546, + "learning_rate": 5.126465753882725e-07, + "loss": 0.0, + "step": 28793 + }, + { + "epoch": 11.709638064253761, + "grad_norm": 0.00025012557903396294, + "learning_rate": 5.123263840795811e-07, + "loss": 0.0, + "step": 28794 + }, + { + "epoch": 11.710044733631557, + "grad_norm": 0.001508385232509078, + "learning_rate": 5.120062901666356e-07, + "loss": 0.0, + "step": 28795 + }, + { + "epoch": 11.710451403009353, + "grad_norm": 0.0030380198917387994, + "learning_rate": 5.1168629365272e-07, + "loss": 0.0, + "step": 28796 + }, + { + "epoch": 11.710858072387149, + "grad_norm": 0.0017002532154967451, + "learning_rate": 5.113663945411207e-07, + "loss": 0.0, + "step": 28797 + }, + { + "epoch": 11.711264741764944, + "grad_norm": 0.0074077611556887975, + "learning_rate": 5.110465928351205e-07, + "loss": 0.0001, + "step": 28798 + }, + { + "epoch": 11.71167141114274, + "grad_norm": 0.004454201115442922, + "learning_rate": 5.107268885380024e-07, + "loss": 0.0001, + "step": 28799 + }, + { + "epoch": 11.712078080520536, + "grad_norm": 0.0924381015415058, + "learning_rate": 5.104072816530481e-07, + "loss": 0.0008, + "step": 28800 + }, + { + "epoch": 11.712484749898334, + "grad_norm": 0.02528896184765881, + "learning_rate": 5.100877721835395e-07, + "loss": 0.0002, + "step": 28801 + }, + { + "epoch": 11.71289141927613, + "grad_norm": 0.004655267850031769, + "learning_rate": 5.097683601327563e-07, + "loss": 0.0, + "step": 28802 + }, + { + "epoch": 11.713298088653925, + "grad_norm": 0.0009750836321019579, + "learning_rate": 5.09449045503978e-07, + "loss": 0.0, + "step": 28803 + }, + { + "epoch": 11.71370475803172, + "grad_norm": 0.02491062262131653, + "learning_rate": 5.091298283004808e-07, + "loss": 0.0001, + "step": 28804 + }, + { + "epoch": 11.714111427409517, + "grad_norm": 0.00013747188399814792, + "learning_rate": 5.088107085255434e-07, + "loss": 0.0, + "step": 28805 + }, + { + "epoch": 11.714518096787312, + "grad_norm": 0.00838425994482696, + "learning_rate": 5.084916861824385e-07, + "loss": 0.0001, + "step": 28806 + }, + { + "epoch": 11.714924766165108, + "grad_norm": 0.00014367294764468144, + 
"learning_rate": 5.081727612744458e-07, + "loss": 0.0, + "step": 28807 + }, + { + "epoch": 11.715331435542904, + "grad_norm": 0.0019074169254742863, + "learning_rate": 5.07853933804836e-07, + "loss": 0.0, + "step": 28808 + }, + { + "epoch": 11.7157381049207, + "grad_norm": 0.31664240872372434, + "learning_rate": 5.075352037768833e-07, + "loss": 0.0027, + "step": 28809 + }, + { + "epoch": 11.716144774298495, + "grad_norm": 0.0019349715039057222, + "learning_rate": 5.072165711938592e-07, + "loss": 0.0, + "step": 28810 + }, + { + "epoch": 11.716551443676291, + "grad_norm": 0.002765709805925621, + "learning_rate": 5.068980360590347e-07, + "loss": 0.0, + "step": 28811 + }, + { + "epoch": 11.716958113054087, + "grad_norm": 0.15026439501496253, + "learning_rate": 5.065795983756794e-07, + "loss": 0.0016, + "step": 28812 + }, + { + "epoch": 11.717364782431883, + "grad_norm": 0.04145434284462686, + "learning_rate": 5.062612581470616e-07, + "loss": 0.0002, + "step": 28813 + }, + { + "epoch": 11.717771451809678, + "grad_norm": 0.029716793326648536, + "learning_rate": 5.059430153764511e-07, + "loss": 0.0003, + "step": 28814 + }, + { + "epoch": 11.718178121187474, + "grad_norm": 2.132434425941175, + "learning_rate": 5.056248700671129e-07, + "loss": 0.0179, + "step": 28815 + }, + { + "epoch": 11.71858479056527, + "grad_norm": 0.26064390268264304, + "learning_rate": 5.053068222223157e-07, + "loss": 0.0022, + "step": 28816 + }, + { + "epoch": 11.718991459943066, + "grad_norm": 0.0005445350391906882, + "learning_rate": 5.049888718453211e-07, + "loss": 0.0, + "step": 28817 + }, + { + "epoch": 11.719398129320862, + "grad_norm": 0.0026251484353559482, + "learning_rate": 5.046710189393944e-07, + "loss": 0.0, + "step": 28818 + }, + { + "epoch": 11.719804798698657, + "grad_norm": 0.012097546227722985, + "learning_rate": 5.043532635077985e-07, + "loss": 0.0001, + "step": 28819 + }, + { + "epoch": 11.720211468076453, + "grad_norm": 0.0037271130713458596, + "learning_rate": 5.040356055537954e-07, + "loss": 0.0, + "step": 28820 + }, + { + "epoch": 11.720618137454249, + "grad_norm": 0.0008466815203523695, + "learning_rate": 5.037180450806467e-07, + "loss": 0.0, + "step": 28821 + }, + { + "epoch": 11.721024806832045, + "grad_norm": 0.0018849945238559357, + "learning_rate": 5.034005820916121e-07, + "loss": 0.0, + "step": 28822 + }, + { + "epoch": 11.721431476209842, + "grad_norm": 0.04316575901042842, + "learning_rate": 5.030832165899491e-07, + "loss": 0.0004, + "step": 28823 + }, + { + "epoch": 11.721838145587638, + "grad_norm": 0.0016748801943796332, + "learning_rate": 5.027659485789172e-07, + "loss": 0.0, + "step": 28824 + }, + { + "epoch": 11.722244814965434, + "grad_norm": 0.0005385554297645269, + "learning_rate": 5.024487780617726e-07, + "loss": 0.0, + "step": 28825 + }, + { + "epoch": 11.72265148434323, + "grad_norm": 0.0016959007085089188, + "learning_rate": 5.021317050417696e-07, + "loss": 0.0, + "step": 28826 + }, + { + "epoch": 11.723058153721025, + "grad_norm": 0.009550664607304495, + "learning_rate": 5.018147295221676e-07, + "loss": 0.0001, + "step": 28827 + }, + { + "epoch": 11.723464823098821, + "grad_norm": 0.9001688342469595, + "learning_rate": 5.014978515062163e-07, + "loss": 0.0095, + "step": 28828 + }, + { + "epoch": 11.723871492476617, + "grad_norm": 0.002452432905929398, + "learning_rate": 5.011810709971709e-07, + "loss": 0.0, + "step": 28829 + }, + { + "epoch": 11.724278161854413, + "grad_norm": 0.003482182951195205, + "learning_rate": 5.008643879982821e-07, + "loss": 0.0, + "step": 28830 + }, + { 
+ "epoch": 11.724684831232208, + "grad_norm": 0.015879805737968867, + "learning_rate": 5.005478025127997e-07, + "loss": 0.0001, + "step": 28831 + }, + { + "epoch": 11.725091500610004, + "grad_norm": 0.11294016431272948, + "learning_rate": 5.002313145439763e-07, + "loss": 0.0009, + "step": 28832 + }, + { + "epoch": 11.7254981699878, + "grad_norm": 0.0021165983441253948, + "learning_rate": 4.999149240950595e-07, + "loss": 0.0, + "step": 28833 + }, + { + "epoch": 11.725904839365596, + "grad_norm": 0.1437782985137532, + "learning_rate": 4.995986311692969e-07, + "loss": 0.0012, + "step": 28834 + }, + { + "epoch": 11.726311508743391, + "grad_norm": 0.04552981618094604, + "learning_rate": 4.992824357699366e-07, + "loss": 0.0004, + "step": 28835 + }, + { + "epoch": 11.726718178121187, + "grad_norm": 0.004725511463489265, + "learning_rate": 4.98966337900224e-07, + "loss": 0.0001, + "step": 28836 + }, + { + "epoch": 11.727124847498983, + "grad_norm": 8.76911491229012e-06, + "learning_rate": 4.98650337563401e-07, + "loss": 0.0, + "step": 28837 + }, + { + "epoch": 11.727531516876779, + "grad_norm": 0.025587008129164392, + "learning_rate": 4.983344347627173e-07, + "loss": 0.0001, + "step": 28838 + }, + { + "epoch": 11.727938186254574, + "grad_norm": 0.011872532772419559, + "learning_rate": 4.980186295014111e-07, + "loss": 0.0001, + "step": 28839 + }, + { + "epoch": 11.72834485563237, + "grad_norm": 0.00338621704953346, + "learning_rate": 4.977029217827268e-07, + "loss": 0.0, + "step": 28840 + }, + { + "epoch": 11.728751525010166, + "grad_norm": 0.0067268848370178385, + "learning_rate": 4.973873116099038e-07, + "loss": 0.0, + "step": 28841 + }, + { + "epoch": 11.729158194387963, + "grad_norm": 0.0007551039121370952, + "learning_rate": 4.97071798986184e-07, + "loss": 0.0, + "step": 28842 + }, + { + "epoch": 11.72956486376576, + "grad_norm": 9.912086395323304e-05, + "learning_rate": 4.967563839148038e-07, + "loss": 0.0, + "step": 28843 + }, + { + "epoch": 11.729971533143555, + "grad_norm": 0.0046799172680461205, + "learning_rate": 4.964410663990005e-07, + "loss": 0.0, + "step": 28844 + }, + { + "epoch": 11.73037820252135, + "grad_norm": 0.12528997808675202, + "learning_rate": 4.961258464420149e-07, + "loss": 0.001, + "step": 28845 + }, + { + "epoch": 11.730784871899147, + "grad_norm": 0.0001911093679392442, + "learning_rate": 4.9581072404708e-07, + "loss": 0.0, + "step": 28846 + }, + { + "epoch": 11.731191541276942, + "grad_norm": 0.009171195629660622, + "learning_rate": 4.95495699217432e-07, + "loss": 0.0001, + "step": 28847 + }, + { + "epoch": 11.731598210654738, + "grad_norm": 0.023685858824946254, + "learning_rate": 4.95180771956304e-07, + "loss": 0.0002, + "step": 28848 + }, + { + "epoch": 11.732004880032534, + "grad_norm": 0.010086949995150868, + "learning_rate": 4.948659422669288e-07, + "loss": 0.0001, + "step": 28849 + }, + { + "epoch": 11.73241154941033, + "grad_norm": 0.006933250031657889, + "learning_rate": 4.945512101525373e-07, + "loss": 0.0001, + "step": 28850 + }, + { + "epoch": 11.732818218788125, + "grad_norm": 0.023859984447612174, + "learning_rate": 4.942365756163626e-07, + "loss": 0.0002, + "step": 28851 + }, + { + "epoch": 11.733224888165921, + "grad_norm": 0.012362774170633013, + "learning_rate": 4.939220386616339e-07, + "loss": 0.0001, + "step": 28852 + }, + { + "epoch": 11.733631557543717, + "grad_norm": 0.07738749652559034, + "learning_rate": 4.9360759929158e-07, + "loss": 0.0006, + "step": 28853 + }, + { + "epoch": 11.734038226921513, + "grad_norm": 0.546429000350754, + 
"learning_rate": 4.932932575094285e-07, + "loss": 0.004, + "step": 28854 + }, + { + "epoch": 11.734444896299308, + "grad_norm": 0.16366261756612113, + "learning_rate": 4.929790133184064e-07, + "loss": 0.0011, + "step": 28855 + }, + { + "epoch": 11.734851565677104, + "grad_norm": 0.037113638058841984, + "learning_rate": 4.926648667217393e-07, + "loss": 0.0001, + "step": 28856 + }, + { + "epoch": 11.7352582350549, + "grad_norm": 0.07055046410692552, + "learning_rate": 4.923508177226511e-07, + "loss": 0.0006, + "step": 28857 + }, + { + "epoch": 11.735664904432696, + "grad_norm": 0.008485707100792942, + "learning_rate": 4.92036866324368e-07, + "loss": 0.0, + "step": 28858 + }, + { + "epoch": 11.736071573810491, + "grad_norm": 0.006256735267975183, + "learning_rate": 4.91723012530112e-07, + "loss": 0.0001, + "step": 28859 + }, + { + "epoch": 11.736478243188287, + "grad_norm": 0.00620064451793486, + "learning_rate": 4.914092563431039e-07, + "loss": 0.0, + "step": 28860 + }, + { + "epoch": 11.736884912566083, + "grad_norm": 0.005482350434929355, + "learning_rate": 4.910955977665666e-07, + "loss": 0.0, + "step": 28861 + }, + { + "epoch": 11.737291581943879, + "grad_norm": 0.0009761743626802524, + "learning_rate": 4.907820368037175e-07, + "loss": 0.0, + "step": 28862 + }, + { + "epoch": 11.737698251321675, + "grad_norm": 3.4299683193967875e-05, + "learning_rate": 4.904685734577763e-07, + "loss": 0.0, + "step": 28863 + }, + { + "epoch": 11.738104920699472, + "grad_norm": 0.02666157866161903, + "learning_rate": 4.901552077319627e-07, + "loss": 0.0001, + "step": 28864 + }, + { + "epoch": 11.738511590077268, + "grad_norm": 0.0003005000957325118, + "learning_rate": 4.898419396294907e-07, + "loss": 0.0, + "step": 28865 + }, + { + "epoch": 11.738918259455064, + "grad_norm": 0.0017720736457060716, + "learning_rate": 4.895287691535788e-07, + "loss": 0.0, + "step": 28866 + }, + { + "epoch": 11.73932492883286, + "grad_norm": 0.029312910383508237, + "learning_rate": 4.892156963074401e-07, + "loss": 0.0003, + "step": 28867 + }, + { + "epoch": 11.739731598210655, + "grad_norm": 0.0049377563902789365, + "learning_rate": 4.889027210942898e-07, + "loss": 0.0, + "step": 28868 + }, + { + "epoch": 11.740138267588451, + "grad_norm": 0.03435696719115627, + "learning_rate": 4.885898435173386e-07, + "loss": 0.0003, + "step": 28869 + }, + { + "epoch": 11.740544936966247, + "grad_norm": 0.0020239000286756824, + "learning_rate": 4.882770635797995e-07, + "loss": 0.0, + "step": 28870 + }, + { + "epoch": 11.740951606344042, + "grad_norm": 0.0034926747145559117, + "learning_rate": 4.879643812848844e-07, + "loss": 0.0, + "step": 28871 + }, + { + "epoch": 11.741358275721838, + "grad_norm": 0.009523865592942998, + "learning_rate": 4.876517966358018e-07, + "loss": 0.0001, + "step": 28872 + }, + { + "epoch": 11.741764945099634, + "grad_norm": 0.004086215339432813, + "learning_rate": 4.873393096357615e-07, + "loss": 0.0, + "step": 28873 + }, + { + "epoch": 11.74217161447743, + "grad_norm": 0.04271961862342488, + "learning_rate": 4.870269202879708e-07, + "loss": 0.0005, + "step": 28874 + }, + { + "epoch": 11.742578283855226, + "grad_norm": 0.05295916243445433, + "learning_rate": 4.867146285956358e-07, + "loss": 0.0004, + "step": 28875 + }, + { + "epoch": 11.742984953233021, + "grad_norm": 0.04426405595910785, + "learning_rate": 4.864024345619623e-07, + "loss": 0.0004, + "step": 28876 + }, + { + "epoch": 11.743391622610817, + "grad_norm": 0.0011463921880829509, + "learning_rate": 4.860903381901572e-07, + "loss": 0.0, + "step": 28877 + 
}, + { + "epoch": 11.743798291988613, + "grad_norm": 0.008846870336961021, + "learning_rate": 4.857783394834225e-07, + "loss": 0.0001, + "step": 28878 + }, + { + "epoch": 11.744204961366409, + "grad_norm": 0.003696280757310116, + "learning_rate": 4.854664384449626e-07, + "loss": 0.0, + "step": 28879 + }, + { + "epoch": 11.744611630744204, + "grad_norm": 0.0003885619279031633, + "learning_rate": 4.851546350779779e-07, + "loss": 0.0, + "step": 28880 + }, + { + "epoch": 11.745018300122, + "grad_norm": 0.09304116743643932, + "learning_rate": 4.848429293856693e-07, + "loss": 0.0008, + "step": 28881 + }, + { + "epoch": 11.745424969499796, + "grad_norm": 0.002288446020399839, + "learning_rate": 4.845313213712366e-07, + "loss": 0.0, + "step": 28882 + }, + { + "epoch": 11.745831638877593, + "grad_norm": 0.006016310516285847, + "learning_rate": 4.842198110378782e-07, + "loss": 0.0, + "step": 28883 + }, + { + "epoch": 11.74623830825539, + "grad_norm": 0.023684442484835328, + "learning_rate": 4.839083983887937e-07, + "loss": 0.0002, + "step": 28884 + }, + { + "epoch": 11.746644977633185, + "grad_norm": 5.117746470196522e-06, + "learning_rate": 4.835970834271797e-07, + "loss": 0.0, + "step": 28885 + }, + { + "epoch": 11.74705164701098, + "grad_norm": 0.0024834174434626944, + "learning_rate": 4.832858661562301e-07, + "loss": 0.0, + "step": 28886 + }, + { + "epoch": 11.747458316388776, + "grad_norm": 1.8180394964675622e-05, + "learning_rate": 4.829747465791413e-07, + "loss": 0.0, + "step": 28887 + }, + { + "epoch": 11.747864985766572, + "grad_norm": 0.0019392511762646506, + "learning_rate": 4.826637246991061e-07, + "loss": 0.0, + "step": 28888 + }, + { + "epoch": 11.748271655144368, + "grad_norm": 0.00037381374898748967, + "learning_rate": 4.823528005193168e-07, + "loss": 0.0, + "step": 28889 + }, + { + "epoch": 11.748678324522164, + "grad_norm": 0.003517887793775715, + "learning_rate": 4.820419740429671e-07, + "loss": 0.0, + "step": 28890 + }, + { + "epoch": 11.74908499389996, + "grad_norm": 0.060667505637493964, + "learning_rate": 4.81731245273247e-07, + "loss": 0.0003, + "step": 28891 + }, + { + "epoch": 11.749491663277755, + "grad_norm": 0.004692339870283307, + "learning_rate": 4.81420614213346e-07, + "loss": 0.0, + "step": 28892 + }, + { + "epoch": 11.749898332655551, + "grad_norm": 0.002253750883528252, + "learning_rate": 4.811100808664537e-07, + "loss": 0.0, + "step": 28893 + }, + { + "epoch": 11.750305002033347, + "grad_norm": 0.003190045005267126, + "learning_rate": 4.807996452357566e-07, + "loss": 0.0, + "step": 28894 + }, + { + "epoch": 11.750711671411143, + "grad_norm": 0.15976599621085685, + "learning_rate": 4.804893073244421e-07, + "loss": 0.0016, + "step": 28895 + }, + { + "epoch": 11.751118340788938, + "grad_norm": 0.00991191033474379, + "learning_rate": 4.801790671356943e-07, + "loss": 0.0001, + "step": 28896 + }, + { + "epoch": 11.751525010166734, + "grad_norm": 0.00023417299745958308, + "learning_rate": 4.798689246727006e-07, + "loss": 0.0, + "step": 28897 + }, + { + "epoch": 11.75193167954453, + "grad_norm": 0.0006711616588098591, + "learning_rate": 4.795588799386442e-07, + "loss": 0.0, + "step": 28898 + }, + { + "epoch": 11.752338348922326, + "grad_norm": 7.180989645955598e-05, + "learning_rate": 4.792489329367078e-07, + "loss": 0.0, + "step": 28899 + }, + { + "epoch": 11.752745018300121, + "grad_norm": 0.03721819598455618, + "learning_rate": 4.789390836700725e-07, + "loss": 0.0003, + "step": 28900 + }, + { + "epoch": 11.753151687677917, + "grad_norm": 0.00437665526804691, + 
"learning_rate": 4.7862933214192e-07, + "loss": 0.0, + "step": 28901 + }, + { + "epoch": 11.753558357055713, + "grad_norm": 0.00012184921098394317, + "learning_rate": 4.78319678355429e-07, + "loss": 0.0, + "step": 28902 + }, + { + "epoch": 11.753965026433509, + "grad_norm": 0.02362037136871115, + "learning_rate": 4.780101223137768e-07, + "loss": 0.0002, + "step": 28903 + }, + { + "epoch": 11.754371695811304, + "grad_norm": 0.02967567936469167, + "learning_rate": 4.777006640201442e-07, + "loss": 0.0003, + "step": 28904 + }, + { + "epoch": 11.754778365189102, + "grad_norm": 0.000156766571497559, + "learning_rate": 4.773913034777078e-07, + "loss": 0.0, + "step": 28905 + }, + { + "epoch": 11.755185034566898, + "grad_norm": 0.004734831814108792, + "learning_rate": 4.770820406896414e-07, + "loss": 0.0, + "step": 28906 + }, + { + "epoch": 11.755591703944694, + "grad_norm": 0.11409940796680781, + "learning_rate": 4.7677287565912165e-07, + "loss": 0.0008, + "step": 28907 + }, + { + "epoch": 11.75599837332249, + "grad_norm": 0.010813461308164341, + "learning_rate": 4.764638083893203e-07, + "loss": 0.0001, + "step": 28908 + }, + { + "epoch": 11.756405042700285, + "grad_norm": 0.5791160527430935, + "learning_rate": 4.7615483888341027e-07, + "loss": 0.0066, + "step": 28909 + }, + { + "epoch": 11.75681171207808, + "grad_norm": 0.06928256293537549, + "learning_rate": 4.7584596714456475e-07, + "loss": 0.0004, + "step": 28910 + }, + { + "epoch": 11.757218381455877, + "grad_norm": 6.6268183286647515, + "learning_rate": 4.755371931759545e-07, + "loss": 0.0963, + "step": 28911 + }, + { + "epoch": 11.757625050833672, + "grad_norm": 0.011669209819929945, + "learning_rate": 4.75228516980748e-07, + "loss": 0.0001, + "step": 28912 + }, + { + "epoch": 11.758031720211468, + "grad_norm": 0.019969003831269816, + "learning_rate": 4.7491993856211505e-07, + "loss": 0.0001, + "step": 28913 + }, + { + "epoch": 11.758438389589264, + "grad_norm": 0.009075384788733746, + "learning_rate": 4.7461145792322195e-07, + "loss": 0.0001, + "step": 28914 + }, + { + "epoch": 11.75884505896706, + "grad_norm": 0.00036338216953989887, + "learning_rate": 4.7430307506723725e-07, + "loss": 0.0, + "step": 28915 + }, + { + "epoch": 11.759251728344855, + "grad_norm": 0.027528365732591945, + "learning_rate": 4.7399478999732405e-07, + "loss": 0.0003, + "step": 28916 + }, + { + "epoch": 11.759658397722651, + "grad_norm": 0.050352856685518456, + "learning_rate": 4.736866027166498e-07, + "loss": 0.0004, + "step": 28917 + }, + { + "epoch": 11.760065067100447, + "grad_norm": 0.0012453279552624068, + "learning_rate": 4.7337851322837747e-07, + "loss": 0.0, + "step": 28918 + }, + { + "epoch": 11.760471736478243, + "grad_norm": 0.01952453402702942, + "learning_rate": 4.73070521535669e-07, + "loss": 0.0002, + "step": 28919 + }, + { + "epoch": 11.760878405856038, + "grad_norm": 0.0024634245586754866, + "learning_rate": 4.7276262764168633e-07, + "loss": 0.0, + "step": 28920 + }, + { + "epoch": 11.761285075233834, + "grad_norm": 0.0019603766898615084, + "learning_rate": 4.7245483154959026e-07, + "loss": 0.0, + "step": 28921 + }, + { + "epoch": 11.76169174461163, + "grad_norm": 0.015427981267299884, + "learning_rate": 4.721471332625394e-07, + "loss": 0.0002, + "step": 28922 + }, + { + "epoch": 11.762098413989426, + "grad_norm": 0.0025828974782495726, + "learning_rate": 4.7183953278369444e-07, + "loss": 0.0, + "step": 28923 + }, + { + "epoch": 11.762505083367223, + "grad_norm": 0.016164824385848393, + "learning_rate": 4.7153203011621185e-07, + "loss": 0.0001, + 
"step": 28924 + }, + { + "epoch": 11.762911752745019, + "grad_norm": 0.0005005801066214952, + "learning_rate": 4.7122462526324905e-07, + "loss": 0.0, + "step": 28925 + }, + { + "epoch": 11.763318422122815, + "grad_norm": 0.01668094509640739, + "learning_rate": 4.709173182279614e-07, + "loss": 0.0001, + "step": 28926 + }, + { + "epoch": 11.76372509150061, + "grad_norm": 0.02987227892047008, + "learning_rate": 4.706101090135029e-07, + "loss": 0.0003, + "step": 28927 + }, + { + "epoch": 11.764131760878406, + "grad_norm": 0.03538174336634879, + "learning_rate": 4.7030299762302777e-07, + "loss": 0.0003, + "step": 28928 + }, + { + "epoch": 11.764538430256202, + "grad_norm": 0.010047585751153191, + "learning_rate": 4.69995984059689e-07, + "loss": 0.0001, + "step": 28929 + }, + { + "epoch": 11.764945099633998, + "grad_norm": 2.2052363395358806e-05, + "learning_rate": 4.6968906832663753e-07, + "loss": 0.0, + "step": 28930 + }, + { + "epoch": 11.765351769011794, + "grad_norm": 0.007958134069170671, + "learning_rate": 4.6938225042702403e-07, + "loss": 0.0001, + "step": 28931 + }, + { + "epoch": 11.76575843838959, + "grad_norm": 3.1947696728994774e-05, + "learning_rate": 4.690755303639982e-07, + "loss": 0.0, + "step": 28932 + }, + { + "epoch": 11.766165107767385, + "grad_norm": 0.019931067785495492, + "learning_rate": 4.6876890814070985e-07, + "loss": 0.0002, + "step": 28933 + }, + { + "epoch": 11.766571777145181, + "grad_norm": 0.0084933235920915, + "learning_rate": 4.684623837603031e-07, + "loss": 0.0001, + "step": 28934 + }, + { + "epoch": 11.766978446522977, + "grad_norm": 0.0014228031848970187, + "learning_rate": 4.6815595722592865e-07, + "loss": 0.0, + "step": 28935 + }, + { + "epoch": 11.767385115900773, + "grad_norm": 0.0010456072427777344, + "learning_rate": 4.678496285407319e-07, + "loss": 0.0, + "step": 28936 + }, + { + "epoch": 11.767791785278568, + "grad_norm": 0.00035443865166639803, + "learning_rate": 4.6754339770785474e-07, + "loss": 0.0, + "step": 28937 + }, + { + "epoch": 11.768198454656364, + "grad_norm": 0.003390560930170584, + "learning_rate": 4.672372647304424e-07, + "loss": 0.0, + "step": 28938 + }, + { + "epoch": 11.76860512403416, + "grad_norm": 0.01481388833465778, + "learning_rate": 4.6693122961163796e-07, + "loss": 0.0001, + "step": 28939 + }, + { + "epoch": 11.769011793411956, + "grad_norm": 0.0030239349545473078, + "learning_rate": 4.6662529235457997e-07, + "loss": 0.0, + "step": 28940 + }, + { + "epoch": 11.769418462789751, + "grad_norm": 0.041358214496750714, + "learning_rate": 4.6631945296241374e-07, + "loss": 0.0003, + "step": 28941 + }, + { + "epoch": 11.769825132167547, + "grad_norm": 0.016264334637856405, + "learning_rate": 4.660137114382757e-07, + "loss": 0.0001, + "step": 28942 + }, + { + "epoch": 11.770231801545343, + "grad_norm": 0.03239158093445039, + "learning_rate": 4.657080677853065e-07, + "loss": 0.0003, + "step": 28943 + }, + { + "epoch": 11.770638470923139, + "grad_norm": 0.00023824107461574537, + "learning_rate": 4.654025220066416e-07, + "loss": 0.0, + "step": 28944 + }, + { + "epoch": 11.771045140300934, + "grad_norm": 0.012393984402247794, + "learning_rate": 4.6509707410541836e-07, + "loss": 0.0001, + "step": 28945 + }, + { + "epoch": 11.771451809678732, + "grad_norm": 0.004873849735957061, + "learning_rate": 4.6479172408477214e-07, + "loss": 0.0, + "step": 28946 + }, + { + "epoch": 11.771858479056528, + "grad_norm": 0.0002494344001601507, + "learning_rate": 4.64486471947837e-07, + "loss": 0.0, + "step": 28947 + }, + { + "epoch": 11.772265148434323, + 
"grad_norm": 0.00020889403205690663, + "learning_rate": 4.6418131769774946e-07, + "loss": 0.0, + "step": 28948 + }, + { + "epoch": 11.77267181781212, + "grad_norm": 0.006385213081809444, + "learning_rate": 4.6387626133763907e-07, + "loss": 0.0001, + "step": 28949 + }, + { + "epoch": 11.773078487189915, + "grad_norm": 0.026957400108385893, + "learning_rate": 4.635713028706379e-07, + "loss": 0.0002, + "step": 28950 + }, + { + "epoch": 11.77348515656771, + "grad_norm": 0.004252097166396303, + "learning_rate": 4.6326644229987783e-07, + "loss": 0.0, + "step": 28951 + }, + { + "epoch": 11.773891825945507, + "grad_norm": 0.0030799737579976032, + "learning_rate": 4.629616796284875e-07, + "loss": 0.0, + "step": 28952 + }, + { + "epoch": 11.774298495323302, + "grad_norm": 0.00026817961100925427, + "learning_rate": 4.6265701485959324e-07, + "loss": 0.0, + "step": 28953 + }, + { + "epoch": 11.774705164701098, + "grad_norm": 0.08688600336933053, + "learning_rate": 4.6235244799632594e-07, + "loss": 0.0007, + "step": 28954 + }, + { + "epoch": 11.775111834078894, + "grad_norm": 0.01205134473384788, + "learning_rate": 4.62047979041812e-07, + "loss": 0.0001, + "step": 28955 + }, + { + "epoch": 11.77551850345669, + "grad_norm": 0.005383483496174327, + "learning_rate": 4.617436079991755e-07, + "loss": 0.0, + "step": 28956 + }, + { + "epoch": 11.775925172834485, + "grad_norm": 0.013196168554117413, + "learning_rate": 4.614393348715407e-07, + "loss": 0.0002, + "step": 28957 + }, + { + "epoch": 11.776331842212281, + "grad_norm": 0.011430407611320848, + "learning_rate": 4.611351596620328e-07, + "loss": 0.0001, + "step": 28958 + }, + { + "epoch": 11.776738511590077, + "grad_norm": 0.002001962835416243, + "learning_rate": 4.608310823737727e-07, + "loss": 0.0, + "step": 28959 + }, + { + "epoch": 11.777145180967873, + "grad_norm": 0.0038497251075962843, + "learning_rate": 4.6052710300988126e-07, + "loss": 0.0, + "step": 28960 + }, + { + "epoch": 11.777551850345668, + "grad_norm": 0.0015462895879580073, + "learning_rate": 4.6022322157348256e-07, + "loss": 0.0, + "step": 28961 + }, + { + "epoch": 11.777958519723464, + "grad_norm": 0.03821544497219748, + "learning_rate": 4.599194380676919e-07, + "loss": 0.0003, + "step": 28962 + }, + { + "epoch": 11.77836518910126, + "grad_norm": 0.00015344624696795518, + "learning_rate": 4.596157524956313e-07, + "loss": 0.0, + "step": 28963 + }, + { + "epoch": 11.778771858479056, + "grad_norm": 0.0002763957901559, + "learning_rate": 4.59312164860416e-07, + "loss": 0.0, + "step": 28964 + }, + { + "epoch": 11.779178527856853, + "grad_norm": 0.003892872093902855, + "learning_rate": 4.5900867516516236e-07, + "loss": 0.0, + "step": 28965 + }, + { + "epoch": 11.779585197234649, + "grad_norm": 0.016406287242743667, + "learning_rate": 4.587052834129857e-07, + "loss": 0.0001, + "step": 28966 + }, + { + "epoch": 11.779991866612445, + "grad_norm": 0.006277965070075089, + "learning_rate": 4.5840198960700244e-07, + "loss": 0.0001, + "step": 28967 + }, + { + "epoch": 11.78039853599024, + "grad_norm": 0.002570441830446603, + "learning_rate": 4.5809879375032563e-07, + "loss": 0.0, + "step": 28968 + }, + { + "epoch": 11.780805205368036, + "grad_norm": 0.01071793114141833, + "learning_rate": 4.577956958460672e-07, + "loss": 0.0001, + "step": 28969 + }, + { + "epoch": 11.781211874745832, + "grad_norm": 0.022417255671350583, + "learning_rate": 4.57492695897338e-07, + "loss": 0.0003, + "step": 28970 + }, + { + "epoch": 11.781618544123628, + "grad_norm": 0.030594986090197257, + "learning_rate": 
4.571897939072478e-07, + "loss": 0.0002, + "step": 28971 + }, + { + "epoch": 11.782025213501424, + "grad_norm": 0.002773353125845115, + "learning_rate": 4.5688698987890854e-07, + "loss": 0.0, + "step": 28972 + }, + { + "epoch": 11.78243188287922, + "grad_norm": 0.0048905298088758225, + "learning_rate": 4.5658428381542554e-07, + "loss": 0.0, + "step": 28973 + }, + { + "epoch": 11.782838552257015, + "grad_norm": 0.27039671438614676, + "learning_rate": 4.5628167571990955e-07, + "loss": 0.0018, + "step": 28974 + }, + { + "epoch": 11.783245221634811, + "grad_norm": 0.014469845394606414, + "learning_rate": 4.559791655954648e-07, + "loss": 0.0001, + "step": 28975 + }, + { + "epoch": 11.783651891012607, + "grad_norm": 0.09339053010938755, + "learning_rate": 4.556767534451978e-07, + "loss": 0.0007, + "step": 28976 + }, + { + "epoch": 11.784058560390402, + "grad_norm": 0.003100528724304159, + "learning_rate": 4.553744392722126e-07, + "loss": 0.0, + "step": 28977 + }, + { + "epoch": 11.784465229768198, + "grad_norm": 0.0010655140793315694, + "learning_rate": 4.550722230796123e-07, + "loss": 0.0, + "step": 28978 + }, + { + "epoch": 11.784871899145994, + "grad_norm": 0.021570839871862667, + "learning_rate": 4.547701048704989e-07, + "loss": 0.0002, + "step": 28979 + }, + { + "epoch": 11.78527856852379, + "grad_norm": 0.07858068161932982, + "learning_rate": 4.5446808464797433e-07, + "loss": 0.0004, + "step": 28980 + }, + { + "epoch": 11.785685237901586, + "grad_norm": 0.0017744405052793047, + "learning_rate": 4.541661624151394e-07, + "loss": 0.0, + "step": 28981 + }, + { + "epoch": 11.786091907279381, + "grad_norm": 0.00019184119727698324, + "learning_rate": 4.53864338175094e-07, + "loss": 0.0, + "step": 28982 + }, + { + "epoch": 11.786498576657177, + "grad_norm": 0.005019351323123738, + "learning_rate": 4.535626119309344e-07, + "loss": 0.0, + "step": 28983 + }, + { + "epoch": 11.786905246034973, + "grad_norm": 0.0058632203153999645, + "learning_rate": 4.5326098368576046e-07, + "loss": 0.0, + "step": 28984 + }, + { + "epoch": 11.787311915412769, + "grad_norm": 0.0009372406326034313, + "learning_rate": 4.529594534426662e-07, + "loss": 0.0, + "step": 28985 + }, + { + "epoch": 11.787718584790564, + "grad_norm": 0.10149873811863654, + "learning_rate": 4.5265802120474824e-07, + "loss": 0.0008, + "step": 28986 + }, + { + "epoch": 11.788125254168362, + "grad_norm": 0.001101317003742758, + "learning_rate": 4.523566869751006e-07, + "loss": 0.0, + "step": 28987 + }, + { + "epoch": 11.788531923546158, + "grad_norm": 0.0023015743480294987, + "learning_rate": 4.520554507568176e-07, + "loss": 0.0, + "step": 28988 + }, + { + "epoch": 11.788938592923953, + "grad_norm": 0.0002073316871922118, + "learning_rate": 4.5175431255298994e-07, + "loss": 0.0, + "step": 28989 + }, + { + "epoch": 11.78934526230175, + "grad_norm": 0.09219149328284267, + "learning_rate": 4.514532723667109e-07, + "loss": 0.001, + "step": 28990 + }, + { + "epoch": 11.789751931679545, + "grad_norm": 0.005307470282991968, + "learning_rate": 4.5115233020106894e-07, + "loss": 0.0, + "step": 28991 + }, + { + "epoch": 11.79015860105734, + "grad_norm": 0.0076687482000648045, + "learning_rate": 4.5085148605915284e-07, + "loss": 0.0001, + "step": 28992 + }, + { + "epoch": 11.790565270435136, + "grad_norm": 0.060018616328257, + "learning_rate": 4.505507399440534e-07, + "loss": 0.0005, + "step": 28993 + }, + { + "epoch": 11.790971939812932, + "grad_norm": 0.15170140422535197, + "learning_rate": 4.5025009185885706e-07, + "loss": 0.0017, + "step": 28994 + }, + { + 
"epoch": 11.791378609190728, + "grad_norm": 2.524500061008174e-05, + "learning_rate": 4.499495418066502e-07, + "loss": 0.0, + "step": 28995 + }, + { + "epoch": 11.791785278568524, + "grad_norm": 0.00043727295834086207, + "learning_rate": 4.4964908979051704e-07, + "loss": 0.0, + "step": 28996 + }, + { + "epoch": 11.79219194794632, + "grad_norm": 0.07522737077289642, + "learning_rate": 4.493487358135429e-07, + "loss": 0.0007, + "step": 28997 + }, + { + "epoch": 11.792598617324115, + "grad_norm": 0.0003122394078116321, + "learning_rate": 4.490484798788108e-07, + "loss": 0.0, + "step": 28998 + }, + { + "epoch": 11.793005286701911, + "grad_norm": 0.0038206507344542653, + "learning_rate": 4.4874832198940176e-07, + "loss": 0.0, + "step": 28999 + }, + { + "epoch": 11.793411956079707, + "grad_norm": 3.2969857022934465e-06, + "learning_rate": 4.4844826214839987e-07, + "loss": 0.0, + "step": 29000 + }, + { + "epoch": 11.793818625457503, + "grad_norm": 0.4271633811740299, + "learning_rate": 4.481483003588827e-07, + "loss": 0.0038, + "step": 29001 + }, + { + "epoch": 11.794225294835298, + "grad_norm": 0.0011429731542803925, + "learning_rate": 4.4784843662393105e-07, + "loss": 0.0, + "step": 29002 + }, + { + "epoch": 11.794631964213094, + "grad_norm": 0.06545771225222227, + "learning_rate": 4.475486709466226e-07, + "loss": 0.0006, + "step": 29003 + }, + { + "epoch": 11.79503863359089, + "grad_norm": 1.7341853404670843e-05, + "learning_rate": 4.472490033300358e-07, + "loss": 0.0, + "step": 29004 + }, + { + "epoch": 11.795445302968686, + "grad_norm": 0.0036037573002298574, + "learning_rate": 4.4694943377724397e-07, + "loss": 0.0, + "step": 29005 + }, + { + "epoch": 11.795851972346483, + "grad_norm": 0.0002280254031774309, + "learning_rate": 4.466499622913245e-07, + "loss": 0.0, + "step": 29006 + }, + { + "epoch": 11.796258641724279, + "grad_norm": 0.0015713020080328928, + "learning_rate": 4.463505888753517e-07, + "loss": 0.0, + "step": 29007 + }, + { + "epoch": 11.796665311102075, + "grad_norm": 0.0022148814430124806, + "learning_rate": 4.460513135323985e-07, + "loss": 0.0, + "step": 29008 + }, + { + "epoch": 11.79707198047987, + "grad_norm": 0.46877954131192934, + "learning_rate": 4.4575213626553704e-07, + "loss": 0.0051, + "step": 29009 + }, + { + "epoch": 11.797478649857666, + "grad_norm": 0.18928491227872973, + "learning_rate": 4.454530570778381e-07, + "loss": 0.0015, + "step": 29010 + }, + { + "epoch": 11.797885319235462, + "grad_norm": 0.0021866339467479508, + "learning_rate": 4.451540759723727e-07, + "loss": 0.0, + "step": 29011 + }, + { + "epoch": 11.798291988613258, + "grad_norm": 0.001132852314783265, + "learning_rate": 4.448551929522071e-07, + "loss": 0.0, + "step": 29012 + }, + { + "epoch": 11.798698657991054, + "grad_norm": 0.004555679056436329, + "learning_rate": 4.4455640802041455e-07, + "loss": 0.0, + "step": 29013 + }, + { + "epoch": 11.79910532736885, + "grad_norm": 0.0005925931741493649, + "learning_rate": 4.4425772118005917e-07, + "loss": 0.0, + "step": 29014 + }, + { + "epoch": 11.799511996746645, + "grad_norm": 0.04332023676038154, + "learning_rate": 4.439591324342074e-07, + "loss": 0.0003, + "step": 29015 + }, + { + "epoch": 11.79991866612444, + "grad_norm": 0.0007985864618085749, + "learning_rate": 4.436606417859257e-07, + "loss": 0.0, + "step": 29016 + }, + { + "epoch": 11.800325335502237, + "grad_norm": 0.01068928593153541, + "learning_rate": 4.433622492382761e-07, + "loss": 0.0001, + "step": 29017 + }, + { + "epoch": 11.800732004880032, + "grad_norm": 0.019028157348348482, + 
"learning_rate": 4.430639547943227e-07, + "loss": 0.0002, + "step": 29018 + }, + { + "epoch": 11.801138674257828, + "grad_norm": 0.0015164084544883234, + "learning_rate": 4.4276575845712653e-07, + "loss": 0.0, + "step": 29019 + }, + { + "epoch": 11.801545343635624, + "grad_norm": 0.0017805759175833242, + "learning_rate": 4.4246766022975176e-07, + "loss": 0.0, + "step": 29020 + }, + { + "epoch": 11.80195201301342, + "grad_norm": 0.0011108348786134344, + "learning_rate": 4.4216966011525586e-07, + "loss": 0.0, + "step": 29021 + }, + { + "epoch": 11.802358682391215, + "grad_norm": 0.02176426435528255, + "learning_rate": 4.418717581166987e-07, + "loss": 0.0002, + "step": 29022 + }, + { + "epoch": 11.802765351769011, + "grad_norm": 0.005448560018646791, + "learning_rate": 4.415739542371389e-07, + "loss": 0.0, + "step": 29023 + }, + { + "epoch": 11.803172021146807, + "grad_norm": 0.31527580740156763, + "learning_rate": 4.4127624847963403e-07, + "loss": 0.0031, + "step": 29024 + }, + { + "epoch": 11.803578690524603, + "grad_norm": 0.018215822442471278, + "learning_rate": 4.409786408472361e-07, + "loss": 0.0001, + "step": 29025 + }, + { + "epoch": 11.803985359902398, + "grad_norm": 0.0011689443083570637, + "learning_rate": 4.406811313430071e-07, + "loss": 0.0, + "step": 29026 + }, + { + "epoch": 11.804392029280194, + "grad_norm": 1.5109206796565794e-06, + "learning_rate": 4.4038371996999563e-07, + "loss": 0.0, + "step": 29027 + }, + { + "epoch": 11.804798698657992, + "grad_norm": 0.015513962434745048, + "learning_rate": 4.4008640673125715e-07, + "loss": 0.0001, + "step": 29028 + }, + { + "epoch": 11.805205368035788, + "grad_norm": 0.02023342041879469, + "learning_rate": 4.3978919162984245e-07, + "loss": 0.0002, + "step": 29029 + }, + { + "epoch": 11.805612037413583, + "grad_norm": 0.10635902456404157, + "learning_rate": 4.3949207466880253e-07, + "loss": 0.0007, + "step": 29030 + }, + { + "epoch": 11.806018706791379, + "grad_norm": 0.029093151036580642, + "learning_rate": 4.3919505585118926e-07, + "loss": 0.0002, + "step": 29031 + }, + { + "epoch": 11.806425376169175, + "grad_norm": 0.0015629848471170177, + "learning_rate": 4.388981351800503e-07, + "loss": 0.0, + "step": 29032 + }, + { + "epoch": 11.80683204554697, + "grad_norm": 0.023179077172655507, + "learning_rate": 4.386013126584343e-07, + "loss": 0.0002, + "step": 29033 + }, + { + "epoch": 11.807238714924766, + "grad_norm": 0.00769063652402471, + "learning_rate": 4.383045882893877e-07, + "loss": 0.0001, + "step": 29034 + }, + { + "epoch": 11.807645384302562, + "grad_norm": 0.00038809767528087083, + "learning_rate": 4.3800796207595695e-07, + "loss": 0.0, + "step": 29035 + }, + { + "epoch": 11.808052053680358, + "grad_norm": 0.0011423451196962986, + "learning_rate": 4.3771143402118745e-07, + "loss": 0.0, + "step": 29036 + }, + { + "epoch": 11.808458723058154, + "grad_norm": 0.027764045629425086, + "learning_rate": 4.374150041281211e-07, + "loss": 0.0003, + "step": 29037 + }, + { + "epoch": 11.80886539243595, + "grad_norm": 0.007464574869640771, + "learning_rate": 4.3711867239980335e-07, + "loss": 0.0001, + "step": 29038 + }, + { + "epoch": 11.809272061813745, + "grad_norm": 0.26357370437620997, + "learning_rate": 4.36822438839275e-07, + "loss": 0.001, + "step": 29039 + }, + { + "epoch": 11.809678731191541, + "grad_norm": 0.001445923767400908, + "learning_rate": 4.3652630344957813e-07, + "loss": 0.0, + "step": 29040 + }, + { + "epoch": 11.810085400569337, + "grad_norm": 4.610025857744291e-05, + "learning_rate": 4.3623026623375255e-07, + "loss": 
0.0, + "step": 29041 + }, + { + "epoch": 11.810492069947133, + "grad_norm": 0.04373976579703438, + "learning_rate": 4.359343271948358e-07, + "loss": 0.0003, + "step": 29042 + }, + { + "epoch": 11.810898739324928, + "grad_norm": 0.11946657739102472, + "learning_rate": 4.3563848633586536e-07, + "loss": 0.0007, + "step": 29043 + }, + { + "epoch": 11.811305408702724, + "grad_norm": 0.021117826648499507, + "learning_rate": 4.3534274365988004e-07, + "loss": 0.0001, + "step": 29044 + }, + { + "epoch": 11.81171207808052, + "grad_norm": 0.3013296215196321, + "learning_rate": 4.3504709916991627e-07, + "loss": 0.0014, + "step": 29045 + }, + { + "epoch": 11.812118747458316, + "grad_norm": 0.0018533381215113487, + "learning_rate": 4.347515528690083e-07, + "loss": 0.0, + "step": 29046 + }, + { + "epoch": 11.812525416836113, + "grad_norm": 5.284132142555685e-05, + "learning_rate": 4.344561047601892e-07, + "loss": 0.0, + "step": 29047 + }, + { + "epoch": 11.812932086213909, + "grad_norm": 0.01643093621602002, + "learning_rate": 4.341607548464921e-07, + "loss": 0.0001, + "step": 29048 + }, + { + "epoch": 11.813338755591705, + "grad_norm": 0.0009152876701952805, + "learning_rate": 4.338655031309491e-07, + "loss": 0.0, + "step": 29049 + }, + { + "epoch": 11.8137454249695, + "grad_norm": 0.0030309847159973604, + "learning_rate": 4.3357034961659105e-07, + "loss": 0.0, + "step": 29050 + }, + { + "epoch": 11.814152094347296, + "grad_norm": 0.001647316536593746, + "learning_rate": 4.332752943064478e-07, + "loss": 0.0, + "step": 29051 + }, + { + "epoch": 11.814558763725092, + "grad_norm": 0.10266158428922793, + "learning_rate": 4.329803372035501e-07, + "loss": 0.0009, + "step": 29052 + }, + { + "epoch": 11.814965433102888, + "grad_norm": 0.08032553454034931, + "learning_rate": 4.3268547831092356e-07, + "loss": 0.0007, + "step": 29053 + }, + { + "epoch": 11.815372102480683, + "grad_norm": 0.02935415379421301, + "learning_rate": 4.3239071763159555e-07, + "loss": 0.0003, + "step": 29054 + }, + { + "epoch": 11.81577877185848, + "grad_norm": 0.014825367409148311, + "learning_rate": 4.3209605516859153e-07, + "loss": 0.0001, + "step": 29055 + }, + { + "epoch": 11.816185441236275, + "grad_norm": 0.006380672799765449, + "learning_rate": 4.3180149092493684e-07, + "loss": 0.0, + "step": 29056 + }, + { + "epoch": 11.81659211061407, + "grad_norm": 0.01987264979187094, + "learning_rate": 4.315070249036557e-07, + "loss": 0.0001, + "step": 29057 + }, + { + "epoch": 11.816998779991867, + "grad_norm": 0.0003529160291118765, + "learning_rate": 4.312126571077713e-07, + "loss": 0.0, + "step": 29058 + }, + { + "epoch": 11.817405449369662, + "grad_norm": 0.03088404644143976, + "learning_rate": 4.3091838754030445e-07, + "loss": 0.0003, + "step": 29059 + }, + { + "epoch": 11.817812118747458, + "grad_norm": 0.009939651445347307, + "learning_rate": 4.306242162042773e-07, + "loss": 0.0001, + "step": 29060 + }, + { + "epoch": 11.818218788125254, + "grad_norm": 0.006356294917933393, + "learning_rate": 4.3033014310270736e-07, + "loss": 0.0001, + "step": 29061 + }, + { + "epoch": 11.81862545750305, + "grad_norm": 0.8071836010155691, + "learning_rate": 4.300361682386156e-07, + "loss": 0.0047, + "step": 29062 + }, + { + "epoch": 11.819032126880845, + "grad_norm": 0.013237547570556878, + "learning_rate": 4.2974229161501737e-07, + "loss": 0.0001, + "step": 29063 + }, + { + "epoch": 11.819438796258641, + "grad_norm": 0.011439106198276634, + "learning_rate": 4.294485132349335e-07, + "loss": 0.0001, + "step": 29064 + }, + { + "epoch": 
11.819845465636437, + "grad_norm": 0.00014228089309849574, + "learning_rate": 4.291548331013762e-07, + "loss": 0.0, + "step": 29065 + }, + { + "epoch": 11.820252135014233, + "grad_norm": 0.0008643267960556909, + "learning_rate": 4.2886125121736176e-07, + "loss": 0.0, + "step": 29066 + }, + { + "epoch": 11.820658804392028, + "grad_norm": 0.0024124165585438714, + "learning_rate": 4.2856776758590347e-07, + "loss": 0.0, + "step": 29067 + }, + { + "epoch": 11.821065473769824, + "grad_norm": 0.0668977377643599, + "learning_rate": 4.282743822100144e-07, + "loss": 0.0003, + "step": 29068 + }, + { + "epoch": 11.821472143147622, + "grad_norm": 0.010897355639062226, + "learning_rate": 4.279810950927055e-07, + "loss": 0.0001, + "step": 29069 + }, + { + "epoch": 11.821878812525418, + "grad_norm": 0.0032919300875790636, + "learning_rate": 4.2768790623698877e-07, + "loss": 0.0, + "step": 29070 + }, + { + "epoch": 11.822285481903213, + "grad_norm": 0.0006822801228053573, + "learning_rate": 4.273948156458729e-07, + "loss": 0.0, + "step": 29071 + }, + { + "epoch": 11.822692151281009, + "grad_norm": 0.0004983804435942576, + "learning_rate": 4.271018233223678e-07, + "loss": 0.0, + "step": 29072 + }, + { + "epoch": 11.823098820658805, + "grad_norm": 0.006262062765358512, + "learning_rate": 4.268089292694799e-07, + "loss": 0.0, + "step": 29073 + }, + { + "epoch": 11.8235054900366, + "grad_norm": 0.03408903069950826, + "learning_rate": 4.265161334902168e-07, + "loss": 0.0003, + "step": 29074 + }, + { + "epoch": 11.823912159414396, + "grad_norm": 0.005052378378408292, + "learning_rate": 4.2622343598758386e-07, + "loss": 0.0, + "step": 29075 + }, + { + "epoch": 11.824318828792192, + "grad_norm": 0.0014256458202621643, + "learning_rate": 4.2593083676458426e-07, + "loss": 0.0, + "step": 29076 + }, + { + "epoch": 11.824725498169988, + "grad_norm": 0.06280222104607651, + "learning_rate": 4.256383358242244e-07, + "loss": 0.0004, + "step": 29077 + }, + { + "epoch": 11.825132167547784, + "grad_norm": 0.11923538664119214, + "learning_rate": 4.2534593316950643e-07, + "loss": 0.0011, + "step": 29078 + }, + { + "epoch": 11.82553883692558, + "grad_norm": 0.018004876798558257, + "learning_rate": 4.2505362880343017e-07, + "loss": 0.0002, + "step": 29079 + }, + { + "epoch": 11.825945506303375, + "grad_norm": 0.0037911149898243626, + "learning_rate": 4.2476142272899755e-07, + "loss": 0.0, + "step": 29080 + }, + { + "epoch": 11.826352175681171, + "grad_norm": 0.0025860418873674793, + "learning_rate": 4.244693149492085e-07, + "loss": 0.0, + "step": 29081 + }, + { + "epoch": 11.826758845058967, + "grad_norm": 0.04782127771985793, + "learning_rate": 4.2417730546705947e-07, + "loss": 0.0003, + "step": 29082 + }, + { + "epoch": 11.827165514436762, + "grad_norm": 0.0007310392241475254, + "learning_rate": 4.238853942855514e-07, + "loss": 0.0, + "step": 29083 + }, + { + "epoch": 11.827572183814558, + "grad_norm": 0.0009709923943685602, + "learning_rate": 4.235935814076797e-07, + "loss": 0.0, + "step": 29084 + }, + { + "epoch": 11.827978853192354, + "grad_norm": 0.00016576281676814014, + "learning_rate": 4.233018668364397e-07, + "loss": 0.0, + "step": 29085 + }, + { + "epoch": 11.82838552257015, + "grad_norm": 0.027190719389930437, + "learning_rate": 4.230102505748246e-07, + "loss": 0.0002, + "step": 29086 + }, + { + "epoch": 11.828792191947946, + "grad_norm": 0.0040174325409521715, + "learning_rate": 4.2271873262583084e-07, + "loss": 0.0, + "step": 29087 + }, + { + "epoch": 11.829198861325743, + "grad_norm": 0.0006047426145836429, + 
"learning_rate": 4.2242731299244833e-07, + "loss": 0.0, + "step": 29088 + }, + { + "epoch": 11.829605530703539, + "grad_norm": 0.018767662170099862, + "learning_rate": 4.2213599167766904e-07, + "loss": 0.0001, + "step": 29089 + }, + { + "epoch": 11.830012200081335, + "grad_norm": 0.013394195562408947, + "learning_rate": 4.218447686844851e-07, + "loss": 0.0001, + "step": 29090 + }, + { + "epoch": 11.83041886945913, + "grad_norm": 2.7360809432218455e-05, + "learning_rate": 4.2155364401588515e-07, + "loss": 0.0, + "step": 29091 + }, + { + "epoch": 11.830825538836926, + "grad_norm": 0.009195190162334924, + "learning_rate": 4.21262617674858e-07, + "loss": 0.0001, + "step": 29092 + }, + { + "epoch": 11.831232208214722, + "grad_norm": 0.0018800169279620572, + "learning_rate": 4.209716896643912e-07, + "loss": 0.0, + "step": 29093 + }, + { + "epoch": 11.831638877592518, + "grad_norm": 0.055029885173037835, + "learning_rate": 4.206808599874701e-07, + "loss": 0.0005, + "step": 29094 + }, + { + "epoch": 11.832045546970313, + "grad_norm": 0.0026516980467254344, + "learning_rate": 4.203901286470813e-07, + "loss": 0.0, + "step": 29095 + }, + { + "epoch": 11.83245221634811, + "grad_norm": 0.16354791440216276, + "learning_rate": 4.20099495646209e-07, + "loss": 0.0012, + "step": 29096 + }, + { + "epoch": 11.832858885725905, + "grad_norm": 0.0007997289485525229, + "learning_rate": 4.1980896098783755e-07, + "loss": 0.0, + "step": 29097 + }, + { + "epoch": 11.8332655551037, + "grad_norm": 0.0007803997452012662, + "learning_rate": 4.195185246749489e-07, + "loss": 0.0, + "step": 29098 + }, + { + "epoch": 11.833672224481496, + "grad_norm": 4.54591878046627e-05, + "learning_rate": 4.192281867105241e-07, + "loss": 0.0, + "step": 29099 + }, + { + "epoch": 11.834078893859292, + "grad_norm": 0.02346970135438624, + "learning_rate": 4.189379470975441e-07, + "loss": 0.0002, + "step": 29100 + }, + { + "epoch": 11.834485563237088, + "grad_norm": 5.5958222815770895e-05, + "learning_rate": 4.1864780583898755e-07, + "loss": 0.0, + "step": 29101 + }, + { + "epoch": 11.834892232614884, + "grad_norm": 0.0006876540226188074, + "learning_rate": 4.1835776293783216e-07, + "loss": 0.0, + "step": 29102 + }, + { + "epoch": 11.83529890199268, + "grad_norm": 0.0011286854243735941, + "learning_rate": 4.180678183970588e-07, + "loss": 0.0, + "step": 29103 + }, + { + "epoch": 11.835705571370475, + "grad_norm": 0.0055639336423144, + "learning_rate": 4.1777797221964176e-07, + "loss": 0.0, + "step": 29104 + }, + { + "epoch": 11.836112240748271, + "grad_norm": 0.01940154079406213, + "learning_rate": 4.174882244085554e-07, + "loss": 0.0002, + "step": 29105 + }, + { + "epoch": 11.836518910126067, + "grad_norm": 0.0004260290472698521, + "learning_rate": 4.1719857496677616e-07, + "loss": 0.0, + "step": 29106 + }, + { + "epoch": 11.836925579503863, + "grad_norm": 0.18712298226287352, + "learning_rate": 4.169090238972762e-07, + "loss": 0.0017, + "step": 29107 + }, + { + "epoch": 11.837332248881658, + "grad_norm": 0.007772023509349918, + "learning_rate": 4.166195712030263e-07, + "loss": 0.0001, + "step": 29108 + }, + { + "epoch": 11.837738918259454, + "grad_norm": 0.05318323182784142, + "learning_rate": 4.1633021688700206e-07, + "loss": 0.0005, + "step": 29109 + }, + { + "epoch": 11.838145587637252, + "grad_norm": 0.008154504537126366, + "learning_rate": 4.1604096095217096e-07, + "loss": 0.0001, + "step": 29110 + }, + { + "epoch": 11.838552257015047, + "grad_norm": 0.023060623207381746, + "learning_rate": 4.157518034015029e-07, + "loss": 0.0001, + 
"step": 29111 + }, + { + "epoch": 11.838958926392843, + "grad_norm": 0.012768087953049732, + "learning_rate": 4.1546274423796553e-07, + "loss": 0.0001, + "step": 29112 + }, + { + "epoch": 11.839365595770639, + "grad_norm": 0.1069551949349116, + "learning_rate": 4.151737834645275e-07, + "loss": 0.0009, + "step": 29113 + }, + { + "epoch": 11.839772265148435, + "grad_norm": 0.0008977168089859394, + "learning_rate": 4.1488492108415545e-07, + "loss": 0.0, + "step": 29114 + }, + { + "epoch": 11.84017893452623, + "grad_norm": 0.005053336803193595, + "learning_rate": 4.145961570998114e-07, + "loss": 0.0, + "step": 29115 + }, + { + "epoch": 11.840585603904026, + "grad_norm": 0.001641925810765075, + "learning_rate": 4.1430749151446294e-07, + "loss": 0.0, + "step": 29116 + }, + { + "epoch": 11.840992273281822, + "grad_norm": 0.008987260619556072, + "learning_rate": 4.1401892433107327e-07, + "loss": 0.0, + "step": 29117 + }, + { + "epoch": 11.841398942659618, + "grad_norm": 0.00023678534890688772, + "learning_rate": 4.137304555526034e-07, + "loss": 0.0, + "step": 29118 + }, + { + "epoch": 11.841805612037414, + "grad_norm": 0.004243428399807518, + "learning_rate": 4.1344208518201533e-07, + "loss": 0.0, + "step": 29119 + }, + { + "epoch": 11.84221228141521, + "grad_norm": 0.0006176477771321392, + "learning_rate": 4.131538132222701e-07, + "loss": 0.0, + "step": 29120 + }, + { + "epoch": 11.842618950793005, + "grad_norm": 0.11334633808929724, + "learning_rate": 4.1286563967632423e-07, + "loss": 0.0011, + "step": 29121 + }, + { + "epoch": 11.8430256201708, + "grad_norm": 0.0009678155913088398, + "learning_rate": 4.125775645471375e-07, + "loss": 0.0, + "step": 29122 + }, + { + "epoch": 11.843432289548597, + "grad_norm": 0.0013201345192707273, + "learning_rate": 4.122895878376676e-07, + "loss": 0.0, + "step": 29123 + }, + { + "epoch": 11.843838958926392, + "grad_norm": 0.03313610850308032, + "learning_rate": 4.1200170955087104e-07, + "loss": 0.0002, + "step": 29124 + }, + { + "epoch": 11.844245628304188, + "grad_norm": 0.03304264150905106, + "learning_rate": 4.1171392968970323e-07, + "loss": 0.0002, + "step": 29125 + }, + { + "epoch": 11.844652297681984, + "grad_norm": 0.1103053862145214, + "learning_rate": 4.114262482571174e-07, + "loss": 0.0011, + "step": 29126 + }, + { + "epoch": 11.84505896705978, + "grad_norm": 0.0037976890815842835, + "learning_rate": 4.111386652560656e-07, + "loss": 0.0, + "step": 29127 + }, + { + "epoch": 11.845465636437575, + "grad_norm": 0.13200553040302465, + "learning_rate": 4.108511806895021e-07, + "loss": 0.0005, + "step": 29128 + }, + { + "epoch": 11.845872305815373, + "grad_norm": 0.009449281867264759, + "learning_rate": 4.10563794560378e-07, + "loss": 0.0001, + "step": 29129 + }, + { + "epoch": 11.846278975193169, + "grad_norm": 0.009161758395428016, + "learning_rate": 4.1027650687164187e-07, + "loss": 0.0001, + "step": 29130 + }, + { + "epoch": 11.846685644570965, + "grad_norm": 0.04307192111041294, + "learning_rate": 4.0998931762624374e-07, + "loss": 0.0004, + "step": 29131 + }, + { + "epoch": 11.84709231394876, + "grad_norm": 0.0028217928929670536, + "learning_rate": 4.097022268271322e-07, + "loss": 0.0, + "step": 29132 + }, + { + "epoch": 11.847498983326556, + "grad_norm": 0.0004184718798859344, + "learning_rate": 4.094152344772528e-07, + "loss": 0.0, + "step": 29133 + }, + { + "epoch": 11.847905652704352, + "grad_norm": 0.006429920236219773, + "learning_rate": 4.091283405795543e-07, + "loss": 0.0001, + "step": 29134 + }, + { + "epoch": 11.848312322082148, + "grad_norm": 
0.002740336370023255, + "learning_rate": 4.0884154513698094e-07, + "loss": 0.0, + "step": 29135 + }, + { + "epoch": 11.848718991459943, + "grad_norm": 0.0007632280322395172, + "learning_rate": 4.085548481524748e-07, + "loss": 0.0, + "step": 29136 + }, + { + "epoch": 11.849125660837739, + "grad_norm": 1.0196548990810598e-05, + "learning_rate": 4.0826824962898135e-07, + "loss": 0.0, + "step": 29137 + }, + { + "epoch": 11.849532330215535, + "grad_norm": 0.01915990561603669, + "learning_rate": 4.079817495694416e-07, + "loss": 0.0001, + "step": 29138 + }, + { + "epoch": 11.84993899959333, + "grad_norm": 0.024340084748234347, + "learning_rate": 4.076953479767964e-07, + "loss": 0.0002, + "step": 29139 + }, + { + "epoch": 11.850345668971126, + "grad_norm": 0.0008977562275579012, + "learning_rate": 4.074090448539858e-07, + "loss": 0.0, + "step": 29140 + }, + { + "epoch": 11.850752338348922, + "grad_norm": 0.0006358379398586457, + "learning_rate": 4.0712284020394953e-07, + "loss": 0.0, + "step": 29141 + }, + { + "epoch": 11.851159007726718, + "grad_norm": 0.0024704160964482986, + "learning_rate": 4.068367340296253e-07, + "loss": 0.0, + "step": 29142 + }, + { + "epoch": 11.851565677104514, + "grad_norm": 0.4054816720753352, + "learning_rate": 4.0655072633395077e-07, + "loss": 0.0036, + "step": 29143 + }, + { + "epoch": 11.85197234648231, + "grad_norm": 0.10431064040728606, + "learning_rate": 4.0626481711986135e-07, + "loss": 0.0008, + "step": 29144 + }, + { + "epoch": 11.852379015860105, + "grad_norm": 0.03886702958241213, + "learning_rate": 4.059790063902913e-07, + "loss": 0.0004, + "step": 29145 + }, + { + "epoch": 11.852785685237901, + "grad_norm": 8.682033633054272e-05, + "learning_rate": 4.056932941481739e-07, + "loss": 0.0, + "step": 29146 + }, + { + "epoch": 11.853192354615697, + "grad_norm": 5.317776909078147e-05, + "learning_rate": 4.0540768039644575e-07, + "loss": 0.0, + "step": 29147 + }, + { + "epoch": 11.853599023993493, + "grad_norm": 0.020096026492925488, + "learning_rate": 4.051221651380366e-07, + "loss": 0.0001, + "step": 29148 + }, + { + "epoch": 11.854005693371288, + "grad_norm": 0.00039783941415136434, + "learning_rate": 4.048367483758764e-07, + "loss": 0.0, + "step": 29149 + }, + { + "epoch": 11.854412362749084, + "grad_norm": 0.02005413920525303, + "learning_rate": 4.045514301128972e-07, + "loss": 0.0002, + "step": 29150 + }, + { + "epoch": 11.854819032126882, + "grad_norm": 0.04068989118571378, + "learning_rate": 4.0426621035202564e-07, + "loss": 0.0005, + "step": 29151 + }, + { + "epoch": 11.855225701504677, + "grad_norm": 0.002593106124564445, + "learning_rate": 4.0398108909619147e-07, + "loss": 0.0, + "step": 29152 + }, + { + "epoch": 11.855632370882473, + "grad_norm": 0.0009203289700647386, + "learning_rate": 4.0369606634832024e-07, + "loss": 0.0, + "step": 29153 + }, + { + "epoch": 11.856039040260269, + "grad_norm": 0.11758047233427613, + "learning_rate": 4.0341114211133957e-07, + "loss": 0.001, + "step": 29154 + }, + { + "epoch": 11.856445709638065, + "grad_norm": 0.0677774290400044, + "learning_rate": 4.0312631638817377e-07, + "loss": 0.0006, + "step": 29155 + }, + { + "epoch": 11.85685237901586, + "grad_norm": 0.5574253162877697, + "learning_rate": 4.0284158918174497e-07, + "loss": 0.0039, + "step": 29156 + }, + { + "epoch": 11.857259048393656, + "grad_norm": 0.2586828087204565, + "learning_rate": 4.025569604949786e-07, + "loss": 0.0025, + "step": 29157 + }, + { + "epoch": 11.857665717771452, + "grad_norm": 0.010595171370430832, + "learning_rate": 4.022724303307957e-07, 
+ "loss": 0.0001, + "step": 29158 + }, + { + "epoch": 11.858072387149248, + "grad_norm": 0.001770561631522249, + "learning_rate": 4.019879986921149e-07, + "loss": 0.0, + "step": 29159 + }, + { + "epoch": 11.858479056527043, + "grad_norm": 0.002015701092853469, + "learning_rate": 4.017036655818585e-07, + "loss": 0.0, + "step": 29160 + }, + { + "epoch": 11.85888572590484, + "grad_norm": 0.0036981728466187903, + "learning_rate": 4.0141943100294514e-07, + "loss": 0.0, + "step": 29161 + }, + { + "epoch": 11.859292395282635, + "grad_norm": 0.06787921702653975, + "learning_rate": 4.011352949582925e-07, + "loss": 0.0006, + "step": 29162 + }, + { + "epoch": 11.85969906466043, + "grad_norm": 0.019314164924245994, + "learning_rate": 4.008512574508172e-07, + "loss": 0.0002, + "step": 29163 + }, + { + "epoch": 11.860105734038227, + "grad_norm": 0.001117385652885703, + "learning_rate": 4.005673184834347e-07, + "loss": 0.0, + "step": 29164 + }, + { + "epoch": 11.860512403416022, + "grad_norm": 0.0006720432823728315, + "learning_rate": 4.002834780590592e-07, + "loss": 0.0, + "step": 29165 + }, + { + "epoch": 11.860919072793818, + "grad_norm": 0.04856777966899055, + "learning_rate": 3.999997361806052e-07, + "loss": 0.0002, + "step": 29166 + }, + { + "epoch": 11.861325742171614, + "grad_norm": 0.1301501041606539, + "learning_rate": 3.997160928509858e-07, + "loss": 0.0012, + "step": 29167 + }, + { + "epoch": 11.86173241154941, + "grad_norm": 0.010964263228490844, + "learning_rate": 3.9943254807311313e-07, + "loss": 0.0001, + "step": 29168 + }, + { + "epoch": 11.862139080927205, + "grad_norm": 0.4330132729117379, + "learning_rate": 3.9914910184989607e-07, + "loss": 0.0045, + "step": 29169 + }, + { + "epoch": 11.862545750305003, + "grad_norm": 0.03108530192177059, + "learning_rate": 3.988657541842467e-07, + "loss": 0.0003, + "step": 29170 + }, + { + "epoch": 11.862952419682799, + "grad_norm": 0.00012860092397188873, + "learning_rate": 3.9858250507907147e-07, + "loss": 0.0, + "step": 29171 + }, + { + "epoch": 11.863359089060594, + "grad_norm": 0.0015181890725361753, + "learning_rate": 3.982993545372793e-07, + "loss": 0.0, + "step": 29172 + }, + { + "epoch": 11.86376575843839, + "grad_norm": 0.0018883540666623237, + "learning_rate": 3.980163025617767e-07, + "loss": 0.0, + "step": 29173 + }, + { + "epoch": 11.864172427816186, + "grad_norm": 0.01605004306368996, + "learning_rate": 3.977333491554702e-07, + "loss": 0.0001, + "step": 29174 + }, + { + "epoch": 11.864579097193982, + "grad_norm": 0.00029443405787367335, + "learning_rate": 3.9745049432126313e-07, + "loss": 0.0, + "step": 29175 + }, + { + "epoch": 11.864985766571778, + "grad_norm": 0.031701307367933015, + "learning_rate": 3.971677380620598e-07, + "loss": 0.0002, + "step": 29176 + }, + { + "epoch": 11.865392435949573, + "grad_norm": 0.011194401578485618, + "learning_rate": 3.968850803807622e-07, + "loss": 0.0001, + "step": 29177 + }, + { + "epoch": 11.865799105327369, + "grad_norm": 0.004271619421102945, + "learning_rate": 3.9660252128027264e-07, + "loss": 0.0001, + "step": 29178 + }, + { + "epoch": 11.866205774705165, + "grad_norm": 0.0006828888267206709, + "learning_rate": 3.963200607634898e-07, + "loss": 0.0, + "step": 29179 + }, + { + "epoch": 11.86661244408296, + "grad_norm": 0.014360596241647303, + "learning_rate": 3.96037698833317e-07, + "loss": 0.0001, + "step": 29180 + }, + { + "epoch": 11.867019113460756, + "grad_norm": 9.200396235074549e-05, + "learning_rate": 3.957554354926507e-07, + "loss": 0.0, + "step": 29181 + }, + { + "epoch": 
11.867425782838552, + "grad_norm": 7.71612788984452e-05, + "learning_rate": 3.9547327074438757e-07, + "loss": 0.0, + "step": 29182 + }, + { + "epoch": 11.867832452216348, + "grad_norm": 0.011316992364581885, + "learning_rate": 3.9519120459142636e-07, + "loss": 0.0001, + "step": 29183 + }, + { + "epoch": 11.868239121594144, + "grad_norm": 0.0011283148538633004, + "learning_rate": 3.949092370366614e-07, + "loss": 0.0, + "step": 29184 + }, + { + "epoch": 11.86864579097194, + "grad_norm": 0.004235513019297699, + "learning_rate": 3.9462736808298484e-07, + "loss": 0.0, + "step": 29185 + }, + { + "epoch": 11.869052460349735, + "grad_norm": 0.022701062341294886, + "learning_rate": 3.9434559773329553e-07, + "loss": 0.0003, + "step": 29186 + }, + { + "epoch": 11.869459129727531, + "grad_norm": 0.005780766553191996, + "learning_rate": 3.940639259904822e-07, + "loss": 0.0001, + "step": 29187 + }, + { + "epoch": 11.869865799105327, + "grad_norm": 5.127553889027984e-06, + "learning_rate": 3.937823528574369e-07, + "loss": 0.0, + "step": 29188 + }, + { + "epoch": 11.870272468483122, + "grad_norm": 0.003098967860562699, + "learning_rate": 3.9350087833705085e-07, + "loss": 0.0, + "step": 29189 + }, + { + "epoch": 11.870679137860918, + "grad_norm": 0.0004088747270255087, + "learning_rate": 3.9321950243221277e-07, + "loss": 0.0, + "step": 29190 + }, + { + "epoch": 11.871085807238714, + "grad_norm": 0.008018700654224305, + "learning_rate": 3.929382251458125e-07, + "loss": 0.0001, + "step": 29191 + }, + { + "epoch": 11.871492476616512, + "grad_norm": 0.0003795727262181469, + "learning_rate": 3.9265704648073445e-07, + "loss": 0.0, + "step": 29192 + }, + { + "epoch": 11.871899145994307, + "grad_norm": 0.004943800555248754, + "learning_rate": 3.9237596643986855e-07, + "loss": 0.0001, + "step": 29193 + }, + { + "epoch": 11.872305815372103, + "grad_norm": 0.03250848261243364, + "learning_rate": 3.92094985026098e-07, + "loss": 0.0002, + "step": 29194 + }, + { + "epoch": 11.872712484749899, + "grad_norm": 0.0006116404761409762, + "learning_rate": 3.918141022423094e-07, + "loss": 0.0, + "step": 29195 + }, + { + "epoch": 11.873119154127695, + "grad_norm": 0.07227316702299133, + "learning_rate": 3.915333180913838e-07, + "loss": 0.0006, + "step": 29196 + }, + { + "epoch": 11.87352582350549, + "grad_norm": 0.001079143722095702, + "learning_rate": 3.912526325762045e-07, + "loss": 0.0, + "step": 29197 + }, + { + "epoch": 11.873932492883286, + "grad_norm": 0.011623897639766971, + "learning_rate": 3.909720456996524e-07, + "loss": 0.0001, + "step": 29198 + }, + { + "epoch": 11.874339162261082, + "grad_norm": 0.21164549717043382, + "learning_rate": 3.9069155746460863e-07, + "loss": 0.0011, + "step": 29199 + }, + { + "epoch": 11.874745831638878, + "grad_norm": 0.0021994439726881927, + "learning_rate": 3.9041116787395304e-07, + "loss": 0.0, + "step": 29200 + }, + { + "epoch": 11.875152501016673, + "grad_norm": 9.273491485828294e-05, + "learning_rate": 3.9013087693056227e-07, + "loss": 0.0, + "step": 29201 + }, + { + "epoch": 11.87555917039447, + "grad_norm": 0.01173418845543462, + "learning_rate": 3.898506846373151e-07, + "loss": 0.0001, + "step": 29202 + }, + { + "epoch": 11.875965839772265, + "grad_norm": 0.007087035259717159, + "learning_rate": 3.8957059099708815e-07, + "loss": 0.0, + "step": 29203 + }, + { + "epoch": 11.87637250915006, + "grad_norm": 0.0017345419591476856, + "learning_rate": 3.8929059601275463e-07, + "loss": 0.0, + "step": 29204 + }, + { + "epoch": 11.876779178527856, + "grad_norm": 0.011430388110849642, + 
"learning_rate": 3.8901069968719006e-07, + "loss": 0.0001, + "step": 29205 + }, + { + "epoch": 11.877185847905652, + "grad_norm": 0.8843612153891185, + "learning_rate": 3.887309020232688e-07, + "loss": 0.0073, + "step": 29206 + }, + { + "epoch": 11.877592517283448, + "grad_norm": 0.008483732114443834, + "learning_rate": 3.8845120302386186e-07, + "loss": 0.0001, + "step": 29207 + }, + { + "epoch": 11.877999186661244, + "grad_norm": 0.048013067739199874, + "learning_rate": 3.881716026918403e-07, + "loss": 0.0005, + "step": 29208 + }, + { + "epoch": 11.87840585603904, + "grad_norm": 0.0024326809474079325, + "learning_rate": 3.8789210103007513e-07, + "loss": 0.0, + "step": 29209 + }, + { + "epoch": 11.878812525416835, + "grad_norm": 0.059065918555867965, + "learning_rate": 3.876126980414352e-07, + "loss": 0.0007, + "step": 29210 + }, + { + "epoch": 11.879219194794633, + "grad_norm": 0.013131011220243039, + "learning_rate": 3.873333937287871e-07, + "loss": 0.0002, + "step": 29211 + }, + { + "epoch": 11.879625864172429, + "grad_norm": 0.007835764433641158, + "learning_rate": 3.8705418809500074e-07, + "loss": 0.0001, + "step": 29212 + }, + { + "epoch": 11.880032533550224, + "grad_norm": 0.0008361528962426424, + "learning_rate": 3.867750811429427e-07, + "loss": 0.0, + "step": 29213 + }, + { + "epoch": 11.88043920292802, + "grad_norm": 0.002659327728287133, + "learning_rate": 3.8649607287547517e-07, + "loss": 0.0, + "step": 29214 + }, + { + "epoch": 11.880845872305816, + "grad_norm": 0.0006824061191046956, + "learning_rate": 3.862171632954648e-07, + "loss": 0.0, + "step": 29215 + }, + { + "epoch": 11.881252541683612, + "grad_norm": 0.003061778647440865, + "learning_rate": 3.859383524057725e-07, + "loss": 0.0, + "step": 29216 + }, + { + "epoch": 11.881659211061407, + "grad_norm": 0.013635568838554797, + "learning_rate": 3.856596402092627e-07, + "loss": 0.0001, + "step": 29217 + }, + { + "epoch": 11.882065880439203, + "grad_norm": 0.008746631219691726, + "learning_rate": 3.853810267087943e-07, + "loss": 0.0001, + "step": 29218 + }, + { + "epoch": 11.882472549816999, + "grad_norm": 0.010039855644067319, + "learning_rate": 3.851025119072294e-07, + "loss": 0.0001, + "step": 29219 + }, + { + "epoch": 11.882879219194795, + "grad_norm": 7.721859547588233e-05, + "learning_rate": 3.8482409580742675e-07, + "loss": 0.0, + "step": 29220 + }, + { + "epoch": 11.88328588857259, + "grad_norm": 0.0013026032351999698, + "learning_rate": 3.845457784122442e-07, + "loss": 0.0, + "step": 29221 + }, + { + "epoch": 11.883692557950386, + "grad_norm": 0.006208754672626085, + "learning_rate": 3.8426755972453824e-07, + "loss": 0.0, + "step": 29222 + }, + { + "epoch": 11.884099227328182, + "grad_norm": 0.018564390829070177, + "learning_rate": 3.839894397471644e-07, + "loss": 0.0001, + "step": 29223 + }, + { + "epoch": 11.884505896705978, + "grad_norm": 0.01818211068297184, + "learning_rate": 3.8371141848297935e-07, + "loss": 0.0001, + "step": 29224 + }, + { + "epoch": 11.884912566083774, + "grad_norm": 0.020646490532468458, + "learning_rate": 3.834334959348351e-07, + "loss": 0.0002, + "step": 29225 + }, + { + "epoch": 11.88531923546157, + "grad_norm": 0.19863109950454605, + "learning_rate": 3.8315567210558845e-07, + "loss": 0.0018, + "step": 29226 + }, + { + "epoch": 11.885725904839365, + "grad_norm": 0.012629773558681398, + "learning_rate": 3.828779469980881e-07, + "loss": 0.0001, + "step": 29227 + }, + { + "epoch": 11.88613257421716, + "grad_norm": 0.052920281477776214, + "learning_rate": 3.8260032061518515e-07, + "loss": 
0.0004, + "step": 29228 + }, + { + "epoch": 11.886539243594957, + "grad_norm": 0.05797425964295445, + "learning_rate": 3.8232279295973064e-07, + "loss": 0.0005, + "step": 29229 + }, + { + "epoch": 11.886945912972752, + "grad_norm": 0.023343470908261485, + "learning_rate": 3.8204536403457115e-07, + "loss": 0.0001, + "step": 29230 + }, + { + "epoch": 11.887352582350548, + "grad_norm": 0.0027058848497373417, + "learning_rate": 3.817680338425589e-07, + "loss": 0.0, + "step": 29231 + }, + { + "epoch": 11.887759251728344, + "grad_norm": 0.001944941668245424, + "learning_rate": 3.8149080238653825e-07, + "loss": 0.0, + "step": 29232 + }, + { + "epoch": 11.888165921106141, + "grad_norm": 0.011839502586356397, + "learning_rate": 3.812136696693547e-07, + "loss": 0.0001, + "step": 29233 + }, + { + "epoch": 11.888572590483937, + "grad_norm": 0.04429129593867551, + "learning_rate": 3.8093663569385487e-07, + "loss": 0.0004, + "step": 29234 + }, + { + "epoch": 11.888979259861733, + "grad_norm": 0.00017519556915929254, + "learning_rate": 3.8065970046288095e-07, + "loss": 0.0, + "step": 29235 + }, + { + "epoch": 11.889385929239529, + "grad_norm": 0.03384224201129548, + "learning_rate": 3.8038286397927504e-07, + "loss": 0.0003, + "step": 29236 + }, + { + "epoch": 11.889792598617325, + "grad_norm": 0.0001828774024272079, + "learning_rate": 3.801061262458827e-07, + "loss": 0.0, + "step": 29237 + }, + { + "epoch": 11.89019926799512, + "grad_norm": 0.005417184139153871, + "learning_rate": 3.798294872655417e-07, + "loss": 0.0001, + "step": 29238 + }, + { + "epoch": 11.890605937372916, + "grad_norm": 0.02433581859189639, + "learning_rate": 3.7955294704109304e-07, + "loss": 0.0003, + "step": 29239 + }, + { + "epoch": 11.891012606750712, + "grad_norm": 0.00022574518244116739, + "learning_rate": 3.792765055753755e-07, + "loss": 0.0, + "step": 29240 + }, + { + "epoch": 11.891419276128508, + "grad_norm": 0.045636120810159275, + "learning_rate": 3.790001628712259e-07, + "loss": 0.0002, + "step": 29241 + }, + { + "epoch": 11.891825945506303, + "grad_norm": 0.012909167573461201, + "learning_rate": 3.7872391893148284e-07, + "loss": 0.0001, + "step": 29242 + }, + { + "epoch": 11.892232614884099, + "grad_norm": 6.0640530578854204e-05, + "learning_rate": 3.7844777375897866e-07, + "loss": 0.0, + "step": 29243 + }, + { + "epoch": 11.892639284261895, + "grad_norm": 0.5360656451321956, + "learning_rate": 3.781717273565533e-07, + "loss": 0.0052, + "step": 29244 + }, + { + "epoch": 11.89304595363969, + "grad_norm": 0.004831182542151985, + "learning_rate": 3.778957797270366e-07, + "loss": 0.0, + "step": 29245 + }, + { + "epoch": 11.893452623017486, + "grad_norm": 0.00039447996970108233, + "learning_rate": 3.77619930873262e-07, + "loss": 0.0, + "step": 29246 + }, + { + "epoch": 11.893859292395282, + "grad_norm": 0.00023617968547875198, + "learning_rate": 3.773441807980627e-07, + "loss": 0.0, + "step": 29247 + }, + { + "epoch": 11.894265961773078, + "grad_norm": 0.0004213501291950765, + "learning_rate": 3.7706852950426755e-07, + "loss": 0.0, + "step": 29248 + }, + { + "epoch": 11.894672631150874, + "grad_norm": 0.0007070347039204728, + "learning_rate": 3.767929769947065e-07, + "loss": 0.0, + "step": 29249 + }, + { + "epoch": 11.89507930052867, + "grad_norm": 0.02037671303993924, + "learning_rate": 3.765175232722096e-07, + "loss": 0.0002, + "step": 29250 + }, + { + "epoch": 11.895485969906465, + "grad_norm": 0.0014573759540553776, + "learning_rate": 3.7624216833960337e-07, + "loss": 0.0, + "step": 29251 + }, + { + "epoch": 
11.895892639284263, + "grad_norm": 0.02438608751004107, + "learning_rate": 3.759669121997156e-07, + "loss": 0.0002, + "step": 29252 + }, + { + "epoch": 11.896299308662059, + "grad_norm": 0.017701364760914554, + "learning_rate": 3.7569175485537067e-07, + "loss": 0.0001, + "step": 29253 + }, + { + "epoch": 11.896705978039854, + "grad_norm": 0.02418287534337957, + "learning_rate": 3.754166963093941e-07, + "loss": 0.0004, + "step": 29254 + }, + { + "epoch": 11.89711264741765, + "grad_norm": 0.0020467615973966805, + "learning_rate": 3.751417365646093e-07, + "loss": 0.0, + "step": 29255 + }, + { + "epoch": 11.897519316795446, + "grad_norm": 0.11620956793903948, + "learning_rate": 3.748668756238361e-07, + "loss": 0.0012, + "step": 29256 + }, + { + "epoch": 11.897925986173242, + "grad_norm": 0.008788917172863321, + "learning_rate": 3.745921134899011e-07, + "loss": 0.0001, + "step": 29257 + }, + { + "epoch": 11.898332655551037, + "grad_norm": 0.0109810472773919, + "learning_rate": 3.7431745016562215e-07, + "loss": 0.0001, + "step": 29258 + }, + { + "epoch": 11.898739324928833, + "grad_norm": 0.0003673177508832346, + "learning_rate": 3.740428856538192e-07, + "loss": 0.0, + "step": 29259 + }, + { + "epoch": 11.899145994306629, + "grad_norm": 0.059743865130826385, + "learning_rate": 3.7376841995730994e-07, + "loss": 0.0004, + "step": 29260 + }, + { + "epoch": 11.899552663684425, + "grad_norm": 0.0048142514854204125, + "learning_rate": 3.7349405307891327e-07, + "loss": 0.0, + "step": 29261 + }, + { + "epoch": 11.89995933306222, + "grad_norm": 0.007427570472686619, + "learning_rate": 3.732197850214436e-07, + "loss": 0.0001, + "step": 29262 + }, + { + "epoch": 11.900366002440016, + "grad_norm": 0.003235368843371008, + "learning_rate": 3.729456157877198e-07, + "loss": 0.0, + "step": 29263 + }, + { + "epoch": 11.900772671817812, + "grad_norm": 0.04759603281129532, + "learning_rate": 3.7267154538055407e-07, + "loss": 0.0003, + "step": 29264 + }, + { + "epoch": 11.901179341195608, + "grad_norm": 0.0007020345314728246, + "learning_rate": 3.7239757380275966e-07, + "loss": 0.0, + "step": 29265 + }, + { + "epoch": 11.901586010573403, + "grad_norm": 0.0042039466546629385, + "learning_rate": 3.721237010571499e-07, + "loss": 0.0, + "step": 29266 + }, + { + "epoch": 11.9019926799512, + "grad_norm": 0.0014850914104071477, + "learning_rate": 3.71849927146537e-07, + "loss": 0.0, + "step": 29267 + }, + { + "epoch": 11.902399349328995, + "grad_norm": 0.03706311978575068, + "learning_rate": 3.715762520737287e-07, + "loss": 0.0004, + "step": 29268 + }, + { + "epoch": 11.90280601870679, + "grad_norm": 0.0008581925606041902, + "learning_rate": 3.7130267584153614e-07, + "loss": 0.0, + "step": 29269 + }, + { + "epoch": 11.903212688084587, + "grad_norm": 0.009708298696085466, + "learning_rate": 3.710291984527681e-07, + "loss": 0.0001, + "step": 29270 + }, + { + "epoch": 11.903619357462382, + "grad_norm": 3.7312560413465835e-05, + "learning_rate": 3.7075581991023123e-07, + "loss": 0.0, + "step": 29271 + }, + { + "epoch": 11.904026026840178, + "grad_norm": 0.011528640736145582, + "learning_rate": 3.7048254021673336e-07, + "loss": 0.0001, + "step": 29272 + }, + { + "epoch": 11.904432696217976, + "grad_norm": 0.0005730998715168602, + "learning_rate": 3.702093593750777e-07, + "loss": 0.0, + "step": 29273 + }, + { + "epoch": 11.904839365595771, + "grad_norm": 0.018933501136441013, + "learning_rate": 3.6993627738806993e-07, + "loss": 0.0001, + "step": 29274 + }, + { + "epoch": 11.905246034973567, + "grad_norm": 0.010729643533043488, + 
"learning_rate": 3.696632942585121e-07, + "loss": 0.0001, + "step": 29275 + }, + { + "epoch": 11.905652704351363, + "grad_norm": 0.2976663316512117, + "learning_rate": 3.693904099892076e-07, + "loss": 0.0024, + "step": 29276 + }, + { + "epoch": 11.906059373729159, + "grad_norm": 0.00013754164360504803, + "learning_rate": 3.6911762458295864e-07, + "loss": 0.0, + "step": 29277 + }, + { + "epoch": 11.906466043106954, + "grad_norm": 0.00023772652372104042, + "learning_rate": 3.6884493804256296e-07, + "loss": 0.0, + "step": 29278 + }, + { + "epoch": 11.90687271248475, + "grad_norm": 0.008056964436474589, + "learning_rate": 3.6857235037082274e-07, + "loss": 0.0001, + "step": 29279 + }, + { + "epoch": 11.907279381862546, + "grad_norm": 0.0021359279560450037, + "learning_rate": 3.6829986157053355e-07, + "loss": 0.0, + "step": 29280 + }, + { + "epoch": 11.907686051240342, + "grad_norm": 0.0004126562950652788, + "learning_rate": 3.6802747164449427e-07, + "loss": 0.0, + "step": 29281 + }, + { + "epoch": 11.908092720618138, + "grad_norm": 0.012130100180140373, + "learning_rate": 3.677551805955004e-07, + "loss": 0.0001, + "step": 29282 + }, + { + "epoch": 11.908499389995933, + "grad_norm": 0.001648100005197306, + "learning_rate": 3.6748298842634757e-07, + "loss": 0.0, + "step": 29283 + }, + { + "epoch": 11.908906059373729, + "grad_norm": 0.12403008938110446, + "learning_rate": 3.672108951398301e-07, + "loss": 0.0008, + "step": 29284 + }, + { + "epoch": 11.909312728751525, + "grad_norm": 0.004073193764860843, + "learning_rate": 3.6693890073874027e-07, + "loss": 0.0, + "step": 29285 + }, + { + "epoch": 11.90971939812932, + "grad_norm": 0.003030730964052136, + "learning_rate": 3.666670052258714e-07, + "loss": 0.0, + "step": 29286 + }, + { + "epoch": 11.910126067507116, + "grad_norm": 0.03982113227079265, + "learning_rate": 3.6639520860401457e-07, + "loss": 0.0002, + "step": 29287 + }, + { + "epoch": 11.910532736884912, + "grad_norm": 0.0019918352762331006, + "learning_rate": 3.6612351087595756e-07, + "loss": 0.0, + "step": 29288 + }, + { + "epoch": 11.910939406262708, + "grad_norm": 0.002243486557054377, + "learning_rate": 3.6585191204449253e-07, + "loss": 0.0, + "step": 29289 + }, + { + "epoch": 11.911346075640504, + "grad_norm": 0.002107405874878705, + "learning_rate": 3.6558041211240623e-07, + "loss": 0.0, + "step": 29290 + }, + { + "epoch": 11.9117527450183, + "grad_norm": 0.0032200174140965977, + "learning_rate": 3.653090110824864e-07, + "loss": 0.0, + "step": 29291 + }, + { + "epoch": 11.912159414396095, + "grad_norm": 0.00400136581195983, + "learning_rate": 3.6503770895751854e-07, + "loss": 0.0, + "step": 29292 + }, + { + "epoch": 11.912566083773893, + "grad_norm": 0.0320101265578048, + "learning_rate": 3.6476650574028713e-07, + "loss": 0.0004, + "step": 29293 + }, + { + "epoch": 11.912972753151688, + "grad_norm": 0.0025988042346535194, + "learning_rate": 3.6449540143357775e-07, + "loss": 0.0, + "step": 29294 + }, + { + "epoch": 11.913379422529484, + "grad_norm": 0.045491254783752384, + "learning_rate": 3.6422439604017035e-07, + "loss": 0.0002, + "step": 29295 + }, + { + "epoch": 11.91378609190728, + "grad_norm": 0.0010461960724005908, + "learning_rate": 3.6395348956285157e-07, + "loss": 0.0, + "step": 29296 + }, + { + "epoch": 11.914192761285076, + "grad_norm": 0.003706795177936635, + "learning_rate": 3.6368268200439925e-07, + "loss": 0.0, + "step": 29297 + }, + { + "epoch": 11.914599430662872, + "grad_norm": 0.04240123420937571, + "learning_rate": 3.6341197336759335e-07, + "loss": 0.0004, + 
"step": 29298 + }, + { + "epoch": 11.915006100040667, + "grad_norm": 0.0019574147335446154, + "learning_rate": 3.63141363655215e-07, + "loss": 0.0, + "step": 29299 + }, + { + "epoch": 11.915412769418463, + "grad_norm": 0.0019162740098919693, + "learning_rate": 3.628708528700398e-07, + "loss": 0.0, + "step": 29300 + }, + { + "epoch": 11.915819438796259, + "grad_norm": 0.1171464479490535, + "learning_rate": 3.6260044101484424e-07, + "loss": 0.0013, + "step": 29301 + }, + { + "epoch": 11.916226108174055, + "grad_norm": 0.0005992947489040986, + "learning_rate": 3.623301280924074e-07, + "loss": 0.0, + "step": 29302 + }, + { + "epoch": 11.91663277755185, + "grad_norm": 0.10089894812568415, + "learning_rate": 3.6205991410550145e-07, + "loss": 0.0011, + "step": 29303 + }, + { + "epoch": 11.917039446929646, + "grad_norm": 0.0002136700482640327, + "learning_rate": 3.6178979905690195e-07, + "loss": 0.0, + "step": 29304 + }, + { + "epoch": 11.917446116307442, + "grad_norm": 0.006851681133036467, + "learning_rate": 3.615197829493811e-07, + "loss": 0.0001, + "step": 29305 + }, + { + "epoch": 11.917852785685238, + "grad_norm": 0.0033297789710668856, + "learning_rate": 3.6124986578570995e-07, + "loss": 0.0, + "step": 29306 + }, + { + "epoch": 11.918259455063033, + "grad_norm": 0.05479299293946303, + "learning_rate": 3.609800475686598e-07, + "loss": 0.0005, + "step": 29307 + }, + { + "epoch": 11.91866612444083, + "grad_norm": 0.07833635383186244, + "learning_rate": 3.607103283010005e-07, + "loss": 0.0005, + "step": 29308 + }, + { + "epoch": 11.919072793818625, + "grad_norm": 0.010619209742689345, + "learning_rate": 3.60440707985501e-07, + "loss": 0.0001, + "step": 29309 + }, + { + "epoch": 11.91947946319642, + "grad_norm": 0.0003336393868576693, + "learning_rate": 3.601711866249302e-07, + "loss": 0.0, + "step": 29310 + }, + { + "epoch": 11.919886132574216, + "grad_norm": 0.004313982026734883, + "learning_rate": 3.599017642220537e-07, + "loss": 0.0, + "step": 29311 + }, + { + "epoch": 11.920292801952012, + "grad_norm": 0.01690103733571729, + "learning_rate": 3.59632440779637e-07, + "loss": 0.0001, + "step": 29312 + }, + { + "epoch": 11.920699471329808, + "grad_norm": 0.00033343954466122887, + "learning_rate": 3.593632163004446e-07, + "loss": 0.0, + "step": 29313 + }, + { + "epoch": 11.921106140707606, + "grad_norm": 0.013395761213362998, + "learning_rate": 3.5909409078724087e-07, + "loss": 0.0001, + "step": 29314 + }, + { + "epoch": 11.921512810085401, + "grad_norm": 0.011174398541420487, + "learning_rate": 3.5882506424278816e-07, + "loss": 0.0001, + "step": 29315 + }, + { + "epoch": 11.921919479463197, + "grad_norm": 0.0006102601831037771, + "learning_rate": 3.5855613666984976e-07, + "loss": 0.0, + "step": 29316 + }, + { + "epoch": 11.922326148840993, + "grad_norm": 0.0016096684496732596, + "learning_rate": 3.5828730807118464e-07, + "loss": 0.0, + "step": 29317 + }, + { + "epoch": 11.922732818218789, + "grad_norm": 0.0019470118912772374, + "learning_rate": 3.580185784495527e-07, + "loss": 0.0, + "step": 29318 + }, + { + "epoch": 11.923139487596584, + "grad_norm": 6.569767753488896e-05, + "learning_rate": 3.577499478077129e-07, + "loss": 0.0, + "step": 29319 + }, + { + "epoch": 11.92354615697438, + "grad_norm": 0.0167412955282291, + "learning_rate": 3.5748141614842304e-07, + "loss": 0.0001, + "step": 29320 + }, + { + "epoch": 11.923952826352176, + "grad_norm": 0.028858980132536387, + "learning_rate": 3.572129834744376e-07, + "loss": 0.0002, + "step": 29321 + }, + { + "epoch": 11.924359495729972, + 
"grad_norm": 0.00839032827530714, + "learning_rate": 3.569446497885154e-07, + "loss": 0.0, + "step": 29322 + }, + { + "epoch": 11.924766165107767, + "grad_norm": 0.003118907630475713, + "learning_rate": 3.566764150934099e-07, + "loss": 0.0, + "step": 29323 + }, + { + "epoch": 11.925172834485563, + "grad_norm": 0.0059106150052424, + "learning_rate": 3.564082793918744e-07, + "loss": 0.0, + "step": 29324 + }, + { + "epoch": 11.925579503863359, + "grad_norm": 0.004260088462859559, + "learning_rate": 3.5614024268666223e-07, + "loss": 0.0, + "step": 29325 + }, + { + "epoch": 11.925986173241155, + "grad_norm": 0.00015940404648826808, + "learning_rate": 3.558723049805224e-07, + "loss": 0.0, + "step": 29326 + }, + { + "epoch": 11.92639284261895, + "grad_norm": 0.0001626978768432532, + "learning_rate": 3.5560446627620816e-07, + "loss": 0.0, + "step": 29327 + }, + { + "epoch": 11.926799511996746, + "grad_norm": 0.0006935974948401206, + "learning_rate": 3.553367265764673e-07, + "loss": 0.0, + "step": 29328 + }, + { + "epoch": 11.927206181374542, + "grad_norm": 0.0011206277690505272, + "learning_rate": 3.5506908588404996e-07, + "loss": 0.0, + "step": 29329 + }, + { + "epoch": 11.927612850752338, + "grad_norm": 0.00036281191463046755, + "learning_rate": 3.5480154420170164e-07, + "loss": 0.0, + "step": 29330 + }, + { + "epoch": 11.928019520130134, + "grad_norm": 0.01280307919744331, + "learning_rate": 3.5453410153217014e-07, + "loss": 0.0001, + "step": 29331 + }, + { + "epoch": 11.92842618950793, + "grad_norm": 0.009466968192522247, + "learning_rate": 3.542667578782011e-07, + "loss": 0.0001, + "step": 29332 + }, + { + "epoch": 11.928832858885725, + "grad_norm": 0.002661850040143581, + "learning_rate": 3.5399951324253666e-07, + "loss": 0.0, + "step": 29333 + }, + { + "epoch": 11.929239528263523, + "grad_norm": 0.00012335111782758062, + "learning_rate": 3.537323676279236e-07, + "loss": 0.0, + "step": 29334 + }, + { + "epoch": 11.929646197641318, + "grad_norm": 0.004415246789252848, + "learning_rate": 3.5346532103710197e-07, + "loss": 0.0, + "step": 29335 + }, + { + "epoch": 11.930052867019114, + "grad_norm": 0.032997001291701585, + "learning_rate": 3.5319837347281394e-07, + "loss": 0.0004, + "step": 29336 + }, + { + "epoch": 11.93045953639691, + "grad_norm": 0.0006194448634781141, + "learning_rate": 3.5293152493779956e-07, + "loss": 0.0, + "step": 29337 + }, + { + "epoch": 11.930866205774706, + "grad_norm": 0.050882303348759664, + "learning_rate": 3.5266477543479895e-07, + "loss": 0.0003, + "step": 29338 + }, + { + "epoch": 11.931272875152501, + "grad_norm": 0.006592828341573319, + "learning_rate": 3.523981249665487e-07, + "loss": 0.0001, + "step": 29339 + }, + { + "epoch": 11.931679544530297, + "grad_norm": 0.003859568153505569, + "learning_rate": 3.5213157353578773e-07, + "loss": 0.0, + "step": 29340 + }, + { + "epoch": 11.932086213908093, + "grad_norm": 0.019533783120810697, + "learning_rate": 3.5186512114525283e-07, + "loss": 0.0002, + "step": 29341 + }, + { + "epoch": 11.932492883285889, + "grad_norm": 0.016573322549894468, + "learning_rate": 3.515987677976773e-07, + "loss": 0.0001, + "step": 29342 + }, + { + "epoch": 11.932899552663685, + "grad_norm": 0.024223264546425413, + "learning_rate": 3.513325134957968e-07, + "loss": 0.0002, + "step": 29343 + }, + { + "epoch": 11.93330622204148, + "grad_norm": 0.0014963278300898256, + "learning_rate": 3.510663582423446e-07, + "loss": 0.0, + "step": 29344 + }, + { + "epoch": 11.933712891419276, + "grad_norm": 0.05655702188647013, + "learning_rate": 
3.508003020400519e-07, + "loss": 0.0007, + "step": 29345 + }, + { + "epoch": 11.934119560797072, + "grad_norm": 8.632515501212977e-05, + "learning_rate": 3.5053434489164875e-07, + "loss": 0.0, + "step": 29346 + }, + { + "epoch": 11.934526230174868, + "grad_norm": 0.013339754000069324, + "learning_rate": 3.5026848679986956e-07, + "loss": 0.0001, + "step": 29347 + }, + { + "epoch": 11.934932899552663, + "grad_norm": 0.35607179408475625, + "learning_rate": 3.5000272776744004e-07, + "loss": 0.0031, + "step": 29348 + }, + { + "epoch": 11.935339568930459, + "grad_norm": 0.010107374514492874, + "learning_rate": 3.4973706779708903e-07, + "loss": 0.0001, + "step": 29349 + }, + { + "epoch": 11.935746238308255, + "grad_norm": 0.0012325116453103125, + "learning_rate": 3.494715068915444e-07, + "loss": 0.0, + "step": 29350 + }, + { + "epoch": 11.93615290768605, + "grad_norm": 0.00455622370388991, + "learning_rate": 3.4920604505353173e-07, + "loss": 0.0, + "step": 29351 + }, + { + "epoch": 11.936559577063846, + "grad_norm": 0.00011193245290702209, + "learning_rate": 3.489406822857744e-07, + "loss": 0.0, + "step": 29352 + }, + { + "epoch": 11.936966246441642, + "grad_norm": 0.04814525701504995, + "learning_rate": 3.4867541859100016e-07, + "loss": 0.0005, + "step": 29353 + }, + { + "epoch": 11.937372915819438, + "grad_norm": 0.0006094677762734109, + "learning_rate": 3.4841025397192917e-07, + "loss": 0.0, + "step": 29354 + }, + { + "epoch": 11.937779585197235, + "grad_norm": 0.0015877506165157896, + "learning_rate": 3.4814518843128585e-07, + "loss": 0.0, + "step": 29355 + }, + { + "epoch": 11.938186254575031, + "grad_norm": 0.09517492990726704, + "learning_rate": 3.4788022197178917e-07, + "loss": 0.0005, + "step": 29356 + }, + { + "epoch": 11.938592923952827, + "grad_norm": 0.01382174975677108, + "learning_rate": 3.476153545961591e-07, + "loss": 0.0002, + "step": 29357 + }, + { + "epoch": 11.938999593330623, + "grad_norm": 0.009462939140247375, + "learning_rate": 3.4735058630711583e-07, + "loss": 0.0001, + "step": 29358 + }, + { + "epoch": 11.939406262708419, + "grad_norm": 0.010175019968376062, + "learning_rate": 3.470859171073759e-07, + "loss": 0.0001, + "step": 29359 + }, + { + "epoch": 11.939812932086214, + "grad_norm": 0.010695669220302535, + "learning_rate": 3.4682134699965843e-07, + "loss": 0.0, + "step": 29360 + }, + { + "epoch": 11.94021960146401, + "grad_norm": 0.00013606253960814988, + "learning_rate": 3.4655687598667887e-07, + "loss": 0.0, + "step": 29361 + }, + { + "epoch": 11.940626270841806, + "grad_norm": 0.0007708146507230728, + "learning_rate": 3.4629250407115065e-07, + "loss": 0.0, + "step": 29362 + }, + { + "epoch": 11.941032940219602, + "grad_norm": 0.002324913443601361, + "learning_rate": 3.460282312557883e-07, + "loss": 0.0, + "step": 29363 + }, + { + "epoch": 11.941439609597397, + "grad_norm": 0.000981363214925641, + "learning_rate": 3.4576405754330524e-07, + "loss": 0.0, + "step": 29364 + }, + { + "epoch": 11.941846278975193, + "grad_norm": 0.05310710646941482, + "learning_rate": 3.454999829364114e-07, + "loss": 0.0005, + "step": 29365 + }, + { + "epoch": 11.942252948352989, + "grad_norm": 0.0012935130930433892, + "learning_rate": 3.452360074378214e-07, + "loss": 0.0, + "step": 29366 + }, + { + "epoch": 11.942659617730785, + "grad_norm": 0.0009390180942683352, + "learning_rate": 3.4497213105024186e-07, + "loss": 0.0, + "step": 29367 + }, + { + "epoch": 11.94306628710858, + "grad_norm": 2.8730082730263132e-05, + "learning_rate": 3.44708353776384e-07, + "loss": 0.0, + "step": 29368 + 
}, + { + "epoch": 11.943472956486376, + "grad_norm": 0.0038777646289407737, + "learning_rate": 3.4444467561895344e-07, + "loss": 0.0, + "step": 29369 + }, + { + "epoch": 11.943879625864172, + "grad_norm": 0.04526239416794106, + "learning_rate": 3.441810965806569e-07, + "loss": 0.0004, + "step": 29370 + }, + { + "epoch": 11.944286295241968, + "grad_norm": 0.02016014111849081, + "learning_rate": 3.439176166642022e-07, + "loss": 0.0002, + "step": 29371 + }, + { + "epoch": 11.944692964619763, + "grad_norm": 0.035437509254243806, + "learning_rate": 3.436542358722916e-07, + "loss": 0.0002, + "step": 29372 + }, + { + "epoch": 11.94509963399756, + "grad_norm": 0.01841371283876186, + "learning_rate": 3.433909542076319e-07, + "loss": 0.0001, + "step": 29373 + }, + { + "epoch": 11.945506303375355, + "grad_norm": 0.008522571810859816, + "learning_rate": 3.431277716729231e-07, + "loss": 0.0001, + "step": 29374 + }, + { + "epoch": 11.945912972753153, + "grad_norm": 0.0012067521130665253, + "learning_rate": 3.428646882708686e-07, + "loss": 0.0, + "step": 29375 + }, + { + "epoch": 11.946319642130948, + "grad_norm": 0.003149629161157993, + "learning_rate": 3.4260170400416846e-07, + "loss": 0.0, + "step": 29376 + }, + { + "epoch": 11.946726311508744, + "grad_norm": 1.227539574995518, + "learning_rate": 3.4233881887552166e-07, + "loss": 0.0121, + "step": 29377 + }, + { + "epoch": 11.94713298088654, + "grad_norm": 0.0832364876698237, + "learning_rate": 3.420760328876271e-07, + "loss": 0.0003, + "step": 29378 + }, + { + "epoch": 11.947539650264336, + "grad_norm": 0.0016380189377591203, + "learning_rate": 3.4181334604318383e-07, + "loss": 0.0, + "step": 29379 + }, + { + "epoch": 11.947946319642131, + "grad_norm": 0.08938410032691421, + "learning_rate": 3.415507583448863e-07, + "loss": 0.0005, + "step": 29380 + }, + { + "epoch": 11.948352989019927, + "grad_norm": 0.009909240748420096, + "learning_rate": 3.4128826979543226e-07, + "loss": 0.0001, + "step": 29381 + }, + { + "epoch": 11.948759658397723, + "grad_norm": 0.0031622500506882714, + "learning_rate": 3.410258803975153e-07, + "loss": 0.0, + "step": 29382 + }, + { + "epoch": 11.949166327775519, + "grad_norm": 0.025137140212692415, + "learning_rate": 3.4076359015382864e-07, + "loss": 0.0002, + "step": 29383 + }, + { + "epoch": 11.949572997153314, + "grad_norm": 0.003489144867715434, + "learning_rate": 3.405013990670658e-07, + "loss": 0.0, + "step": 29384 + }, + { + "epoch": 11.94997966653111, + "grad_norm": 0.111326214420276, + "learning_rate": 3.402393071399157e-07, + "loss": 0.0005, + "step": 29385 + }, + { + "epoch": 11.950386335908906, + "grad_norm": 0.0061437192255666155, + "learning_rate": 3.3997731437507175e-07, + "loss": 0.0, + "step": 29386 + }, + { + "epoch": 11.950793005286702, + "grad_norm": 0.0033809103694986686, + "learning_rate": 3.397154207752229e-07, + "loss": 0.0, + "step": 29387 + }, + { + "epoch": 11.951199674664498, + "grad_norm": 0.00037135956718945364, + "learning_rate": 3.39453626343057e-07, + "loss": 0.0, + "step": 29388 + }, + { + "epoch": 11.951606344042293, + "grad_norm": 0.0025281798187729047, + "learning_rate": 3.3919193108126193e-07, + "loss": 0.0, + "step": 29389 + }, + { + "epoch": 11.952013013420089, + "grad_norm": 0.020669995151571936, + "learning_rate": 3.3893033499252327e-07, + "loss": 0.0002, + "step": 29390 + }, + { + "epoch": 11.952419682797885, + "grad_norm": 0.029198947693614002, + "learning_rate": 3.386688380795267e-07, + "loss": 0.0002, + "step": 29391 + }, + { + "epoch": 11.95282635217568, + "grad_norm": 
0.0020796421916424177, + "learning_rate": 3.3840744034495777e-07, + "loss": 0.0, + "step": 29392 + }, + { + "epoch": 11.953233021553476, + "grad_norm": 0.032120916545656644, + "learning_rate": 3.3814614179149884e-07, + "loss": 0.0003, + "step": 29393 + }, + { + "epoch": 11.953639690931272, + "grad_norm": 0.001648504613802972, + "learning_rate": 3.3788494242183224e-07, + "loss": 0.0, + "step": 29394 + }, + { + "epoch": 11.954046360309068, + "grad_norm": 0.004339748503220754, + "learning_rate": 3.376238422386402e-07, + "loss": 0.0001, + "step": 29395 + }, + { + "epoch": 11.954453029686865, + "grad_norm": 0.01190098159311429, + "learning_rate": 3.373628412446028e-07, + "loss": 0.0001, + "step": 29396 + }, + { + "epoch": 11.954859699064661, + "grad_norm": 0.02708568151288657, + "learning_rate": 3.37101939442398e-07, + "loss": 0.0002, + "step": 29397 + }, + { + "epoch": 11.955266368442457, + "grad_norm": 0.0008289066216179107, + "learning_rate": 3.368411368347035e-07, + "loss": 0.0, + "step": 29398 + }, + { + "epoch": 11.955673037820253, + "grad_norm": 0.003837741392127221, + "learning_rate": 3.3658043342420066e-07, + "loss": 0.0, + "step": 29399 + }, + { + "epoch": 11.956079707198048, + "grad_norm": 0.03569093804891527, + "learning_rate": 3.363198292135617e-07, + "loss": 0.0003, + "step": 29400 + }, + { + "epoch": 11.956486376575844, + "grad_norm": 0.004758300454790725, + "learning_rate": 3.360593242054644e-07, + "loss": 0.0, + "step": 29401 + }, + { + "epoch": 11.95689304595364, + "grad_norm": 0.008987933203953753, + "learning_rate": 3.357989184025812e-07, + "loss": 0.0001, + "step": 29402 + }, + { + "epoch": 11.957299715331436, + "grad_norm": 0.10014413507403501, + "learning_rate": 3.355386118075865e-07, + "loss": 0.0008, + "step": 29403 + }, + { + "epoch": 11.957706384709232, + "grad_norm": 0.012358679290609098, + "learning_rate": 3.352784044231505e-07, + "loss": 0.0001, + "step": 29404 + }, + { + "epoch": 11.958113054087027, + "grad_norm": 0.0010502350573694725, + "learning_rate": 3.3501829625194773e-07, + "loss": 0.0, + "step": 29405 + }, + { + "epoch": 11.958519723464823, + "grad_norm": 0.05656638661785229, + "learning_rate": 3.3475828729664597e-07, + "loss": 0.0004, + "step": 29406 + }, + { + "epoch": 11.958926392842619, + "grad_norm": 0.09290290160758968, + "learning_rate": 3.344983775599142e-07, + "loss": 0.0008, + "step": 29407 + }, + { + "epoch": 11.959333062220415, + "grad_norm": 0.3132856223016041, + "learning_rate": 3.342385670444215e-07, + "loss": 0.0029, + "step": 29408 + }, + { + "epoch": 11.95973973159821, + "grad_norm": 0.010692034547195781, + "learning_rate": 3.339788557528345e-07, + "loss": 0.0001, + "step": 29409 + }, + { + "epoch": 11.960146400976006, + "grad_norm": 0.0008344276383724627, + "learning_rate": 3.3371924368782004e-07, + "loss": 0.0, + "step": 29410 + }, + { + "epoch": 11.960553070353802, + "grad_norm": 0.033455994659580164, + "learning_rate": 3.3345973085204044e-07, + "loss": 0.0002, + "step": 29411 + }, + { + "epoch": 11.960959739731598, + "grad_norm": 0.0009417097703650554, + "learning_rate": 3.332003172481635e-07, + "loss": 0.0, + "step": 29412 + }, + { + "epoch": 11.961366409109393, + "grad_norm": 0.0008239250669681281, + "learning_rate": 3.3294100287885047e-07, + "loss": 0.0, + "step": 29413 + }, + { + "epoch": 11.96177307848719, + "grad_norm": 0.034537551159641716, + "learning_rate": 3.326817877467625e-07, + "loss": 0.0003, + "step": 29414 + }, + { + "epoch": 11.962179747864985, + "grad_norm": 0.0010943127702556347, + "learning_rate": 
3.3242267185456313e-07, + "loss": 0.0, + "step": 29415 + }, + { + "epoch": 11.962586417242782, + "grad_norm": 0.0011917086467374111, + "learning_rate": 3.32163655204909e-07, + "loss": 0.0, + "step": 29416 + }, + { + "epoch": 11.962993086620578, + "grad_norm": 8.398873714613807e-05, + "learning_rate": 3.319047378004603e-07, + "loss": 0.0, + "step": 29417 + }, + { + "epoch": 11.963399755998374, + "grad_norm": 0.0017762833281165602, + "learning_rate": 3.3164591964387596e-07, + "loss": 0.0, + "step": 29418 + }, + { + "epoch": 11.96380642537617, + "grad_norm": 0.0046062602482035055, + "learning_rate": 3.313872007378127e-07, + "loss": 0.0, + "step": 29419 + }, + { + "epoch": 11.964213094753966, + "grad_norm": 0.003709666118017177, + "learning_rate": 3.311285810849263e-07, + "loss": 0.0, + "step": 29420 + }, + { + "epoch": 11.964619764131761, + "grad_norm": 0.00017917467447681648, + "learning_rate": 3.3087006068787117e-07, + "loss": 0.0, + "step": 29421 + }, + { + "epoch": 11.965026433509557, + "grad_norm": 0.004061767160642756, + "learning_rate": 3.3061163954930085e-07, + "loss": 0.0, + "step": 29422 + }, + { + "epoch": 11.965433102887353, + "grad_norm": 0.026284521692917635, + "learning_rate": 3.303533176718676e-07, + "loss": 0.0002, + "step": 29423 + }, + { + "epoch": 11.965839772265149, + "grad_norm": 0.05387491100828379, + "learning_rate": 3.300950950582249e-07, + "loss": 0.0005, + "step": 29424 + }, + { + "epoch": 11.966246441642944, + "grad_norm": 0.00257552287854622, + "learning_rate": 3.2983697171102393e-07, + "loss": 0.0, + "step": 29425 + }, + { + "epoch": 11.96665311102074, + "grad_norm": 0.05012739545739591, + "learning_rate": 3.2957894763291256e-07, + "loss": 0.0003, + "step": 29426 + }, + { + "epoch": 11.967059780398536, + "grad_norm": 0.00011456203113193686, + "learning_rate": 3.2932102282653977e-07, + "loss": 0.0, + "step": 29427 + }, + { + "epoch": 11.967466449776332, + "grad_norm": 2.736801346406499, + "learning_rate": 3.290631972945546e-07, + "loss": 0.0301, + "step": 29428 + }, + { + "epoch": 11.967873119154127, + "grad_norm": 0.0004251790322264701, + "learning_rate": 3.2880547103960157e-07, + "loss": 0.0, + "step": 29429 + }, + { + "epoch": 11.968279788531923, + "grad_norm": 0.03128575251950449, + "learning_rate": 3.285478440643286e-07, + "loss": 0.0004, + "step": 29430 + }, + { + "epoch": 11.968686457909719, + "grad_norm": 0.005259552248465712, + "learning_rate": 3.282903163713802e-07, + "loss": 0.0, + "step": 29431 + }, + { + "epoch": 11.969093127287515, + "grad_norm": 0.023395591666115573, + "learning_rate": 3.280328879633987e-07, + "loss": 0.0002, + "step": 29432 + }, + { + "epoch": 11.96949979666531, + "grad_norm": 1.12125436050662, + "learning_rate": 3.2777555884302756e-07, + "loss": 0.0122, + "step": 29433 + }, + { + "epoch": 11.969906466043106, + "grad_norm": 0.0005232863283865608, + "learning_rate": 3.27518329012908e-07, + "loss": 0.0, + "step": 29434 + }, + { + "epoch": 11.970313135420902, + "grad_norm": 0.017751113340926463, + "learning_rate": 3.272611984756813e-07, + "loss": 0.0001, + "step": 29435 + }, + { + "epoch": 11.970719804798698, + "grad_norm": 0.03668419717087807, + "learning_rate": 3.270041672339863e-07, + "loss": 0.0004, + "step": 29436 + }, + { + "epoch": 11.971126474176495, + "grad_norm": 0.05323138265940529, + "learning_rate": 3.2674723529046217e-07, + "loss": 0.0004, + "step": 29437 + }, + { + "epoch": 11.971533143554291, + "grad_norm": 0.0003767574061877951, + "learning_rate": 3.2649040264774556e-07, + "loss": 0.0, + "step": 29438 + }, + { + 
"epoch": 11.971939812932087, + "grad_norm": 0.0016605515254860105, + "learning_rate": 3.2623366930847447e-07, + "loss": 0.0, + "step": 29439 + }, + { + "epoch": 11.972346482309883, + "grad_norm": 0.016245242042992422, + "learning_rate": 3.259770352752822e-07, + "loss": 0.0001, + "step": 29440 + }, + { + "epoch": 11.972753151687678, + "grad_norm": 0.0007174442946343112, + "learning_rate": 3.257205005508057e-07, + "loss": 0.0, + "step": 29441 + }, + { + "epoch": 11.973159821065474, + "grad_norm": 0.3009155673790346, + "learning_rate": 3.25464065137675e-07, + "loss": 0.0022, + "step": 29442 + }, + { + "epoch": 11.97356649044327, + "grad_norm": 0.01781862255743606, + "learning_rate": 3.2520772903852693e-07, + "loss": 0.0001, + "step": 29443 + }, + { + "epoch": 11.973973159821066, + "grad_norm": 0.008773656956757931, + "learning_rate": 3.249514922559904e-07, + "loss": 0.0001, + "step": 29444 + }, + { + "epoch": 11.974379829198861, + "grad_norm": 0.04866778340597, + "learning_rate": 3.2469535479269676e-07, + "loss": 0.0004, + "step": 29445 + }, + { + "epoch": 11.974786498576657, + "grad_norm": 0.0018182442051944633, + "learning_rate": 3.244393166512738e-07, + "loss": 0.0, + "step": 29446 + }, + { + "epoch": 11.975193167954453, + "grad_norm": 0.00012674122438415252, + "learning_rate": 3.241833778343517e-07, + "loss": 0.0, + "step": 29447 + }, + { + "epoch": 11.975599837332249, + "grad_norm": 0.024083243000844214, + "learning_rate": 3.2392753834455615e-07, + "loss": 0.0001, + "step": 29448 + }, + { + "epoch": 11.976006506710045, + "grad_norm": 0.007269440406367221, + "learning_rate": 3.236717981845139e-07, + "loss": 0.0001, + "step": 29449 + }, + { + "epoch": 11.97641317608784, + "grad_norm": 0.0026161728996593235, + "learning_rate": 3.234161573568506e-07, + "loss": 0.0, + "step": 29450 + }, + { + "epoch": 11.976819845465636, + "grad_norm": 0.002540391358239755, + "learning_rate": 3.2316061586419203e-07, + "loss": 0.0, + "step": 29451 + }, + { + "epoch": 11.977226514843432, + "grad_norm": 0.005159484208528078, + "learning_rate": 3.229051737091593e-07, + "loss": 0.0, + "step": 29452 + }, + { + "epoch": 11.977633184221228, + "grad_norm": 0.0055125540237785115, + "learning_rate": 3.2264983089437485e-07, + "loss": 0.0, + "step": 29453 + }, + { + "epoch": 11.978039853599023, + "grad_norm": 0.020665083168890656, + "learning_rate": 3.22394587422461e-07, + "loss": 0.0001, + "step": 29454 + }, + { + "epoch": 11.978446522976819, + "grad_norm": 0.038716187534921336, + "learning_rate": 3.221394432960356e-07, + "loss": 0.0003, + "step": 29455 + }, + { + "epoch": 11.978853192354615, + "grad_norm": 0.0028267190796523465, + "learning_rate": 3.2188439851772114e-07, + "loss": 0.0, + "step": 29456 + }, + { + "epoch": 11.979259861732412, + "grad_norm": 0.0122972806460756, + "learning_rate": 3.216294530901343e-07, + "loss": 0.0001, + "step": 29457 + }, + { + "epoch": 11.979666531110208, + "grad_norm": 0.05401350399228053, + "learning_rate": 3.2137460701589294e-07, + "loss": 0.0005, + "step": 29458 + }, + { + "epoch": 11.980073200488004, + "grad_norm": 0.017632695049783532, + "learning_rate": 3.2111986029761067e-07, + "loss": 0.0001, + "step": 29459 + }, + { + "epoch": 11.9804798698658, + "grad_norm": 0.029032024154281525, + "learning_rate": 3.2086521293790527e-07, + "loss": 0.0002, + "step": 29460 + }, + { + "epoch": 11.980886539243595, + "grad_norm": 1.06803678093662, + "learning_rate": 3.206106649393903e-07, + "loss": 0.0092, + "step": 29461 + }, + { + "epoch": 11.981293208621391, + "grad_norm": 
0.005695485597193757, + "learning_rate": 3.2035621630467693e-07, + "loss": 0.0, + "step": 29462 + }, + { + "epoch": 11.981699877999187, + "grad_norm": 0.00902698831651499, + "learning_rate": 3.201018670363798e-07, + "loss": 0.0, + "step": 29463 + }, + { + "epoch": 11.982106547376983, + "grad_norm": 0.0004092972084712762, + "learning_rate": 3.198476171371079e-07, + "loss": 0.0, + "step": 29464 + }, + { + "epoch": 11.982513216754779, + "grad_norm": 0.40346222010318233, + "learning_rate": 3.1959346660947355e-07, + "loss": 0.0032, + "step": 29465 + }, + { + "epoch": 11.982919886132574, + "grad_norm": 0.04255735645021783, + "learning_rate": 3.193394154560836e-07, + "loss": 0.0004, + "step": 29466 + }, + { + "epoch": 11.98332655551037, + "grad_norm": 0.0015973867240778978, + "learning_rate": 3.1908546367954704e-07, + "loss": 0.0, + "step": 29467 + }, + { + "epoch": 11.983733224888166, + "grad_norm": 0.001599014927440561, + "learning_rate": 3.188316112824685e-07, + "loss": 0.0, + "step": 29468 + }, + { + "epoch": 11.984139894265962, + "grad_norm": 0.0004470867068060031, + "learning_rate": 3.185778582674581e-07, + "loss": 0.0, + "step": 29469 + }, + { + "epoch": 11.984546563643757, + "grad_norm": 0.003999061982210353, + "learning_rate": 3.1832420463711824e-07, + "loss": 0.0, + "step": 29470 + }, + { + "epoch": 11.984953233021553, + "grad_norm": 0.017625907834628012, + "learning_rate": 3.180706503940523e-07, + "loss": 0.0002, + "step": 29471 + }, + { + "epoch": 11.985359902399349, + "grad_norm": 0.04774755307936673, + "learning_rate": 3.1781719554086485e-07, + "loss": 0.0005, + "step": 29472 + }, + { + "epoch": 11.985766571777145, + "grad_norm": 0.20041983207250713, + "learning_rate": 3.1756384008015616e-07, + "loss": 0.0017, + "step": 29473 + }, + { + "epoch": 11.98617324115494, + "grad_norm": 0.0037834549776735284, + "learning_rate": 3.173105840145285e-07, + "loss": 0.0, + "step": 29474 + }, + { + "epoch": 11.986579910532736, + "grad_norm": 0.12183257862354037, + "learning_rate": 3.170574273465787e-07, + "loss": 0.0008, + "step": 29475 + }, + { + "epoch": 11.986986579910532, + "grad_norm": 0.007753088822975072, + "learning_rate": 3.168043700789103e-07, + "loss": 0.0001, + "step": 29476 + }, + { + "epoch": 11.987393249288328, + "grad_norm": 0.013002190301758077, + "learning_rate": 3.165514122141178e-07, + "loss": 0.0001, + "step": 29477 + }, + { + "epoch": 11.987799918666125, + "grad_norm": 0.0007729135675648362, + "learning_rate": 3.1629855375479803e-07, + "loss": 0.0, + "step": 29478 + }, + { + "epoch": 11.988206588043921, + "grad_norm": 0.0007573057180719651, + "learning_rate": 3.160457947035478e-07, + "loss": 0.0, + "step": 29479 + }, + { + "epoch": 11.988613257421717, + "grad_norm": 0.007436577046613684, + "learning_rate": 3.157931350629617e-07, + "loss": 0.0001, + "step": 29480 + }, + { + "epoch": 11.989019926799513, + "grad_norm": 8.644638875435896e-05, + "learning_rate": 3.1554057483563103e-07, + "loss": 0.0, + "step": 29481 + }, + { + "epoch": 11.989426596177308, + "grad_norm": 2.6403440377343947e-05, + "learning_rate": 3.152881140241526e-07, + "loss": 0.0, + "step": 29482 + }, + { + "epoch": 11.989833265555104, + "grad_norm": 0.002341826874157418, + "learning_rate": 3.150357526311154e-07, + "loss": 0.0, + "step": 29483 + }, + { + "epoch": 11.9902399349329, + "grad_norm": 0.007709702097455018, + "learning_rate": 3.147834906591107e-07, + "loss": 0.0001, + "step": 29484 + }, + { + "epoch": 11.990646604310696, + "grad_norm": 0.00033278166767861225, + "learning_rate": 3.1453132811072764e-07, 
+ "loss": 0.0, + "step": 29485 + }, + { + "epoch": 11.991053273688491, + "grad_norm": 0.19473165151035585, + "learning_rate": 3.142792649885551e-07, + "loss": 0.0012, + "step": 29486 + }, + { + "epoch": 11.991459943066287, + "grad_norm": 2.08400026988271e-05, + "learning_rate": 3.140273012951811e-07, + "loss": 0.0, + "step": 29487 + }, + { + "epoch": 11.991866612444083, + "grad_norm": 0.00020533421607521102, + "learning_rate": 3.1377543703319026e-07, + "loss": 0.0, + "step": 29488 + }, + { + "epoch": 11.992273281821879, + "grad_norm": 0.0014787120280985008, + "learning_rate": 3.1352367220517043e-07, + "loss": 0.0, + "step": 29489 + }, + { + "epoch": 11.992679951199674, + "grad_norm": 0.0007686175349497885, + "learning_rate": 3.132720068137052e-07, + "loss": 0.0, + "step": 29490 + }, + { + "epoch": 11.99308662057747, + "grad_norm": 0.3311930547011213, + "learning_rate": 3.1302044086137797e-07, + "loss": 0.0033, + "step": 29491 + }, + { + "epoch": 11.993493289955266, + "grad_norm": 0.0009854238046902767, + "learning_rate": 3.127689743507722e-07, + "loss": 0.0, + "step": 29492 + }, + { + "epoch": 11.993899959333062, + "grad_norm": 0.016303527454469866, + "learning_rate": 3.125176072844671e-07, + "loss": 0.0002, + "step": 29493 + }, + { + "epoch": 11.994306628710858, + "grad_norm": 0.000601064149287656, + "learning_rate": 3.122663396650438e-07, + "loss": 0.0, + "step": 29494 + }, + { + "epoch": 11.994713298088653, + "grad_norm": 0.0015795429239631354, + "learning_rate": 3.1201517149508363e-07, + "loss": 0.0, + "step": 29495 + }, + { + "epoch": 11.995119967466449, + "grad_norm": 5.216538000092698, + "learning_rate": 3.117641027771634e-07, + "loss": 0.0661, + "step": 29496 + }, + { + "epoch": 11.995526636844245, + "grad_norm": 0.25113511924427606, + "learning_rate": 3.1151313351386106e-07, + "loss": 0.0019, + "step": 29497 + }, + { + "epoch": 11.995933306222042, + "grad_norm": 0.029584332420254055, + "learning_rate": 3.1126226370775225e-07, + "loss": 0.0002, + "step": 29498 + }, + { + "epoch": 11.996339975599838, + "grad_norm": 0.0011933517378957027, + "learning_rate": 3.110114933614117e-07, + "loss": 0.0, + "step": 29499 + }, + { + "epoch": 11.996746644977634, + "grad_norm": 0.011509272534018528, + "learning_rate": 3.107608224774161e-07, + "loss": 0.0001, + "step": 29500 + }, + { + "epoch": 11.99715331435543, + "grad_norm": 0.019442047856254387, + "learning_rate": 3.1051025105833463e-07, + "loss": 0.0001, + "step": 29501 + }, + { + "epoch": 11.997559983733225, + "grad_norm": 0.00020667717852634293, + "learning_rate": 3.102597791067441e-07, + "loss": 0.0, + "step": 29502 + }, + { + "epoch": 11.997966653111021, + "grad_norm": 0.24667576572004846, + "learning_rate": 3.100094066252135e-07, + "loss": 0.0021, + "step": 29503 + }, + { + "epoch": 11.998373322488817, + "grad_norm": 0.23429552551683316, + "learning_rate": 3.0975913361631194e-07, + "loss": 0.002, + "step": 29504 + }, + { + "epoch": 11.998779991866613, + "grad_norm": 0.04612310857924096, + "learning_rate": 3.0950896008261177e-07, + "loss": 0.0004, + "step": 29505 + }, + { + "epoch": 11.999186661244408, + "grad_norm": 0.0002182410404958592, + "learning_rate": 3.0925888602667766e-07, + "loss": 0.0, + "step": 29506 + }, + { + "epoch": 11.999593330622204, + "grad_norm": 0.23659905644583037, + "learning_rate": 3.0900891145107856e-07, + "loss": 0.002, + "step": 29507 + }, + { + "epoch": 12.0, + "grad_norm": 0.008966149886950858, + "learning_rate": 3.0875903635838035e-07, + "loss": 0.0, + "step": 29508 + }, + { + "epoch": 12.000406669377796, + 
"grad_norm": 0.0015639012314259675, + "learning_rate": 3.085092607511486e-07, + "loss": 0.0, + "step": 29509 + }, + { + "epoch": 12.000813338755592, + "grad_norm": 0.03301209498771733, + "learning_rate": 3.0825958463194805e-07, + "loss": 0.0003, + "step": 29510 + }, + { + "epoch": 12.001220008133387, + "grad_norm": 0.009230617430321688, + "learning_rate": 3.080100080033388e-07, + "loss": 0.0001, + "step": 29511 + }, + { + "epoch": 12.001626677511183, + "grad_norm": 0.0016021396880281202, + "learning_rate": 3.077605308678855e-07, + "loss": 0.0, + "step": 29512 + }, + { + "epoch": 12.002033346888979, + "grad_norm": 0.08299152827512651, + "learning_rate": 3.075111532281483e-07, + "loss": 0.0006, + "step": 29513 + }, + { + "epoch": 12.002440016266775, + "grad_norm": 0.0009108066463467651, + "learning_rate": 3.0726187508668624e-07, + "loss": 0.0, + "step": 29514 + }, + { + "epoch": 12.00284668564457, + "grad_norm": 0.01585950570859619, + "learning_rate": 3.070126964460607e-07, + "loss": 0.0001, + "step": 29515 + }, + { + "epoch": 12.003253355022366, + "grad_norm": 0.0015443495020265347, + "learning_rate": 3.0676361730882733e-07, + "loss": 0.0, + "step": 29516 + }, + { + "epoch": 12.003660024400162, + "grad_norm": 0.004633174459504433, + "learning_rate": 3.065146376775441e-07, + "loss": 0.0, + "step": 29517 + }, + { + "epoch": 12.004066693777958, + "grad_norm": 0.05506424740845041, + "learning_rate": 3.0626575755476784e-07, + "loss": 0.0002, + "step": 29518 + }, + { + "epoch": 12.004473363155755, + "grad_norm": 0.004993210146215724, + "learning_rate": 3.06016976943051e-07, + "loss": 0.0, + "step": 29519 + }, + { + "epoch": 12.004880032533551, + "grad_norm": 0.004699741993891653, + "learning_rate": 3.0576829584494816e-07, + "loss": 0.0, + "step": 29520 + }, + { + "epoch": 12.005286701911347, + "grad_norm": 0.0004644797623083488, + "learning_rate": 3.0551971426301397e-07, + "loss": 0.0, + "step": 29521 + }, + { + "epoch": 12.005693371289142, + "grad_norm": 0.0017210711017726435, + "learning_rate": 3.052712321997986e-07, + "loss": 0.0, + "step": 29522 + }, + { + "epoch": 12.006100040666938, + "grad_norm": 0.005871128393485992, + "learning_rate": 3.0502284965785446e-07, + "loss": 0.0, + "step": 29523 + }, + { + "epoch": 12.006506710044734, + "grad_norm": 0.15602899795269368, + "learning_rate": 3.047745666397295e-07, + "loss": 0.0019, + "step": 29524 + }, + { + "epoch": 12.00691337942253, + "grad_norm": 0.0006352736156383206, + "learning_rate": 3.045263831479739e-07, + "loss": 0.0, + "step": 29525 + }, + { + "epoch": 12.007320048800326, + "grad_norm": 5.1655436469730874e-05, + "learning_rate": 3.042782991851323e-07, + "loss": 0.0, + "step": 29526 + }, + { + "epoch": 12.007726718178121, + "grad_norm": 0.004902882332097642, + "learning_rate": 3.040303147537549e-07, + "loss": 0.0, + "step": 29527 + }, + { + "epoch": 12.008133387555917, + "grad_norm": 0.03767483036143687, + "learning_rate": 3.0378242985638626e-07, + "loss": 0.0003, + "step": 29528 + }, + { + "epoch": 12.008540056933713, + "grad_norm": 0.0026982102872646504, + "learning_rate": 3.035346444955711e-07, + "loss": 0.0, + "step": 29529 + }, + { + "epoch": 12.008946726311509, + "grad_norm": 0.02000958631795218, + "learning_rate": 3.032869586738529e-07, + "loss": 0.0002, + "step": 29530 + }, + { + "epoch": 12.009353395689304, + "grad_norm": 8.374582767428504e-05, + "learning_rate": 3.0303937239377523e-07, + "loss": 0.0, + "step": 29531 + }, + { + "epoch": 12.0097600650671, + "grad_norm": 0.08400635226178008, + "learning_rate": 
3.0279188565787597e-07, + "loss": 0.0008, + "step": 29532 + }, + { + "epoch": 12.010166734444896, + "grad_norm": 0.000664226126728112, + "learning_rate": 3.0254449846870094e-07, + "loss": 0.0, + "step": 29533 + }, + { + "epoch": 12.010573403822692, + "grad_norm": 0.018849871642348037, + "learning_rate": 3.0229721082878695e-07, + "loss": 0.0001, + "step": 29534 + }, + { + "epoch": 12.010980073200487, + "grad_norm": 0.030711084821614956, + "learning_rate": 3.020500227406731e-07, + "loss": 0.0002, + "step": 29535 + }, + { + "epoch": 12.011386742578283, + "grad_norm": 0.006764733747876787, + "learning_rate": 3.0180293420689623e-07, + "loss": 0.0, + "step": 29536 + }, + { + "epoch": 12.011793411956079, + "grad_norm": 0.13026969706019995, + "learning_rate": 3.0155594522999323e-07, + "loss": 0.0006, + "step": 29537 + }, + { + "epoch": 12.012200081333875, + "grad_norm": 0.002063269830401332, + "learning_rate": 3.013090558124987e-07, + "loss": 0.0, + "step": 29538 + }, + { + "epoch": 12.012606750711672, + "grad_norm": 0.06181119471855496, + "learning_rate": 3.010622659569484e-07, + "loss": 0.0004, + "step": 29539 + }, + { + "epoch": 12.013013420089468, + "grad_norm": 0.1391657307542487, + "learning_rate": 3.0081557566587595e-07, + "loss": 0.0014, + "step": 29540 + }, + { + "epoch": 12.013420089467264, + "grad_norm": 0.06421587497077264, + "learning_rate": 3.005689849418125e-07, + "loss": 0.0003, + "step": 29541 + }, + { + "epoch": 12.01382675884506, + "grad_norm": 0.007830698015892756, + "learning_rate": 3.003224937872906e-07, + "loss": 0.0001, + "step": 29542 + }, + { + "epoch": 12.014233428222855, + "grad_norm": 0.00011533483110942112, + "learning_rate": 3.0007610220483927e-07, + "loss": 0.0, + "step": 29543 + }, + { + "epoch": 12.014640097600651, + "grad_norm": 0.008569517913246049, + "learning_rate": 2.9982981019698985e-07, + "loss": 0.0001, + "step": 29544 + }, + { + "epoch": 12.015046766978447, + "grad_norm": 0.08261145080974483, + "learning_rate": 2.995836177662681e-07, + "loss": 0.0006, + "step": 29545 + }, + { + "epoch": 12.015453436356243, + "grad_norm": 0.008070861376314171, + "learning_rate": 2.9933752491520306e-07, + "loss": 0.0001, + "step": 29546 + }, + { + "epoch": 12.015860105734038, + "grad_norm": 0.0002879678809812135, + "learning_rate": 2.9909153164632056e-07, + "loss": 0.0, + "step": 29547 + }, + { + "epoch": 12.016266775111834, + "grad_norm": 0.0021760025240615904, + "learning_rate": 2.9884563796214627e-07, + "loss": 0.0, + "step": 29548 + }, + { + "epoch": 12.01667344448963, + "grad_norm": 0.02924167907702061, + "learning_rate": 2.985998438652038e-07, + "loss": 0.0002, + "step": 29549 + }, + { + "epoch": 12.017080113867426, + "grad_norm": 3.279614010879545e-05, + "learning_rate": 2.9835414935801667e-07, + "loss": 0.0, + "step": 29550 + }, + { + "epoch": 12.017486783245221, + "grad_norm": 0.04132120561432232, + "learning_rate": 2.9810855444310724e-07, + "loss": 0.0002, + "step": 29551 + }, + { + "epoch": 12.017893452623017, + "grad_norm": 0.0005852683521412661, + "learning_rate": 2.978630591229947e-07, + "loss": 0.0, + "step": 29552 + }, + { + "epoch": 12.018300122000813, + "grad_norm": 0.015610978779424396, + "learning_rate": 2.976176634002026e-07, + "loss": 0.0, + "step": 29553 + }, + { + "epoch": 12.018706791378609, + "grad_norm": 0.0007426640389745593, + "learning_rate": 2.9737236727724773e-07, + "loss": 0.0, + "step": 29554 + }, + { + "epoch": 12.019113460756405, + "grad_norm": 0.028517080857794214, + "learning_rate": 2.971271707566481e-07, + "loss": 0.0002, + "step": 
29555 + }, + { + "epoch": 12.0195201301342, + "grad_norm": 0.009632738132132052, + "learning_rate": 2.968820738409228e-07, + "loss": 0.0001, + "step": 29556 + }, + { + "epoch": 12.019926799511996, + "grad_norm": 0.014356719741086581, + "learning_rate": 2.9663707653258543e-07, + "loss": 0.0001, + "step": 29557 + }, + { + "epoch": 12.020333468889792, + "grad_norm": 0.0013091355064529292, + "learning_rate": 2.9639217883415173e-07, + "loss": 0.0, + "step": 29558 + }, + { + "epoch": 12.020740138267588, + "grad_norm": 0.032012937843508345, + "learning_rate": 2.9614738074813633e-07, + "loss": 0.0003, + "step": 29559 + }, + { + "epoch": 12.021146807645385, + "grad_norm": 0.04798071871199825, + "learning_rate": 2.9590268227705276e-07, + "loss": 0.0003, + "step": 29560 + }, + { + "epoch": 12.021553477023181, + "grad_norm": 0.0064802748299992614, + "learning_rate": 2.956580834234124e-07, + "loss": 0.0001, + "step": 29561 + }, + { + "epoch": 12.021960146400977, + "grad_norm": 0.001869771026189135, + "learning_rate": 2.954135841897243e-07, + "loss": 0.0, + "step": 29562 + }, + { + "epoch": 12.022366815778772, + "grad_norm": 0.021517588648820873, + "learning_rate": 2.9516918457850097e-07, + "loss": 0.0001, + "step": 29563 + }, + { + "epoch": 12.022773485156568, + "grad_norm": 0.021471741503701357, + "learning_rate": 2.9492488459225034e-07, + "loss": 0.0002, + "step": 29564 + }, + { + "epoch": 12.023180154534364, + "grad_norm": 0.0012422442106559363, + "learning_rate": 2.9468068423347927e-07, + "loss": 0.0, + "step": 29565 + }, + { + "epoch": 12.02358682391216, + "grad_norm": 0.02877633342790975, + "learning_rate": 2.9443658350469693e-07, + "loss": 0.0003, + "step": 29566 + }, + { + "epoch": 12.023993493289955, + "grad_norm": 0.02171068101731105, + "learning_rate": 2.9419258240840685e-07, + "loss": 0.0002, + "step": 29567 + }, + { + "epoch": 12.024400162667751, + "grad_norm": 0.011125688836531231, + "learning_rate": 2.939486809471159e-07, + "loss": 0.0001, + "step": 29568 + }, + { + "epoch": 12.024806832045547, + "grad_norm": 0.010501451729561772, + "learning_rate": 2.9370487912332546e-07, + "loss": 0.0001, + "step": 29569 + }, + { + "epoch": 12.025213501423343, + "grad_norm": 0.19438448887662685, + "learning_rate": 2.9346117693954013e-07, + "loss": 0.0014, + "step": 29570 + }, + { + "epoch": 12.025620170801139, + "grad_norm": 0.21383106408131655, + "learning_rate": 2.9321757439826014e-07, + "loss": 0.0017, + "step": 29571 + }, + { + "epoch": 12.026026840178934, + "grad_norm": 0.00041421633790654825, + "learning_rate": 2.92974071501988e-07, + "loss": 0.0, + "step": 29572 + }, + { + "epoch": 12.02643350955673, + "grad_norm": 0.0022305751051604024, + "learning_rate": 2.927306682532216e-07, + "loss": 0.0, + "step": 29573 + }, + { + "epoch": 12.026840178934526, + "grad_norm": 0.2321069009190447, + "learning_rate": 2.9248736465446124e-07, + "loss": 0.0022, + "step": 29574 + }, + { + "epoch": 12.027246848312322, + "grad_norm": 0.029429324300625437, + "learning_rate": 2.922441607082038e-07, + "loss": 0.0003, + "step": 29575 + }, + { + "epoch": 12.027653517690117, + "grad_norm": 0.0001273839081176269, + "learning_rate": 2.920010564169462e-07, + "loss": 0.0, + "step": 29576 + }, + { + "epoch": 12.028060187067913, + "grad_norm": 0.004704013615595983, + "learning_rate": 2.9175805178318306e-07, + "loss": 0.0, + "step": 29577 + }, + { + "epoch": 12.028466856445709, + "grad_norm": 0.01888831882723801, + "learning_rate": 2.9151514680940904e-07, + "loss": 0.0002, + "step": 29578 + }, + { + "epoch": 12.028873525823505, + 
"grad_norm": 0.1582155024941879, + "learning_rate": 2.912723414981189e-07, + "loss": 0.0014, + "step": 29579 + }, + { + "epoch": 12.029280195201302, + "grad_norm": 0.002907953668540032, + "learning_rate": 2.910296358518039e-07, + "loss": 0.0, + "step": 29580 + }, + { + "epoch": 12.029686864579098, + "grad_norm": 0.03223695912770833, + "learning_rate": 2.907870298729565e-07, + "loss": 0.0001, + "step": 29581 + }, + { + "epoch": 12.030093533956894, + "grad_norm": 0.002375082846662981, + "learning_rate": 2.9054452356406804e-07, + "loss": 0.0, + "step": 29582 + }, + { + "epoch": 12.03050020333469, + "grad_norm": 0.9011286133599671, + "learning_rate": 2.9030211692762543e-07, + "loss": 0.0057, + "step": 29583 + }, + { + "epoch": 12.030906872712485, + "grad_norm": 0.030139362217642043, + "learning_rate": 2.900598099661178e-07, + "loss": 0.0002, + "step": 29584 + }, + { + "epoch": 12.031313542090281, + "grad_norm": 0.001125364429966673, + "learning_rate": 2.8981760268203317e-07, + "loss": 0.0, + "step": 29585 + }, + { + "epoch": 12.031720211468077, + "grad_norm": 0.0009518647470280839, + "learning_rate": 2.895754950778595e-07, + "loss": 0.0, + "step": 29586 + }, + { + "epoch": 12.032126880845873, + "grad_norm": 0.08371988172756391, + "learning_rate": 2.8933348715607933e-07, + "loss": 0.0007, + "step": 29587 + }, + { + "epoch": 12.032533550223668, + "grad_norm": 2.179696819784603e-05, + "learning_rate": 2.890915789191784e-07, + "loss": 0.0, + "step": 29588 + }, + { + "epoch": 12.032940219601464, + "grad_norm": 0.00021073704269181006, + "learning_rate": 2.888497703696402e-07, + "loss": 0.0, + "step": 29589 + }, + { + "epoch": 12.03334688897926, + "grad_norm": 0.0005140741860769379, + "learning_rate": 2.8860806150994623e-07, + "loss": 0.0, + "step": 29590 + }, + { + "epoch": 12.033753558357056, + "grad_norm": 0.0013912021463404454, + "learning_rate": 2.8836645234257774e-07, + "loss": 0.0, + "step": 29591 + }, + { + "epoch": 12.034160227734851, + "grad_norm": 0.003099053979693218, + "learning_rate": 2.88124942870015e-07, + "loss": 0.0, + "step": 29592 + }, + { + "epoch": 12.034566897112647, + "grad_norm": 0.03431569515320401, + "learning_rate": 2.878835330947394e-07, + "loss": 0.0003, + "step": 29593 + }, + { + "epoch": 12.034973566490443, + "grad_norm": 0.014101089777267099, + "learning_rate": 2.876422230192266e-07, + "loss": 0.0001, + "step": 29594 + }, + { + "epoch": 12.035380235868239, + "grad_norm": 0.0008435917736830148, + "learning_rate": 2.8740101264595364e-07, + "loss": 0.0, + "step": 29595 + }, + { + "epoch": 12.035786905246034, + "grad_norm": 0.06098355454050259, + "learning_rate": 2.871599019773985e-07, + "loss": 0.0003, + "step": 29596 + }, + { + "epoch": 12.03619357462383, + "grad_norm": 0.0014395007816097578, + "learning_rate": 2.8691889101603474e-07, + "loss": 0.0, + "step": 29597 + }, + { + "epoch": 12.036600244001626, + "grad_norm": 0.00023035299380971957, + "learning_rate": 2.8667797976433707e-07, + "loss": 0.0, + "step": 29598 + }, + { + "epoch": 12.037006913379422, + "grad_norm": 0.0074424754985980114, + "learning_rate": 2.8643716822477905e-07, + "loss": 0.0001, + "step": 29599 + }, + { + "epoch": 12.037413582757218, + "grad_norm": 0.034366652642384816, + "learning_rate": 2.8619645639983207e-07, + "loss": 0.0004, + "step": 29600 + }, + { + "epoch": 12.037820252135015, + "grad_norm": 2.5867757855029425e-05, + "learning_rate": 2.8595584429196745e-07, + "loss": 0.0, + "step": 29601 + }, + { + "epoch": 12.03822692151281, + "grad_norm": 0.015230286901033264, + "learning_rate": 
2.857153319036554e-07, + "loss": 0.0001, + "step": 29602 + }, + { + "epoch": 12.038633590890607, + "grad_norm": 0.005243900218826324, + "learning_rate": 2.85474919237364e-07, + "loss": 0.0, + "step": 29603 + }, + { + "epoch": 12.039040260268402, + "grad_norm": 0.10724924854710689, + "learning_rate": 2.8523460629556024e-07, + "loss": 0.0006, + "step": 29604 + }, + { + "epoch": 12.039446929646198, + "grad_norm": 0.0027870642781025728, + "learning_rate": 2.849943930807142e-07, + "loss": 0.0, + "step": 29605 + }, + { + "epoch": 12.039853599023994, + "grad_norm": 0.028158740539132778, + "learning_rate": 2.847542795952907e-07, + "loss": 0.0003, + "step": 29606 + }, + { + "epoch": 12.04026026840179, + "grad_norm": 0.004511526164243993, + "learning_rate": 2.8451426584175324e-07, + "loss": 0.0, + "step": 29607 + }, + { + "epoch": 12.040666937779585, + "grad_norm": 0.004152276800000735, + "learning_rate": 2.8427435182256767e-07, + "loss": 0.0, + "step": 29608 + }, + { + "epoch": 12.041073607157381, + "grad_norm": 0.1012109623935125, + "learning_rate": 2.840345375401943e-07, + "loss": 0.001, + "step": 29609 + }, + { + "epoch": 12.041480276535177, + "grad_norm": 0.005842872828381225, + "learning_rate": 2.837948229970966e-07, + "loss": 0.0, + "step": 29610 + }, + { + "epoch": 12.041886945912973, + "grad_norm": 0.001315943495594555, + "learning_rate": 2.835552081957349e-07, + "loss": 0.0, + "step": 29611 + }, + { + "epoch": 12.042293615290768, + "grad_norm": 0.0009997357772692796, + "learning_rate": 2.8331569313857053e-07, + "loss": 0.0, + "step": 29612 + }, + { + "epoch": 12.042700284668564, + "grad_norm": 0.00042280375596317874, + "learning_rate": 2.8307627782805937e-07, + "loss": 0.0, + "step": 29613 + }, + { + "epoch": 12.04310695404636, + "grad_norm": 0.03971467923195441, + "learning_rate": 2.828369622666616e-07, + "loss": 0.0002, + "step": 29614 + }, + { + "epoch": 12.043513623424156, + "grad_norm": 0.00017135479607327688, + "learning_rate": 2.82597746456833e-07, + "loss": 0.0, + "step": 29615 + }, + { + "epoch": 12.043920292801952, + "grad_norm": 0.006677063668938238, + "learning_rate": 2.823586304010295e-07, + "loss": 0.0, + "step": 29616 + }, + { + "epoch": 12.044326962179747, + "grad_norm": 0.004193730438049994, + "learning_rate": 2.8211961410170353e-07, + "loss": 0.0, + "step": 29617 + }, + { + "epoch": 12.044733631557543, + "grad_norm": 0.11835476783951746, + "learning_rate": 2.8188069756131197e-07, + "loss": 0.0007, + "step": 29618 + }, + { + "epoch": 12.045140300935339, + "grad_norm": 0.0001969747009464739, + "learning_rate": 2.8164188078230625e-07, + "loss": 0.0, + "step": 29619 + }, + { + "epoch": 12.045546970313135, + "grad_norm": 0.012561749260423588, + "learning_rate": 2.8140316376713773e-07, + "loss": 0.0001, + "step": 29620 + }, + { + "epoch": 12.045953639690932, + "grad_norm": 1.9392588487740116e-05, + "learning_rate": 2.811645465182566e-07, + "loss": 0.0, + "step": 29621 + }, + { + "epoch": 12.046360309068728, + "grad_norm": 0.017466560403987422, + "learning_rate": 2.8092602903811326e-07, + "loss": 0.0002, + "step": 29622 + }, + { + "epoch": 12.046766978446524, + "grad_norm": 0.00281023718919091, + "learning_rate": 2.8068761132915454e-07, + "loss": 0.0, + "step": 29623 + }, + { + "epoch": 12.04717364782432, + "grad_norm": 0.011813934826651204, + "learning_rate": 2.8044929339383075e-07, + "loss": 0.0001, + "step": 29624 + }, + { + "epoch": 12.047580317202115, + "grad_norm": 0.022017857293702314, + "learning_rate": 2.802110752345866e-07, + "loss": 0.0002, + "step": 29625 + }, + { + 
"epoch": 12.047986986579911, + "grad_norm": 0.051141057358308886, + "learning_rate": 2.799729568538667e-07, + "loss": 0.0005, + "step": 29626 + }, + { + "epoch": 12.048393655957707, + "grad_norm": 0.018151206789952704, + "learning_rate": 2.7973493825411703e-07, + "loss": 0.0001, + "step": 29627 + }, + { + "epoch": 12.048800325335502, + "grad_norm": 0.03518682767725639, + "learning_rate": 2.7949701943777995e-07, + "loss": 0.0003, + "step": 29628 + }, + { + "epoch": 12.049206994713298, + "grad_norm": 0.00011137586051450684, + "learning_rate": 2.7925920040729805e-07, + "loss": 0.0, + "step": 29629 + }, + { + "epoch": 12.049613664091094, + "grad_norm": 0.002798272599795883, + "learning_rate": 2.7902148116511265e-07, + "loss": 0.0, + "step": 29630 + }, + { + "epoch": 12.05002033346889, + "grad_norm": 0.0067479030583650935, + "learning_rate": 2.787838617136651e-07, + "loss": 0.0001, + "step": 29631 + }, + { + "epoch": 12.050427002846686, + "grad_norm": 0.04980291041123456, + "learning_rate": 2.7854634205539355e-07, + "loss": 0.0002, + "step": 29632 + }, + { + "epoch": 12.050833672224481, + "grad_norm": 0.00045736911647901784, + "learning_rate": 2.7830892219273596e-07, + "loss": 0.0, + "step": 29633 + }, + { + "epoch": 12.051240341602277, + "grad_norm": 0.0296149603601344, + "learning_rate": 2.7807160212813045e-07, + "loss": 0.0002, + "step": 29634 + }, + { + "epoch": 12.051647010980073, + "grad_norm": 0.0006499370208393104, + "learning_rate": 2.778343818640117e-07, + "loss": 0.0, + "step": 29635 + }, + { + "epoch": 12.052053680357869, + "grad_norm": 0.053943143709956905, + "learning_rate": 2.7759726140281776e-07, + "loss": 0.0005, + "step": 29636 + }, + { + "epoch": 12.052460349735664, + "grad_norm": 0.017333373915915804, + "learning_rate": 2.773602407469811e-07, + "loss": 0.0001, + "step": 29637 + }, + { + "epoch": 12.05286701911346, + "grad_norm": 0.006357062899509599, + "learning_rate": 2.7712331989893426e-07, + "loss": 0.0, + "step": 29638 + }, + { + "epoch": 12.053273688491256, + "grad_norm": 0.1935002434222357, + "learning_rate": 2.768864988611097e-07, + "loss": 0.0013, + "step": 29639 + }, + { + "epoch": 12.053680357869052, + "grad_norm": 0.002815060782725263, + "learning_rate": 2.7664977763594e-07, + "loss": 0.0, + "step": 29640 + }, + { + "epoch": 12.054087027246847, + "grad_norm": 0.002239787332716406, + "learning_rate": 2.7641315622585317e-07, + "loss": 0.0, + "step": 29641 + }, + { + "epoch": 12.054493696624645, + "grad_norm": 0.001160091576041989, + "learning_rate": 2.7617663463327724e-07, + "loss": 0.0, + "step": 29642 + }, + { + "epoch": 12.05490036600244, + "grad_norm": 0.003464603852082243, + "learning_rate": 2.7594021286064476e-07, + "loss": 0.0, + "step": 29643 + }, + { + "epoch": 12.055307035380237, + "grad_norm": 0.01355833342679585, + "learning_rate": 2.757038909103793e-07, + "loss": 0.0001, + "step": 29644 + }, + { + "epoch": 12.055713704758032, + "grad_norm": 0.00043742054533929455, + "learning_rate": 2.754676687849067e-07, + "loss": 0.0, + "step": 29645 + }, + { + "epoch": 12.056120374135828, + "grad_norm": 0.005518734115527795, + "learning_rate": 2.7523154648665395e-07, + "loss": 0.0001, + "step": 29646 + }, + { + "epoch": 12.056527043513624, + "grad_norm": 0.022710957390307186, + "learning_rate": 2.749955240180435e-07, + "loss": 0.0002, + "step": 29647 + }, + { + "epoch": 12.05693371289142, + "grad_norm": 0.08365647239969556, + "learning_rate": 2.747596013814968e-07, + "loss": 0.0009, + "step": 29648 + }, + { + "epoch": 12.057340382269215, + "grad_norm": 
0.034460156351723704, + "learning_rate": 2.7452377857943856e-07, + "loss": 0.0003, + "step": 29649 + }, + { + "epoch": 12.057747051647011, + "grad_norm": 0.0006541099530902129, + "learning_rate": 2.742880556142891e-07, + "loss": 0.0, + "step": 29650 + }, + { + "epoch": 12.058153721024807, + "grad_norm": 0.008422335030332379, + "learning_rate": 2.7405243248846747e-07, + "loss": 0.0001, + "step": 29651 + }, + { + "epoch": 12.058560390402603, + "grad_norm": 0.002017612117714492, + "learning_rate": 2.7381690920439187e-07, + "loss": 0.0, + "step": 29652 + }, + { + "epoch": 12.058967059780398, + "grad_norm": 0.01999294838198207, + "learning_rate": 2.735814857644814e-07, + "loss": 0.0002, + "step": 29653 + }, + { + "epoch": 12.059373729158194, + "grad_norm": 0.004084922897453685, + "learning_rate": 2.7334616217115195e-07, + "loss": 0.0, + "step": 29654 + }, + { + "epoch": 12.05978039853599, + "grad_norm": 0.003915530539646205, + "learning_rate": 2.7311093842681826e-07, + "loss": 0.0, + "step": 29655 + }, + { + "epoch": 12.060187067913786, + "grad_norm": 0.0007403154354464808, + "learning_rate": 2.728758145338983e-07, + "loss": 0.0, + "step": 29656 + }, + { + "epoch": 12.060593737291581, + "grad_norm": 0.007017240926367821, + "learning_rate": 2.7264079049480255e-07, + "loss": 0.0001, + "step": 29657 + }, + { + "epoch": 12.061000406669377, + "grad_norm": 0.017586361410411987, + "learning_rate": 2.7240586631194553e-07, + "loss": 0.0002, + "step": 29658 + }, + { + "epoch": 12.061407076047173, + "grad_norm": 0.008889058263817128, + "learning_rate": 2.721710419877377e-07, + "loss": 0.0001, + "step": 29659 + }, + { + "epoch": 12.061813745424969, + "grad_norm": 0.00800850269921869, + "learning_rate": 2.719363175245893e-07, + "loss": 0.0001, + "step": 29660 + }, + { + "epoch": 12.062220414802765, + "grad_norm": 0.0004121998250474989, + "learning_rate": 2.7170169292491056e-07, + "loss": 0.0, + "step": 29661 + }, + { + "epoch": 12.062627084180562, + "grad_norm": 0.005814835282996947, + "learning_rate": 2.7146716819111073e-07, + "loss": 0.0, + "step": 29662 + }, + { + "epoch": 12.063033753558358, + "grad_norm": 0.004215472976284072, + "learning_rate": 2.712327433255968e-07, + "loss": 0.0, + "step": 29663 + }, + { + "epoch": 12.063440422936154, + "grad_norm": 0.0007125507412402384, + "learning_rate": 2.709984183307757e-07, + "loss": 0.0, + "step": 29664 + }, + { + "epoch": 12.06384709231395, + "grad_norm": 0.0004069258307054947, + "learning_rate": 2.707641932090521e-07, + "loss": 0.0, + "step": 29665 + }, + { + "epoch": 12.064253761691745, + "grad_norm": 0.008650906278752485, + "learning_rate": 2.7053006796282977e-07, + "loss": 0.0, + "step": 29666 + }, + { + "epoch": 12.064660431069541, + "grad_norm": 0.06824086957724051, + "learning_rate": 2.702960425945145e-07, + "loss": 0.0004, + "step": 29667 + }, + { + "epoch": 12.065067100447337, + "grad_norm": 0.0013870786643408795, + "learning_rate": 2.700621171065043e-07, + "loss": 0.0, + "step": 29668 + }, + { + "epoch": 12.065473769825132, + "grad_norm": 0.0056766063034315315, + "learning_rate": 2.6982829150120625e-07, + "loss": 0.0001, + "step": 29669 + }, + { + "epoch": 12.065880439202928, + "grad_norm": 0.008841826992940753, + "learning_rate": 2.695945657810173e-07, + "loss": 0.0001, + "step": 29670 + }, + { + "epoch": 12.066287108580724, + "grad_norm": 0.00024252091973708085, + "learning_rate": 2.6936093994833654e-07, + "loss": 0.0, + "step": 29671 + }, + { + "epoch": 12.06669377795852, + "grad_norm": 0.00032748579295744707, + "learning_rate": 
2.691274140055644e-07, + "loss": 0.0, + "step": 29672 + }, + { + "epoch": 12.067100447336315, + "grad_norm": 0.0001821515285466711, + "learning_rate": 2.688939879550956e-07, + "loss": 0.0, + "step": 29673 + }, + { + "epoch": 12.067507116714111, + "grad_norm": 0.0062923718330132935, + "learning_rate": 2.6866066179932703e-07, + "loss": 0.0001, + "step": 29674 + }, + { + "epoch": 12.067913786091907, + "grad_norm": 0.00013590270105345612, + "learning_rate": 2.6842743554065577e-07, + "loss": 0.0, + "step": 29675 + }, + { + "epoch": 12.068320455469703, + "grad_norm": 9.359744593696744e-05, + "learning_rate": 2.6819430918147425e-07, + "loss": 0.0, + "step": 29676 + }, + { + "epoch": 12.068727124847499, + "grad_norm": 8.346876606886174e-05, + "learning_rate": 2.679612827241762e-07, + "loss": 0.0, + "step": 29677 + }, + { + "epoch": 12.069133794225294, + "grad_norm": 0.033144520456801045, + "learning_rate": 2.677283561711541e-07, + "loss": 0.0003, + "step": 29678 + }, + { + "epoch": 12.06954046360309, + "grad_norm": 0.032007481036195363, + "learning_rate": 2.674955295247983e-07, + "loss": 0.0003, + "step": 29679 + }, + { + "epoch": 12.069947132980886, + "grad_norm": 0.009605630052035102, + "learning_rate": 2.6726280278749905e-07, + "loss": 0.0001, + "step": 29680 + }, + { + "epoch": 12.070353802358682, + "grad_norm": 0.0008074508095001, + "learning_rate": 2.670301759616445e-07, + "loss": 0.0, + "step": 29681 + }, + { + "epoch": 12.070760471736477, + "grad_norm": 0.019679945962935826, + "learning_rate": 2.6679764904962377e-07, + "loss": 0.0001, + "step": 29682 + }, + { + "epoch": 12.071167141114275, + "grad_norm": 0.0009630219943033945, + "learning_rate": 2.6656522205382504e-07, + "loss": 0.0, + "step": 29683 + }, + { + "epoch": 12.07157381049207, + "grad_norm": 0.008643658767641272, + "learning_rate": 2.6633289497663306e-07, + "loss": 0.0, + "step": 29684 + }, + { + "epoch": 12.071980479869866, + "grad_norm": 0.03780533445987979, + "learning_rate": 2.661006678204314e-07, + "loss": 0.0003, + "step": 29685 + }, + { + "epoch": 12.072387149247662, + "grad_norm": 0.01586417983064972, + "learning_rate": 2.658685405876071e-07, + "loss": 0.0, + "step": 29686 + }, + { + "epoch": 12.072793818625458, + "grad_norm": 0.04445040826110061, + "learning_rate": 2.656365132805394e-07, + "loss": 0.0005, + "step": 29687 + }, + { + "epoch": 12.073200488003254, + "grad_norm": 0.012543257910505836, + "learning_rate": 2.65404585901613e-07, + "loss": 0.0001, + "step": 29688 + }, + { + "epoch": 12.07360715738105, + "grad_norm": 0.006420660178101521, + "learning_rate": 2.6517275845320824e-07, + "loss": 0.0, + "step": 29689 + }, + { + "epoch": 12.074013826758845, + "grad_norm": 0.01798100409718382, + "learning_rate": 2.649410309377043e-07, + "loss": 0.0002, + "step": 29690 + }, + { + "epoch": 12.074420496136641, + "grad_norm": 0.0009655948497072478, + "learning_rate": 2.647094033574804e-07, + "loss": 0.0, + "step": 29691 + }, + { + "epoch": 12.074827165514437, + "grad_norm": 0.03146037981177685, + "learning_rate": 2.6447787571491356e-07, + "loss": 0.0003, + "step": 29692 + }, + { + "epoch": 12.075233834892233, + "grad_norm": 0.0018713044546259897, + "learning_rate": 2.6424644801238185e-07, + "loss": 0.0, + "step": 29693 + }, + { + "epoch": 12.075640504270028, + "grad_norm": 0.0006962357998455652, + "learning_rate": 2.6401512025225894e-07, + "loss": 0.0, + "step": 29694 + }, + { + "epoch": 12.076047173647824, + "grad_norm": 0.0004719597436994692, + "learning_rate": 2.637838924369218e-07, + "loss": 0.0, + "step": 29695 + }, + { 
+ "epoch": 12.07645384302562, + "grad_norm": 0.037603303350881356, + "learning_rate": 2.635527645687441e-07, + "loss": 0.0004, + "step": 29696 + }, + { + "epoch": 12.076860512403416, + "grad_norm": 0.009103866444592636, + "learning_rate": 2.633217366500962e-07, + "loss": 0.0001, + "step": 29697 + }, + { + "epoch": 12.077267181781211, + "grad_norm": 0.005495625088452147, + "learning_rate": 2.630908086833517e-07, + "loss": 0.0, + "step": 29698 + }, + { + "epoch": 12.077673851159007, + "grad_norm": 0.0016139347514516529, + "learning_rate": 2.628599806708798e-07, + "loss": 0.0, + "step": 29699 + }, + { + "epoch": 12.078080520536803, + "grad_norm": 0.02418379463067831, + "learning_rate": 2.626292526150509e-07, + "loss": 0.0002, + "step": 29700 + }, + { + "epoch": 12.078487189914599, + "grad_norm": 0.003471349277902421, + "learning_rate": 2.623986245182342e-07, + "loss": 0.0, + "step": 29701 + }, + { + "epoch": 12.078893859292394, + "grad_norm": 0.0023966855950014335, + "learning_rate": 2.621680963827955e-07, + "loss": 0.0, + "step": 29702 + }, + { + "epoch": 12.079300528670192, + "grad_norm": 0.008242347477590997, + "learning_rate": 2.61937668211103e-07, + "loss": 0.0, + "step": 29703 + }, + { + "epoch": 12.079707198047988, + "grad_norm": 0.0010170697947182623, + "learning_rate": 2.6170734000552146e-07, + "loss": 0.0, + "step": 29704 + }, + { + "epoch": 12.080113867425784, + "grad_norm": 0.001970268573222663, + "learning_rate": 2.614771117684156e-07, + "loss": 0.0, + "step": 29705 + }, + { + "epoch": 12.08052053680358, + "grad_norm": 0.01975817145426119, + "learning_rate": 2.612469835021481e-07, + "loss": 0.0002, + "step": 29706 + }, + { + "epoch": 12.080927206181375, + "grad_norm": 0.001964243082518845, + "learning_rate": 2.610169552090802e-07, + "loss": 0.0, + "step": 29707 + }, + { + "epoch": 12.08133387555917, + "grad_norm": 0.0024682587525802395, + "learning_rate": 2.6078702689157577e-07, + "loss": 0.0, + "step": 29708 + }, + { + "epoch": 12.081740544936967, + "grad_norm": 0.03780466893857506, + "learning_rate": 2.60557198551995e-07, + "loss": 0.0002, + "step": 29709 + }, + { + "epoch": 12.082147214314762, + "grad_norm": 6.693441278921338e-05, + "learning_rate": 2.603274701926961e-07, + "loss": 0.0, + "step": 29710 + }, + { + "epoch": 12.082553883692558, + "grad_norm": 0.001349186336893538, + "learning_rate": 2.6009784181603716e-07, + "loss": 0.0, + "step": 29711 + }, + { + "epoch": 12.082960553070354, + "grad_norm": 0.0011863676499617574, + "learning_rate": 2.5986831342437624e-07, + "loss": 0.0, + "step": 29712 + }, + { + "epoch": 12.08336722244815, + "grad_norm": 0.6819438920642618, + "learning_rate": 2.5963888502006816e-07, + "loss": 0.0066, + "step": 29713 + }, + { + "epoch": 12.083773891825945, + "grad_norm": 0.0036364301046631024, + "learning_rate": 2.5940955660546995e-07, + "loss": 0.0, + "step": 29714 + }, + { + "epoch": 12.084180561203741, + "grad_norm": 0.00021608694452879433, + "learning_rate": 2.5918032818293523e-07, + "loss": 0.0, + "step": 29715 + }, + { + "epoch": 12.084587230581537, + "grad_norm": 0.00045734876289048933, + "learning_rate": 2.5895119975481666e-07, + "loss": 0.0, + "step": 29716 + }, + { + "epoch": 12.084993899959333, + "grad_norm": 0.190979835293037, + "learning_rate": 2.5872217132346665e-07, + "loss": 0.0015, + "step": 29717 + }, + { + "epoch": 12.085400569337128, + "grad_norm": 0.009523678223469095, + "learning_rate": 2.5849324289123567e-07, + "loss": 0.0001, + "step": 29718 + }, + { + "epoch": 12.085807238714924, + "grad_norm": 0.001783024647012575, + 
"learning_rate": 2.582644144604751e-07, + "loss": 0.0, + "step": 29719 + }, + { + "epoch": 12.08621390809272, + "grad_norm": 0.0016420585027260345, + "learning_rate": 2.5803568603353204e-07, + "loss": 0.0, + "step": 29720 + }, + { + "epoch": 12.086620577470516, + "grad_norm": 0.001586361957989014, + "learning_rate": 2.5780705761275673e-07, + "loss": 0.0, + "step": 29721 + }, + { + "epoch": 12.087027246848312, + "grad_norm": 0.017081704333317876, + "learning_rate": 2.575785292004951e-07, + "loss": 0.0001, + "step": 29722 + }, + { + "epoch": 12.087433916226107, + "grad_norm": 0.008277475891191361, + "learning_rate": 2.5735010079909306e-07, + "loss": 0.0, + "step": 29723 + }, + { + "epoch": 12.087840585603905, + "grad_norm": 0.0022224364398897804, + "learning_rate": 2.571217724108965e-07, + "loss": 0.0, + "step": 29724 + }, + { + "epoch": 12.0882472549817, + "grad_norm": 0.017080030360918777, + "learning_rate": 2.5689354403824694e-07, + "loss": 0.0001, + "step": 29725 + }, + { + "epoch": 12.088653924359496, + "grad_norm": 4.265454224885108e-05, + "learning_rate": 2.566654156834891e-07, + "loss": 0.0, + "step": 29726 + }, + { + "epoch": 12.089060593737292, + "grad_norm": 0.018261893137078305, + "learning_rate": 2.564373873489645e-07, + "loss": 0.0002, + "step": 29727 + }, + { + "epoch": 12.089467263115088, + "grad_norm": 0.00035876323279146385, + "learning_rate": 2.5620945903701453e-07, + "loss": 0.0, + "step": 29728 + }, + { + "epoch": 12.089873932492884, + "grad_norm": 0.047843001397194156, + "learning_rate": 2.559816307499785e-07, + "loss": 0.0003, + "step": 29729 + }, + { + "epoch": 12.09028060187068, + "grad_norm": 0.007760372634533486, + "learning_rate": 2.557539024901945e-07, + "loss": 0.0001, + "step": 29730 + }, + { + "epoch": 12.090687271248475, + "grad_norm": 0.00038662564972228295, + "learning_rate": 2.555262742600007e-07, + "loss": 0.0, + "step": 29731 + }, + { + "epoch": 12.091093940626271, + "grad_norm": 0.006223192678960656, + "learning_rate": 2.55298746061734e-07, + "loss": 0.0, + "step": 29732 + }, + { + "epoch": 12.091500610004067, + "grad_norm": 0.023866125463054385, + "learning_rate": 2.5507131789773044e-07, + "loss": 0.0001, + "step": 29733 + }, + { + "epoch": 12.091907279381862, + "grad_norm": 0.005587921639425383, + "learning_rate": 2.548439897703248e-07, + "loss": 0.0001, + "step": 29734 + }, + { + "epoch": 12.092313948759658, + "grad_norm": 0.008877836632440742, + "learning_rate": 2.5461676168184955e-07, + "loss": 0.0001, + "step": 29735 + }, + { + "epoch": 12.092720618137454, + "grad_norm": 0.009565186756488278, + "learning_rate": 2.543896336346374e-07, + "loss": 0.0001, + "step": 29736 + }, + { + "epoch": 12.09312728751525, + "grad_norm": 0.009388181665676464, + "learning_rate": 2.5416260563102204e-07, + "loss": 0.0001, + "step": 29737 + }, + { + "epoch": 12.093533956893046, + "grad_norm": 0.010139466289983017, + "learning_rate": 2.5393567767333036e-07, + "loss": 0.0001, + "step": 29738 + }, + { + "epoch": 12.093940626270841, + "grad_norm": 0.00014300177727470477, + "learning_rate": 2.537088497638962e-07, + "loss": 0.0, + "step": 29739 + }, + { + "epoch": 12.094347295648637, + "grad_norm": 0.0014914542586192264, + "learning_rate": 2.534821219050443e-07, + "loss": 0.0, + "step": 29740 + }, + { + "epoch": 12.094753965026433, + "grad_norm": 0.00021892576967703808, + "learning_rate": 2.53255494099105e-07, + "loss": 0.0, + "step": 29741 + }, + { + "epoch": 12.095160634404229, + "grad_norm": 0.059222600541729835, + "learning_rate": 2.530289663484031e-07, + "loss": 0.0005, 
+ "step": 29742 + }, + { + "epoch": 12.095567303782024, + "grad_norm": 0.00988033638328747, + "learning_rate": 2.528025386552646e-07, + "loss": 0.0001, + "step": 29743 + }, + { + "epoch": 12.095973973159822, + "grad_norm": 0.0054038057068283045, + "learning_rate": 2.52576211022012e-07, + "loss": 0.0, + "step": 29744 + }, + { + "epoch": 12.096380642537618, + "grad_norm": 0.044846119878757494, + "learning_rate": 2.523499834509724e-07, + "loss": 0.0002, + "step": 29745 + }, + { + "epoch": 12.096787311915413, + "grad_norm": 0.0001197446380261284, + "learning_rate": 2.521238559444661e-07, + "loss": 0.0, + "step": 29746 + }, + { + "epoch": 12.09719398129321, + "grad_norm": 0.0023906597446088714, + "learning_rate": 2.5189782850481346e-07, + "loss": 0.0, + "step": 29747 + }, + { + "epoch": 12.097600650671005, + "grad_norm": 0.004102413782463155, + "learning_rate": 2.5167190113433603e-07, + "loss": 0.0, + "step": 29748 + }, + { + "epoch": 12.0980073200488, + "grad_norm": 5.633658664520301e-05, + "learning_rate": 2.5144607383535304e-07, + "loss": 0.0, + "step": 29749 + }, + { + "epoch": 12.098413989426597, + "grad_norm": 0.004342541784868292, + "learning_rate": 2.5122034661018256e-07, + "loss": 0.0, + "step": 29750 + }, + { + "epoch": 12.098820658804392, + "grad_norm": 0.000306761820087086, + "learning_rate": 2.5099471946114064e-07, + "loss": 0.0, + "step": 29751 + }, + { + "epoch": 12.099227328182188, + "grad_norm": 0.08087390527447119, + "learning_rate": 2.507691923905453e-07, + "loss": 0.0005, + "step": 29752 + }, + { + "epoch": 12.099633997559984, + "grad_norm": 0.0013506171685968322, + "learning_rate": 2.5054376540071034e-07, + "loss": 0.0, + "step": 29753 + }, + { + "epoch": 12.10004066693778, + "grad_norm": 0.03372964897453069, + "learning_rate": 2.5031843849395164e-07, + "loss": 0.0002, + "step": 29754 + }, + { + "epoch": 12.100447336315575, + "grad_norm": 0.02704823793672123, + "learning_rate": 2.500932116725796e-07, + "loss": 0.0003, + "step": 29755 + }, + { + "epoch": 12.100854005693371, + "grad_norm": 0.00575106129673996, + "learning_rate": 2.49868084938909e-07, + "loss": 0.0001, + "step": 29756 + }, + { + "epoch": 12.101260675071167, + "grad_norm": 0.00877269747769145, + "learning_rate": 2.4964305829524914e-07, + "loss": 0.0001, + "step": 29757 + }, + { + "epoch": 12.101667344448963, + "grad_norm": 0.03760311501076985, + "learning_rate": 2.494181317439104e-07, + "loss": 0.0004, + "step": 29758 + }, + { + "epoch": 12.102074013826758, + "grad_norm": 0.0005207241194398848, + "learning_rate": 2.49193305287202e-07, + "loss": 0.0, + "step": 29759 + }, + { + "epoch": 12.102480683204554, + "grad_norm": 0.0024656239556915373, + "learning_rate": 2.489685789274321e-07, + "loss": 0.0, + "step": 29760 + }, + { + "epoch": 12.10288735258235, + "grad_norm": 0.0891542220565771, + "learning_rate": 2.4874395266690666e-07, + "loss": 0.0006, + "step": 29761 + }, + { + "epoch": 12.103294021960146, + "grad_norm": 1.0298471792539403, + "learning_rate": 2.485194265079327e-07, + "loss": 0.0131, + "step": 29762 + }, + { + "epoch": 12.103700691337941, + "grad_norm": 0.0015733175340558127, + "learning_rate": 2.4829500045281507e-07, + "loss": 0.0, + "step": 29763 + }, + { + "epoch": 12.104107360715737, + "grad_norm": 0.047206429730433694, + "learning_rate": 2.4807067450385525e-07, + "loss": 0.0004, + "step": 29764 + }, + { + "epoch": 12.104514030093535, + "grad_norm": 0.04301579515671359, + "learning_rate": 2.478464486633603e-07, + "loss": 0.0003, + "step": 29765 + }, + { + "epoch": 12.10492069947133, + "grad_norm": 
0.023847496500824968, + "learning_rate": 2.476223229336283e-07, + "loss": 0.0001, + "step": 29766 + }, + { + "epoch": 12.105327368849126, + "grad_norm": 0.00010385660783709429, + "learning_rate": 2.47398297316962e-07, + "loss": 0.0, + "step": 29767 + }, + { + "epoch": 12.105734038226922, + "grad_norm": 0.06291877204958572, + "learning_rate": 2.4717437181566164e-07, + "loss": 0.0007, + "step": 29768 + }, + { + "epoch": 12.106140707604718, + "grad_norm": 0.016658657236147036, + "learning_rate": 2.469505464320232e-07, + "loss": 0.0001, + "step": 29769 + }, + { + "epoch": 12.106547376982514, + "grad_norm": 0.001704789710448602, + "learning_rate": 2.467268211683471e-07, + "loss": 0.0, + "step": 29770 + }, + { + "epoch": 12.10695404636031, + "grad_norm": 0.0019009194425901007, + "learning_rate": 2.4650319602692706e-07, + "loss": 0.0, + "step": 29771 + }, + { + "epoch": 12.107360715738105, + "grad_norm": 0.005346948145735302, + "learning_rate": 2.4627967101006236e-07, + "loss": 0.0, + "step": 29772 + }, + { + "epoch": 12.107767385115901, + "grad_norm": 0.008447714679162334, + "learning_rate": 2.4605624612004554e-07, + "loss": 0.0001, + "step": 29773 + }, + { + "epoch": 12.108174054493697, + "grad_norm": 0.0032303206588258373, + "learning_rate": 2.458329213591704e-07, + "loss": 0.0, + "step": 29774 + }, + { + "epoch": 12.108580723871492, + "grad_norm": 0.0014115966355145832, + "learning_rate": 2.4560969672972947e-07, + "loss": 0.0, + "step": 29775 + }, + { + "epoch": 12.108987393249288, + "grad_norm": 0.006642090349017398, + "learning_rate": 2.453865722340143e-07, + "loss": 0.0001, + "step": 29776 + }, + { + "epoch": 12.109394062627084, + "grad_norm": 0.23005034926948376, + "learning_rate": 2.451635478743142e-07, + "loss": 0.0017, + "step": 29777 + }, + { + "epoch": 12.10980073200488, + "grad_norm": 0.38382561789537156, + "learning_rate": 2.449406236529206e-07, + "loss": 0.0035, + "step": 29778 + }, + { + "epoch": 12.110207401382675, + "grad_norm": 0.07926858852446313, + "learning_rate": 2.447177995721206e-07, + "loss": 0.0007, + "step": 29779 + }, + { + "epoch": 12.110614070760471, + "grad_norm": 0.0010558348376171372, + "learning_rate": 2.444950756342035e-07, + "loss": 0.0, + "step": 29780 + }, + { + "epoch": 12.111020740138267, + "grad_norm": 0.012413885963199625, + "learning_rate": 2.4427245184145407e-07, + "loss": 0.0001, + "step": 29781 + }, + { + "epoch": 12.111427409516063, + "grad_norm": 0.035204777796698845, + "learning_rate": 2.4404992819615724e-07, + "loss": 0.0002, + "step": 29782 + }, + { + "epoch": 12.111834078893859, + "grad_norm": 0.015101982451168637, + "learning_rate": 2.438275047005978e-07, + "loss": 0.0002, + "step": 29783 + }, + { + "epoch": 12.112240748271654, + "grad_norm": 0.005265487933575794, + "learning_rate": 2.4360518135705836e-07, + "loss": 0.0, + "step": 29784 + }, + { + "epoch": 12.112647417649452, + "grad_norm": 0.0006876037891166264, + "learning_rate": 2.4338295816782373e-07, + "loss": 0.0, + "step": 29785 + }, + { + "epoch": 12.113054087027248, + "grad_norm": 0.02271455428746187, + "learning_rate": 2.431608351351733e-07, + "loss": 0.0001, + "step": 29786 + }, + { + "epoch": 12.113460756405043, + "grad_norm": 0.003432982606975688, + "learning_rate": 2.429388122613863e-07, + "loss": 0.0, + "step": 29787 + }, + { + "epoch": 12.11386742578284, + "grad_norm": 0.008284658829490182, + "learning_rate": 2.427168895487442e-07, + "loss": 0.0001, + "step": 29788 + }, + { + "epoch": 12.114274095160635, + "grad_norm": 5.31377411791269e-05, + "learning_rate": 
2.424950669995241e-07, + "loss": 0.0, + "step": 29789 + }, + { + "epoch": 12.11468076453843, + "grad_norm": 0.0013547628884360116, + "learning_rate": 2.422733446160008e-07, + "loss": 0.0, + "step": 29790 + }, + { + "epoch": 12.115087433916226, + "grad_norm": 0.028431736168563166, + "learning_rate": 2.420517224004548e-07, + "loss": 0.0002, + "step": 29791 + }, + { + "epoch": 12.115494103294022, + "grad_norm": 5.932139627133409e-05, + "learning_rate": 2.418302003551576e-07, + "loss": 0.0, + "step": 29792 + }, + { + "epoch": 12.115900772671818, + "grad_norm": 0.14416574232998247, + "learning_rate": 2.416087784823862e-07, + "loss": 0.0016, + "step": 29793 + }, + { + "epoch": 12.116307442049614, + "grad_norm": 0.0005076180610508167, + "learning_rate": 2.41387456784411e-07, + "loss": 0.0, + "step": 29794 + }, + { + "epoch": 12.11671411142741, + "grad_norm": 0.0030958767894054525, + "learning_rate": 2.411662352635047e-07, + "loss": 0.0, + "step": 29795 + }, + { + "epoch": 12.117120780805205, + "grad_norm": 0.021016694037313686, + "learning_rate": 2.409451139219388e-07, + "loss": 0.0002, + "step": 29796 + }, + { + "epoch": 12.117527450183001, + "grad_norm": 0.004544565815357599, + "learning_rate": 2.407240927619825e-07, + "loss": 0.0, + "step": 29797 + }, + { + "epoch": 12.117934119560797, + "grad_norm": 0.0011944453361149969, + "learning_rate": 2.405031717859052e-07, + "loss": 0.0, + "step": 29798 + }, + { + "epoch": 12.118340788938593, + "grad_norm": 0.007924390266059844, + "learning_rate": 2.402823509959751e-07, + "loss": 0.0001, + "step": 29799 + }, + { + "epoch": 12.118747458316388, + "grad_norm": 0.0014104051321445108, + "learning_rate": 2.400616303944592e-07, + "loss": 0.0, + "step": 29800 + }, + { + "epoch": 12.119154127694184, + "grad_norm": 8.487738525362142e-05, + "learning_rate": 2.3984100998362237e-07, + "loss": 0.0, + "step": 29801 + }, + { + "epoch": 12.11956079707198, + "grad_norm": 0.005652732558161139, + "learning_rate": 2.396204897657295e-07, + "loss": 0.0, + "step": 29802 + }, + { + "epoch": 12.119967466449776, + "grad_norm": 0.004561419987018238, + "learning_rate": 2.394000697430432e-07, + "loss": 0.0001, + "step": 29803 + }, + { + "epoch": 12.120374135827571, + "grad_norm": 0.002648174785071367, + "learning_rate": 2.3917974991782943e-07, + "loss": 0.0, + "step": 29804 + }, + { + "epoch": 12.120780805205367, + "grad_norm": 0.001153343864520432, + "learning_rate": 2.3895953029234755e-07, + "loss": 0.0, + "step": 29805 + }, + { + "epoch": 12.121187474583165, + "grad_norm": 0.0006447410687110138, + "learning_rate": 2.38739410868859e-07, + "loss": 0.0, + "step": 29806 + }, + { + "epoch": 12.12159414396096, + "grad_norm": 0.002481911375116113, + "learning_rate": 2.3851939164962314e-07, + "loss": 0.0, + "step": 29807 + }, + { + "epoch": 12.122000813338756, + "grad_norm": 0.0006273149363460745, + "learning_rate": 2.3829947263689813e-07, + "loss": 0.0, + "step": 29808 + }, + { + "epoch": 12.122407482716552, + "grad_norm": 0.000793258004702674, + "learning_rate": 2.3807965383294328e-07, + "loss": 0.0, + "step": 29809 + }, + { + "epoch": 12.122814152094348, + "grad_norm": 0.019703337349737386, + "learning_rate": 2.3785993524001128e-07, + "loss": 0.0002, + "step": 29810 + }, + { + "epoch": 12.123220821472144, + "grad_norm": 0.00015592336465064305, + "learning_rate": 2.3764031686036248e-07, + "loss": 0.0, + "step": 29811 + }, + { + "epoch": 12.12362749084994, + "grad_norm": 0.0008087611299834656, + "learning_rate": 2.3742079869624846e-07, + "loss": 0.0, + "step": 29812 + }, + { + "epoch": 
12.124034160227735, + "grad_norm": 0.0004179785689393545, + "learning_rate": 2.3720138074992293e-07, + "loss": 0.0, + "step": 29813 + }, + { + "epoch": 12.12444082960553, + "grad_norm": 0.01554229164754935, + "learning_rate": 2.369820630236397e-07, + "loss": 0.0001, + "step": 29814 + }, + { + "epoch": 12.124847498983327, + "grad_norm": 0.017234207220054594, + "learning_rate": 2.36762845519648e-07, + "loss": 0.0002, + "step": 29815 + }, + { + "epoch": 12.125254168361122, + "grad_norm": 0.00026539513322684875, + "learning_rate": 2.3654372824019946e-07, + "loss": 0.0, + "step": 29816 + }, + { + "epoch": 12.125660837738918, + "grad_norm": 0.0038262145119714853, + "learning_rate": 2.363247111875444e-07, + "loss": 0.0, + "step": 29817 + }, + { + "epoch": 12.126067507116714, + "grad_norm": 0.0002149687937745746, + "learning_rate": 2.3610579436392999e-07, + "loss": 0.0, + "step": 29818 + }, + { + "epoch": 12.12647417649451, + "grad_norm": 0.00018118795737773514, + "learning_rate": 2.358869777716033e-07, + "loss": 0.0, + "step": 29819 + }, + { + "epoch": 12.126880845872305, + "grad_norm": 0.04676835818905554, + "learning_rate": 2.356682614128103e-07, + "loss": 0.0005, + "step": 29820 + }, + { + "epoch": 12.127287515250101, + "grad_norm": 0.004227815256945844, + "learning_rate": 2.3544964528979807e-07, + "loss": 0.0, + "step": 29821 + }, + { + "epoch": 12.127694184627897, + "grad_norm": 2.887078169007788e-05, + "learning_rate": 2.352311294048093e-07, + "loss": 0.0, + "step": 29822 + }, + { + "epoch": 12.128100854005693, + "grad_norm": 0.00042941958933900214, + "learning_rate": 2.350127137600866e-07, + "loss": 0.0, + "step": 29823 + }, + { + "epoch": 12.128507523383488, + "grad_norm": 0.0033640954957539254, + "learning_rate": 2.3479439835787488e-07, + "loss": 0.0, + "step": 29824 + }, + { + "epoch": 12.128914192761284, + "grad_norm": 0.06055173808856714, + "learning_rate": 2.345761832004123e-07, + "loss": 0.0003, + "step": 29825 + }, + { + "epoch": 12.129320862139082, + "grad_norm": 0.00014116026202654546, + "learning_rate": 2.3435806828994046e-07, + "loss": 0.0, + "step": 29826 + }, + { + "epoch": 12.129727531516878, + "grad_norm": 0.017201526151848313, + "learning_rate": 2.341400536286975e-07, + "loss": 0.0001, + "step": 29827 + }, + { + "epoch": 12.130134200894673, + "grad_norm": 6.744984914535483e-05, + "learning_rate": 2.339221392189217e-07, + "loss": 0.0, + "step": 29828 + }, + { + "epoch": 12.130540870272469, + "grad_norm": 0.011480739512110774, + "learning_rate": 2.3370432506285014e-07, + "loss": 0.0001, + "step": 29829 + }, + { + "epoch": 12.130947539650265, + "grad_norm": 0.0025910272398693016, + "learning_rate": 2.3348661116271987e-07, + "loss": 0.0, + "step": 29830 + }, + { + "epoch": 12.13135420902806, + "grad_norm": 0.0009186296714552559, + "learning_rate": 2.332689975207647e-07, + "loss": 0.0, + "step": 29831 + }, + { + "epoch": 12.131760878405856, + "grad_norm": 0.00947973740829142, + "learning_rate": 2.330514841392184e-07, + "loss": 0.0001, + "step": 29832 + }, + { + "epoch": 12.132167547783652, + "grad_norm": 0.06428468667773705, + "learning_rate": 2.3283407102031473e-07, + "loss": 0.0006, + "step": 29833 + }, + { + "epoch": 12.132574217161448, + "grad_norm": 0.0037485152855588635, + "learning_rate": 2.3261675816628304e-07, + "loss": 0.0, + "step": 29834 + }, + { + "epoch": 12.132980886539244, + "grad_norm": 0.019733246962321573, + "learning_rate": 2.3239954557935818e-07, + "loss": 0.0002, + "step": 29835 + }, + { + "epoch": 12.13338755591704, + "grad_norm": 0.0030333655386316307, + 
"learning_rate": 2.3218243326176726e-07, + "loss": 0.0, + "step": 29836 + }, + { + "epoch": 12.133794225294835, + "grad_norm": 0.0001056316726905622, + "learning_rate": 2.3196542121574074e-07, + "loss": 0.0, + "step": 29837 + }, + { + "epoch": 12.134200894672631, + "grad_norm": 0.004460773117901156, + "learning_rate": 2.3174850944350458e-07, + "loss": 0.0, + "step": 29838 + }, + { + "epoch": 12.134607564050427, + "grad_norm": 0.000669406750525461, + "learning_rate": 2.315316979472859e-07, + "loss": 0.0, + "step": 29839 + }, + { + "epoch": 12.135014233428222, + "grad_norm": 0.006846945594947833, + "learning_rate": 2.3131498672931075e-07, + "loss": 0.0001, + "step": 29840 + }, + { + "epoch": 12.135420902806018, + "grad_norm": 0.00035155490594343335, + "learning_rate": 2.3109837579180394e-07, + "loss": 0.0, + "step": 29841 + }, + { + "epoch": 12.135827572183814, + "grad_norm": 0.035889123341865566, + "learning_rate": 2.3088186513698818e-07, + "loss": 0.0002, + "step": 29842 + }, + { + "epoch": 12.13623424156161, + "grad_norm": 0.015373458514699416, + "learning_rate": 2.3066545476708835e-07, + "loss": 0.0001, + "step": 29843 + }, + { + "epoch": 12.136640910939406, + "grad_norm": 0.03328495748039881, + "learning_rate": 2.304491446843238e-07, + "loss": 0.0002, + "step": 29844 + }, + { + "epoch": 12.137047580317201, + "grad_norm": 0.12219202933553, + "learning_rate": 2.302329348909149e-07, + "loss": 0.0012, + "step": 29845 + }, + { + "epoch": 12.137454249694997, + "grad_norm": 0.00856880154089542, + "learning_rate": 2.3001682538908333e-07, + "loss": 0.0001, + "step": 29846 + }, + { + "epoch": 12.137860919072795, + "grad_norm": 0.1306764996777768, + "learning_rate": 2.298008161810439e-07, + "loss": 0.0012, + "step": 29847 + }, + { + "epoch": 12.13826758845059, + "grad_norm": 0.003732556502705496, + "learning_rate": 2.2958490726901817e-07, + "loss": 0.0, + "step": 29848 + }, + { + "epoch": 12.138674257828386, + "grad_norm": 0.00974295478283299, + "learning_rate": 2.2936909865521993e-07, + "loss": 0.0001, + "step": 29849 + }, + { + "epoch": 12.139080927206182, + "grad_norm": 0.0010844954125389893, + "learning_rate": 2.2915339034186633e-07, + "loss": 0.0, + "step": 29850 + }, + { + "epoch": 12.139487596583978, + "grad_norm": 0.026915895119562806, + "learning_rate": 2.2893778233117004e-07, + "loss": 0.0002, + "step": 29851 + }, + { + "epoch": 12.139894265961773, + "grad_norm": 0.0013595426912480148, + "learning_rate": 2.287222746253459e-07, + "loss": 0.0, + "step": 29852 + }, + { + "epoch": 12.14030093533957, + "grad_norm": 1.5060252086269397e-06, + "learning_rate": 2.285068672266044e-07, + "loss": 0.0, + "step": 29853 + }, + { + "epoch": 12.140707604717365, + "grad_norm": 0.05592456843732458, + "learning_rate": 2.2829156013715714e-07, + "loss": 0.0005, + "step": 29854 + }, + { + "epoch": 12.14111427409516, + "grad_norm": 0.0050473237792917035, + "learning_rate": 2.2807635335921674e-07, + "loss": 0.0, + "step": 29855 + }, + { + "epoch": 12.141520943472957, + "grad_norm": 0.006378477576874477, + "learning_rate": 2.2786124689498922e-07, + "loss": 0.0001, + "step": 29856 + }, + { + "epoch": 12.141927612850752, + "grad_norm": 0.04484232489839596, + "learning_rate": 2.2764624074668506e-07, + "loss": 0.0004, + "step": 29857 + }, + { + "epoch": 12.142334282228548, + "grad_norm": 0.0008136090651407677, + "learning_rate": 2.2743133491651137e-07, + "loss": 0.0, + "step": 29858 + }, + { + "epoch": 12.142740951606344, + "grad_norm": 0.002384914177615594, + "learning_rate": 2.2721652940667193e-07, + "loss": 0.0, + 
"step": 29859 + }, + { + "epoch": 12.14314762098414, + "grad_norm": 0.012188245834299116, + "learning_rate": 2.270018242193739e-07, + "loss": 0.0001, + "step": 29860 + }, + { + "epoch": 12.143554290361935, + "grad_norm": 0.02430877143091606, + "learning_rate": 2.267872193568199e-07, + "loss": 0.0003, + "step": 29861 + }, + { + "epoch": 12.143960959739731, + "grad_norm": 0.010658641761105273, + "learning_rate": 2.265727148212149e-07, + "loss": 0.0001, + "step": 29862 + }, + { + "epoch": 12.144367629117527, + "grad_norm": 0.001346168602417484, + "learning_rate": 2.2635831061476043e-07, + "loss": 0.0, + "step": 29863 + }, + { + "epoch": 12.144774298495323, + "grad_norm": 0.00016967622313484508, + "learning_rate": 2.2614400673965586e-07, + "loss": 0.0, + "step": 29864 + }, + { + "epoch": 12.145180967873118, + "grad_norm": 0.0029149590143270343, + "learning_rate": 2.2592980319810275e-07, + "loss": 0.0, + "step": 29865 + }, + { + "epoch": 12.145587637250914, + "grad_norm": 0.006113873425704841, + "learning_rate": 2.2571569999229936e-07, + "loss": 0.0, + "step": 29866 + }, + { + "epoch": 12.145994306628712, + "grad_norm": 0.004880749760051265, + "learning_rate": 2.2550169712444281e-07, + "loss": 0.0, + "step": 29867 + }, + { + "epoch": 12.146400976006507, + "grad_norm": 5.9306198475920985e-05, + "learning_rate": 2.2528779459673134e-07, + "loss": 0.0, + "step": 29868 + }, + { + "epoch": 12.146807645384303, + "grad_norm": 6.327248063422292e-05, + "learning_rate": 2.2507399241135985e-07, + "loss": 0.0, + "step": 29869 + }, + { + "epoch": 12.147214314762099, + "grad_norm": 0.0478534020619321, + "learning_rate": 2.248602905705244e-07, + "loss": 0.0005, + "step": 29870 + }, + { + "epoch": 12.147620984139895, + "grad_norm": 0.003312040912039025, + "learning_rate": 2.246466890764165e-07, + "loss": 0.0, + "step": 29871 + }, + { + "epoch": 12.14802765351769, + "grad_norm": 0.0003754241422074546, + "learning_rate": 2.2443318793123114e-07, + "loss": 0.0, + "step": 29872 + }, + { + "epoch": 12.148434322895486, + "grad_norm": 0.0008576641659270796, + "learning_rate": 2.2421978713715875e-07, + "loss": 0.0, + "step": 29873 + }, + { + "epoch": 12.148840992273282, + "grad_norm": 0.08256206787980631, + "learning_rate": 2.240064866963898e-07, + "loss": 0.0007, + "step": 29874 + }, + { + "epoch": 12.149247661651078, + "grad_norm": 0.0541024079068413, + "learning_rate": 2.2379328661111477e-07, + "loss": 0.0004, + "step": 29875 + }, + { + "epoch": 12.149654331028874, + "grad_norm": 0.002428348692644514, + "learning_rate": 2.235801868835219e-07, + "loss": 0.0, + "step": 29876 + }, + { + "epoch": 12.15006100040667, + "grad_norm": 0.04356624822499804, + "learning_rate": 2.2336718751579834e-07, + "loss": 0.0004, + "step": 29877 + }, + { + "epoch": 12.150467669784465, + "grad_norm": 0.0032686331797132677, + "learning_rate": 2.2315428851013232e-07, + "loss": 0.0, + "step": 29878 + }, + { + "epoch": 12.150874339162261, + "grad_norm": 0.024068120019543338, + "learning_rate": 2.2294148986870768e-07, + "loss": 0.0002, + "step": 29879 + }, + { + "epoch": 12.151281008540057, + "grad_norm": 0.016068676282941615, + "learning_rate": 2.227287915937082e-07, + "loss": 0.0001, + "step": 29880 + }, + { + "epoch": 12.151687677917852, + "grad_norm": 0.00784978033210742, + "learning_rate": 2.2251619368731881e-07, + "loss": 0.0001, + "step": 29881 + }, + { + "epoch": 12.152094347295648, + "grad_norm": 0.0011572125675440087, + "learning_rate": 2.223036961517222e-07, + "loss": 0.0, + "step": 29882 + }, + { + "epoch": 12.152501016673444, + 
"grad_norm": 0.06482461696241923, + "learning_rate": 2.2209129898909887e-07, + "loss": 0.0005, + "step": 29883 + }, + { + "epoch": 12.15290768605124, + "grad_norm": 0.0005190710376245394, + "learning_rate": 2.218790022016304e-07, + "loss": 0.0, + "step": 29884 + }, + { + "epoch": 12.153314355429035, + "grad_norm": 0.0032083185109263467, + "learning_rate": 2.216668057914939e-07, + "loss": 0.0, + "step": 29885 + }, + { + "epoch": 12.153721024806831, + "grad_norm": 0.0005195218153528688, + "learning_rate": 2.2145470976086992e-07, + "loss": 0.0, + "step": 29886 + }, + { + "epoch": 12.154127694184627, + "grad_norm": 0.0009913339688983212, + "learning_rate": 2.2124271411193333e-07, + "loss": 0.0, + "step": 29887 + }, + { + "epoch": 12.154534363562425, + "grad_norm": 0.002401793791558197, + "learning_rate": 2.210308188468635e-07, + "loss": 0.0, + "step": 29888 + }, + { + "epoch": 12.15494103294022, + "grad_norm": 0.000697364123367722, + "learning_rate": 2.2081902396783318e-07, + "loss": 0.0, + "step": 29889 + }, + { + "epoch": 12.155347702318016, + "grad_norm": 0.019840940053997204, + "learning_rate": 2.2060732947701834e-07, + "loss": 0.0002, + "step": 29890 + }, + { + "epoch": 12.155754371695812, + "grad_norm": 0.011131728083017794, + "learning_rate": 2.2039573537658953e-07, + "loss": 0.0001, + "step": 29891 + }, + { + "epoch": 12.156161041073608, + "grad_norm": 0.007879054466103935, + "learning_rate": 2.2018424166872164e-07, + "loss": 0.0, + "step": 29892 + }, + { + "epoch": 12.156567710451403, + "grad_norm": 0.0043710331081355394, + "learning_rate": 2.1997284835558296e-07, + "loss": 0.0, + "step": 29893 + }, + { + "epoch": 12.1569743798292, + "grad_norm": 0.0025276094000833203, + "learning_rate": 2.197615554393473e-07, + "loss": 0.0, + "step": 29894 + }, + { + "epoch": 12.157381049206995, + "grad_norm": 0.002772837488732813, + "learning_rate": 2.1955036292218067e-07, + "loss": 0.0, + "step": 29895 + }, + { + "epoch": 12.15778771858479, + "grad_norm": 0.0003568118948430947, + "learning_rate": 2.1933927080625138e-07, + "loss": 0.0, + "step": 29896 + }, + { + "epoch": 12.158194387962586, + "grad_norm": 0.00020062131998247316, + "learning_rate": 2.191282790937277e-07, + "loss": 0.0, + "step": 29897 + }, + { + "epoch": 12.158601057340382, + "grad_norm": 0.005739844350448295, + "learning_rate": 2.1891738778677563e-07, + "loss": 0.0001, + "step": 29898 + }, + { + "epoch": 12.159007726718178, + "grad_norm": 0.000175765717372719, + "learning_rate": 2.187065968875579e-07, + "loss": 0.0, + "step": 29899 + }, + { + "epoch": 12.159414396095974, + "grad_norm": 0.0023810215051651815, + "learning_rate": 2.1849590639823948e-07, + "loss": 0.0, + "step": 29900 + }, + { + "epoch": 12.15982106547377, + "grad_norm": 0.09303110483408206, + "learning_rate": 2.1828531632098415e-07, + "loss": 0.0008, + "step": 29901 + }, + { + "epoch": 12.160227734851565, + "grad_norm": 4.053993894171302e-05, + "learning_rate": 2.1807482665795354e-07, + "loss": 0.0, + "step": 29902 + }, + { + "epoch": 12.160634404229361, + "grad_norm": 0.007849597955945933, + "learning_rate": 2.1786443741130813e-07, + "loss": 0.0001, + "step": 29903 + }, + { + "epoch": 12.161041073607157, + "grad_norm": 0.0021608518988053957, + "learning_rate": 2.1765414858320734e-07, + "loss": 0.0, + "step": 29904 + }, + { + "epoch": 12.161447742984953, + "grad_norm": 0.0022382529043361177, + "learning_rate": 2.1744396017580938e-07, + "loss": 0.0, + "step": 29905 + }, + { + "epoch": 12.161854412362748, + "grad_norm": 0.1072597390886533, + "learning_rate": 
2.1723387219127257e-07, + "loss": 0.0009, + "step": 29906 + }, + { + "epoch": 12.162261081740544, + "grad_norm": 0.004076642883844703, + "learning_rate": 2.1702388463175405e-07, + "loss": 0.0, + "step": 29907 + }, + { + "epoch": 12.162667751118342, + "grad_norm": 0.32598291709717886, + "learning_rate": 2.168139974994088e-07, + "loss": 0.003, + "step": 29908 + }, + { + "epoch": 12.163074420496137, + "grad_norm": 0.0014251733368405661, + "learning_rate": 2.166042107963917e-07, + "loss": 0.0, + "step": 29909 + }, + { + "epoch": 12.163481089873933, + "grad_norm": 0.0015851652907804404, + "learning_rate": 2.1639452452485665e-07, + "loss": 0.0, + "step": 29910 + }, + { + "epoch": 12.163887759251729, + "grad_norm": 0.0011726724976729205, + "learning_rate": 2.1618493868695633e-07, + "loss": 0.0, + "step": 29911 + }, + { + "epoch": 12.164294428629525, + "grad_norm": 0.0055693214187515425, + "learning_rate": 2.1597545328484016e-07, + "loss": 0.0001, + "step": 29912 + }, + { + "epoch": 12.16470109800732, + "grad_norm": 0.0001041263763405184, + "learning_rate": 2.1576606832066082e-07, + "loss": 0.0, + "step": 29913 + }, + { + "epoch": 12.165107767385116, + "grad_norm": 0.010176514363632833, + "learning_rate": 2.1555678379656663e-07, + "loss": 0.0001, + "step": 29914 + }, + { + "epoch": 12.165514436762912, + "grad_norm": 0.005896135666789879, + "learning_rate": 2.1534759971470698e-07, + "loss": 0.0, + "step": 29915 + }, + { + "epoch": 12.165921106140708, + "grad_norm": 0.21764286849571443, + "learning_rate": 2.1513851607722902e-07, + "loss": 0.0019, + "step": 29916 + }, + { + "epoch": 12.166327775518504, + "grad_norm": 0.0002831653178388799, + "learning_rate": 2.1492953288627882e-07, + "loss": 0.0, + "step": 29917 + }, + { + "epoch": 12.1667344448963, + "grad_norm": 0.09584787760530789, + "learning_rate": 2.1472065014400135e-07, + "loss": 0.0009, + "step": 29918 + }, + { + "epoch": 12.167141114274095, + "grad_norm": 0.0008481992096001294, + "learning_rate": 2.1451186785254152e-07, + "loss": 0.0, + "step": 29919 + }, + { + "epoch": 12.16754778365189, + "grad_norm": 0.033688350787954895, + "learning_rate": 2.143031860140421e-07, + "loss": 0.0003, + "step": 29920 + }, + { + "epoch": 12.167954453029687, + "grad_norm": 0.0011229389102723477, + "learning_rate": 2.1409460463064579e-07, + "loss": 0.0, + "step": 29921 + }, + { + "epoch": 12.168361122407482, + "grad_norm": 0.000294673065460941, + "learning_rate": 2.1388612370449314e-07, + "loss": 0.0, + "step": 29922 + }, + { + "epoch": 12.168767791785278, + "grad_norm": 0.004851623277046909, + "learning_rate": 2.136777432377246e-07, + "loss": 0.0, + "step": 29923 + }, + { + "epoch": 12.169174461163074, + "grad_norm": 0.008379900077579055, + "learning_rate": 2.1346946323247963e-07, + "loss": 0.0001, + "step": 29924 + }, + { + "epoch": 12.16958113054087, + "grad_norm": 0.02625191881835448, + "learning_rate": 2.132612836908965e-07, + "loss": 0.0003, + "step": 29925 + }, + { + "epoch": 12.169987799918665, + "grad_norm": 0.0077934910666406934, + "learning_rate": 2.1305320461511124e-07, + "loss": 0.0001, + "step": 29926 + }, + { + "epoch": 12.170394469296461, + "grad_norm": 0.07600775112316459, + "learning_rate": 2.128452260072611e-07, + "loss": 0.0005, + "step": 29927 + }, + { + "epoch": 12.170801138674257, + "grad_norm": 0.00857218374100482, + "learning_rate": 2.1263734786947986e-07, + "loss": 0.0001, + "step": 29928 + }, + { + "epoch": 12.171207808052054, + "grad_norm": 0.014864965587791124, + "learning_rate": 2.1242957020390254e-07, + "loss": 0.0001, + "step": 
29929 + }, + { + "epoch": 12.17161447742985, + "grad_norm": 0.026643803130450346, + "learning_rate": 2.1222189301266072e-07, + "loss": 0.0001, + "step": 29930 + }, + { + "epoch": 12.172021146807646, + "grad_norm": 0.2180923159135507, + "learning_rate": 2.1201431629788716e-07, + "loss": 0.0016, + "step": 29931 + }, + { + "epoch": 12.172427816185442, + "grad_norm": 0.010152609641967967, + "learning_rate": 2.1180684006171348e-07, + "loss": 0.0001, + "step": 29932 + }, + { + "epoch": 12.172834485563238, + "grad_norm": 2.735350732560042e-05, + "learning_rate": 2.115994643062702e-07, + "loss": 0.0, + "step": 29933 + }, + { + "epoch": 12.173241154941033, + "grad_norm": 1.5680245545936224, + "learning_rate": 2.1139218903368342e-07, + "loss": 0.0142, + "step": 29934 + }, + { + "epoch": 12.173647824318829, + "grad_norm": 0.22048582643197523, + "learning_rate": 2.1118501424608252e-07, + "loss": 0.0014, + "step": 29935 + }, + { + "epoch": 12.174054493696625, + "grad_norm": 9.647014833612819e-05, + "learning_rate": 2.1097793994559356e-07, + "loss": 0.0, + "step": 29936 + }, + { + "epoch": 12.17446116307442, + "grad_norm": 0.0008048661462414206, + "learning_rate": 2.1077096613434267e-07, + "loss": 0.0, + "step": 29937 + }, + { + "epoch": 12.174867832452216, + "grad_norm": 0.02005950602695615, + "learning_rate": 2.105640928144559e-07, + "loss": 0.0001, + "step": 29938 + }, + { + "epoch": 12.175274501830012, + "grad_norm": 0.013849934024015754, + "learning_rate": 2.1035731998805486e-07, + "loss": 0.0001, + "step": 29939 + }, + { + "epoch": 12.175681171207808, + "grad_norm": 0.0019708213666529257, + "learning_rate": 2.1015064765726345e-07, + "loss": 0.0, + "step": 29940 + }, + { + "epoch": 12.176087840585604, + "grad_norm": 0.045165637695235135, + "learning_rate": 2.0994407582420218e-07, + "loss": 0.0005, + "step": 29941 + }, + { + "epoch": 12.1764945099634, + "grad_norm": 0.00011927903429681392, + "learning_rate": 2.0973760449099267e-07, + "loss": 0.0, + "step": 29942 + }, + { + "epoch": 12.176901179341195, + "grad_norm": 0.00016490689362889398, + "learning_rate": 2.0953123365975324e-07, + "loss": 0.0, + "step": 29943 + }, + { + "epoch": 12.177307848718991, + "grad_norm": 0.024712803620198846, + "learning_rate": 2.0932496333260332e-07, + "loss": 0.0002, + "step": 29944 + }, + { + "epoch": 12.177714518096787, + "grad_norm": 0.0007180850626614878, + "learning_rate": 2.091187935116601e-07, + "loss": 0.0, + "step": 29945 + }, + { + "epoch": 12.178121187474582, + "grad_norm": 0.00013512408430477229, + "learning_rate": 2.0891272419904074e-07, + "loss": 0.0, + "step": 29946 + }, + { + "epoch": 12.178527856852378, + "grad_norm": 0.01406813501216462, + "learning_rate": 2.0870675539686024e-07, + "loss": 0.0001, + "step": 29947 + }, + { + "epoch": 12.178934526230174, + "grad_norm": 0.00036719629569507685, + "learning_rate": 2.0850088710723247e-07, + "loss": 0.0, + "step": 29948 + }, + { + "epoch": 12.179341195607972, + "grad_norm": 0.03285588014541481, + "learning_rate": 2.082951193322702e-07, + "loss": 0.0002, + "step": 29949 + }, + { + "epoch": 12.179747864985767, + "grad_norm": 1.0220984127389768e-06, + "learning_rate": 2.0808945207408725e-07, + "loss": 0.0, + "step": 29950 + }, + { + "epoch": 12.180154534363563, + "grad_norm": 0.011974839782048335, + "learning_rate": 2.078838853347942e-07, + "loss": 0.0001, + "step": 29951 + }, + { + "epoch": 12.180561203741359, + "grad_norm": 0.003918912368498006, + "learning_rate": 2.0767841911650155e-07, + "loss": 0.0, + "step": 29952 + }, + { + "epoch": 12.180967873119155, + 
"grad_norm": 0.005503767951211314, + "learning_rate": 2.0747305342131763e-07, + "loss": 0.0001, + "step": 29953 + }, + { + "epoch": 12.18137454249695, + "grad_norm": 0.003471460759944813, + "learning_rate": 2.0726778825135186e-07, + "loss": 0.0, + "step": 29954 + }, + { + "epoch": 12.181781211874746, + "grad_norm": 0.008359972400911336, + "learning_rate": 2.0706262360871143e-07, + "loss": 0.0001, + "step": 29955 + }, + { + "epoch": 12.182187881252542, + "grad_norm": 0.007771561452016615, + "learning_rate": 2.0685755949550136e-07, + "loss": 0.0001, + "step": 29956 + }, + { + "epoch": 12.182594550630338, + "grad_norm": 5.9425869540644364e-05, + "learning_rate": 2.0665259591382658e-07, + "loss": 0.0, + "step": 29957 + }, + { + "epoch": 12.183001220008133, + "grad_norm": 0.003070275760938646, + "learning_rate": 2.064477328657921e-07, + "loss": 0.0, + "step": 29958 + }, + { + "epoch": 12.18340788938593, + "grad_norm": 0.12896104321695115, + "learning_rate": 2.062429703535007e-07, + "loss": 0.0013, + "step": 29959 + }, + { + "epoch": 12.183814558763725, + "grad_norm": 0.0059861355620477305, + "learning_rate": 2.0603830837905403e-07, + "loss": 0.0001, + "step": 29960 + }, + { + "epoch": 12.18422122814152, + "grad_norm": 0.0014532499175455118, + "learning_rate": 2.0583374694455484e-07, + "loss": 0.0, + "step": 29961 + }, + { + "epoch": 12.184627897519317, + "grad_norm": 0.0006633430459637441, + "learning_rate": 2.056292860521003e-07, + "loss": 0.0, + "step": 29962 + }, + { + "epoch": 12.185034566897112, + "grad_norm": 8.337372704923384e-05, + "learning_rate": 2.054249257037899e-07, + "loss": 0.0, + "step": 29963 + }, + { + "epoch": 12.185441236274908, + "grad_norm": 0.041385445479234284, + "learning_rate": 2.0522066590172307e-07, + "loss": 0.0004, + "step": 29964 + }, + { + "epoch": 12.185847905652704, + "grad_norm": 0.018926965488011906, + "learning_rate": 2.0501650664799476e-07, + "loss": 0.0002, + "step": 29965 + }, + { + "epoch": 12.1862545750305, + "grad_norm": 0.0003596605536457335, + "learning_rate": 2.0481244794470334e-07, + "loss": 0.0, + "step": 29966 + }, + { + "epoch": 12.186661244408295, + "grad_norm": 0.0089936941455781, + "learning_rate": 2.0460848979394045e-07, + "loss": 0.0001, + "step": 29967 + }, + { + "epoch": 12.187067913786091, + "grad_norm": 6.527660915582103e-05, + "learning_rate": 2.0440463219780215e-07, + "loss": 0.0, + "step": 29968 + }, + { + "epoch": 12.187474583163887, + "grad_norm": 0.0015162899564317071, + "learning_rate": 2.0420087515838017e-07, + "loss": 0.0, + "step": 29969 + }, + { + "epoch": 12.187881252541684, + "grad_norm": 0.00040153998220583025, + "learning_rate": 2.039972186777661e-07, + "loss": 0.0, + "step": 29970 + }, + { + "epoch": 12.18828792191948, + "grad_norm": 0.028117634520396687, + "learning_rate": 2.0379366275805057e-07, + "loss": 0.0004, + "step": 29971 + }, + { + "epoch": 12.188694591297276, + "grad_norm": 0.04119171251891687, + "learning_rate": 2.0359020740132407e-07, + "loss": 0.0003, + "step": 29972 + }, + { + "epoch": 12.189101260675072, + "grad_norm": 0.016382298072206102, + "learning_rate": 2.0338685260967495e-07, + "loss": 0.0001, + "step": 29973 + }, + { + "epoch": 12.189507930052867, + "grad_norm": 0.0039035238181936833, + "learning_rate": 2.0318359838518932e-07, + "loss": 0.0, + "step": 29974 + }, + { + "epoch": 12.189914599430663, + "grad_norm": 0.0007880788383022192, + "learning_rate": 2.0298044472995548e-07, + "loss": 0.0, + "step": 29975 + }, + { + "epoch": 12.190321268808459, + "grad_norm": 0.005461355984513524, + 
"learning_rate": 2.0277739164605737e-07, + "loss": 0.0, + "step": 29976 + }, + { + "epoch": 12.190727938186255, + "grad_norm": 0.0018191428433878096, + "learning_rate": 2.0257443913557996e-07, + "loss": 0.0, + "step": 29977 + }, + { + "epoch": 12.19113460756405, + "grad_norm": 0.0013437647854015692, + "learning_rate": 2.0237158720060712e-07, + "loss": 0.0, + "step": 29978 + }, + { + "epoch": 12.191541276941846, + "grad_norm": 0.06821495263694609, + "learning_rate": 2.0216883584322167e-07, + "loss": 0.0004, + "step": 29979 + }, + { + "epoch": 12.191947946319642, + "grad_norm": 0.013330670928785995, + "learning_rate": 2.0196618506550413e-07, + "loss": 0.0001, + "step": 29980 + }, + { + "epoch": 12.192354615697438, + "grad_norm": 0.010316277105336205, + "learning_rate": 2.017636348695351e-07, + "loss": 0.0001, + "step": 29981 + }, + { + "epoch": 12.192761285075234, + "grad_norm": 8.024625212003106e-05, + "learning_rate": 2.0156118525739287e-07, + "loss": 0.0, + "step": 29982 + }, + { + "epoch": 12.19316795445303, + "grad_norm": 0.010537600315227876, + "learning_rate": 2.0135883623115692e-07, + "loss": 0.0, + "step": 29983 + }, + { + "epoch": 12.193574623830825, + "grad_norm": 9.312020668094618e-05, + "learning_rate": 2.0115658779290338e-07, + "loss": 0.0, + "step": 29984 + }, + { + "epoch": 12.193981293208621, + "grad_norm": 0.003968351721662788, + "learning_rate": 2.0095443994471053e-07, + "loss": 0.0, + "step": 29985 + }, + { + "epoch": 12.194387962586417, + "grad_norm": 0.01098864923366164, + "learning_rate": 2.0075239268865122e-07, + "loss": 0.0001, + "step": 29986 + }, + { + "epoch": 12.194794631964212, + "grad_norm": 0.1058150238281516, + "learning_rate": 2.005504460268004e-07, + "loss": 0.0011, + "step": 29987 + }, + { + "epoch": 12.195201301342008, + "grad_norm": 0.07161189254511066, + "learning_rate": 2.00348599961232e-07, + "loss": 0.0006, + "step": 29988 + }, + { + "epoch": 12.195607970719804, + "grad_norm": 0.347282024203696, + "learning_rate": 2.001468544940166e-07, + "loss": 0.0026, + "step": 29989 + }, + { + "epoch": 12.196014640097602, + "grad_norm": 0.0015824214194223986, + "learning_rate": 1.9994520962722586e-07, + "loss": 0.0, + "step": 29990 + }, + { + "epoch": 12.196421309475397, + "grad_norm": 0.037258575358548644, + "learning_rate": 1.997436653629292e-07, + "loss": 0.0002, + "step": 29991 + }, + { + "epoch": 12.196827978853193, + "grad_norm": 2.0611469486926355e-05, + "learning_rate": 1.9954222170319726e-07, + "loss": 0.0, + "step": 29992 + }, + { + "epoch": 12.197234648230989, + "grad_norm": 0.038730525311653725, + "learning_rate": 1.9934087865009722e-07, + "loss": 0.0002, + "step": 29993 + }, + { + "epoch": 12.197641317608785, + "grad_norm": 0.0007538443679222769, + "learning_rate": 1.991396362056952e-07, + "loss": 0.0, + "step": 29994 + }, + { + "epoch": 12.19804798698658, + "grad_norm": 0.04284575797071799, + "learning_rate": 1.9893849437205737e-07, + "loss": 0.0003, + "step": 29995 + }, + { + "epoch": 12.198454656364376, + "grad_norm": 0.015163547771944341, + "learning_rate": 1.987374531512476e-07, + "loss": 0.0001, + "step": 29996 + }, + { + "epoch": 12.198861325742172, + "grad_norm": 0.00183483381642396, + "learning_rate": 1.9853651254533202e-07, + "loss": 0.0, + "step": 29997 + }, + { + "epoch": 12.199267995119968, + "grad_norm": 0.006439062905075654, + "learning_rate": 1.983356725563712e-07, + "loss": 0.0001, + "step": 29998 + }, + { + "epoch": 12.199674664497763, + "grad_norm": 0.008830888435754306, + "learning_rate": 1.9813493318642906e-07, + "loss": 0.0001, 
+ "step": 29999 + }, + { + "epoch": 12.20008133387556, + "grad_norm": 0.0050798372376530615, + "learning_rate": 1.9793429443756394e-07, + "loss": 0.0001, + "step": 30000 + }, + { + "epoch": 12.200488003253355, + "grad_norm": 0.0006742622811492739, + "learning_rate": 1.9773375631183757e-07, + "loss": 0.0, + "step": 30001 + }, + { + "epoch": 12.20089467263115, + "grad_norm": 0.011175256534379362, + "learning_rate": 1.975333188113071e-07, + "loss": 0.0001, + "step": 30002 + }, + { + "epoch": 12.201301342008946, + "grad_norm": 0.0006160914048211698, + "learning_rate": 1.9733298193802985e-07, + "loss": 0.0, + "step": 30003 + }, + { + "epoch": 12.201708011386742, + "grad_norm": 0.1131777552090638, + "learning_rate": 1.9713274569406414e-07, + "loss": 0.001, + "step": 30004 + }, + { + "epoch": 12.202114680764538, + "grad_norm": 0.029175142753687523, + "learning_rate": 1.9693261008146392e-07, + "loss": 0.0003, + "step": 30005 + }, + { + "epoch": 12.202521350142334, + "grad_norm": 0.01697329949490758, + "learning_rate": 1.9673257510228416e-07, + "loss": 0.0001, + "step": 30006 + }, + { + "epoch": 12.20292801952013, + "grad_norm": 0.15583768539175835, + "learning_rate": 1.965326407585788e-07, + "loss": 0.0015, + "step": 30007 + }, + { + "epoch": 12.203334688897925, + "grad_norm": 0.0020623078455645036, + "learning_rate": 1.9633280705239955e-07, + "loss": 0.0, + "step": 30008 + }, + { + "epoch": 12.203741358275721, + "grad_norm": 0.0003552720189980221, + "learning_rate": 1.9613307398579807e-07, + "loss": 0.0, + "step": 30009 + }, + { + "epoch": 12.204148027653517, + "grad_norm": 0.0022364017460710004, + "learning_rate": 1.9593344156082493e-07, + "loss": 0.0, + "step": 30010 + }, + { + "epoch": 12.204554697031314, + "grad_norm": 0.0034254060402516867, + "learning_rate": 1.9573390977952967e-07, + "loss": 0.0, + "step": 30011 + }, + { + "epoch": 12.20496136640911, + "grad_norm": 0.0030351155362921315, + "learning_rate": 1.9553447864395947e-07, + "loss": 0.0, + "step": 30012 + }, + { + "epoch": 12.205368035786906, + "grad_norm": 0.025420489936229355, + "learning_rate": 1.9533514815616273e-07, + "loss": 0.0003, + "step": 30013 + }, + { + "epoch": 12.205774705164702, + "grad_norm": 0.0728939216925721, + "learning_rate": 1.9513591831818557e-07, + "loss": 0.0007, + "step": 30014 + }, + { + "epoch": 12.206181374542497, + "grad_norm": 0.0017286841506721515, + "learning_rate": 1.9493678913207303e-07, + "loss": 0.0, + "step": 30015 + }, + { + "epoch": 12.206588043920293, + "grad_norm": 0.00975477872431043, + "learning_rate": 1.9473776059986792e-07, + "loss": 0.0001, + "step": 30016 + }, + { + "epoch": 12.206994713298089, + "grad_norm": 0.026280550942824388, + "learning_rate": 1.945388327236153e-07, + "loss": 0.0002, + "step": 30017 + }, + { + "epoch": 12.207401382675885, + "grad_norm": 0.009865706671761284, + "learning_rate": 1.9434000550535682e-07, + "loss": 0.0, + "step": 30018 + }, + { + "epoch": 12.20780805205368, + "grad_norm": 0.0006186800945320469, + "learning_rate": 1.9414127894713307e-07, + "loss": 0.0, + "step": 30019 + }, + { + "epoch": 12.208214721431476, + "grad_norm": 0.029211604095304464, + "learning_rate": 1.939426530509847e-07, + "loss": 0.0003, + "step": 30020 + }, + { + "epoch": 12.208621390809272, + "grad_norm": 0.00032442827875326903, + "learning_rate": 1.937441278189489e-07, + "loss": 0.0, + "step": 30021 + }, + { + "epoch": 12.209028060187068, + "grad_norm": 0.0008128465620389298, + "learning_rate": 1.9354570325306521e-07, + "loss": 0.0, + "step": 30022 + }, + { + "epoch": 
12.209434729564864, + "grad_norm": 0.00010673227086310324, + "learning_rate": 1.9334737935537306e-07, + "loss": 0.0, + "step": 30023 + }, + { + "epoch": 12.20984139894266, + "grad_norm": 0.18682833258814321, + "learning_rate": 1.9314915612790307e-07, + "loss": 0.0016, + "step": 30024 + }, + { + "epoch": 12.210248068320455, + "grad_norm": 0.001317360673953191, + "learning_rate": 1.9295103357269362e-07, + "loss": 0.0, + "step": 30025 + }, + { + "epoch": 12.21065473769825, + "grad_norm": 0.010934715028189096, + "learning_rate": 1.9275301169177752e-07, + "loss": 0.0001, + "step": 30026 + }, + { + "epoch": 12.211061407076047, + "grad_norm": 0.008494414729500058, + "learning_rate": 1.9255509048718646e-07, + "loss": 0.0001, + "step": 30027 + }, + { + "epoch": 12.211468076453842, + "grad_norm": 6.469545550019888e-05, + "learning_rate": 1.9235726996095438e-07, + "loss": 0.0, + "step": 30028 + }, + { + "epoch": 12.211874745831638, + "grad_norm": 0.00018511364491521538, + "learning_rate": 1.9215955011511078e-07, + "loss": 0.0, + "step": 30029 + }, + { + "epoch": 12.212281415209434, + "grad_norm": 8.67166393684307e-05, + "learning_rate": 1.9196193095168624e-07, + "loss": 0.0, + "step": 30030 + }, + { + "epoch": 12.212688084587231, + "grad_norm": 0.0005272894650276161, + "learning_rate": 1.9176441247270805e-07, + "loss": 0.0, + "step": 30031 + }, + { + "epoch": 12.213094753965027, + "grad_norm": 0.001737137250557536, + "learning_rate": 1.9156699468020567e-07, + "loss": 0.0, + "step": 30032 + }, + { + "epoch": 12.213501423342823, + "grad_norm": 0.010165148005629376, + "learning_rate": 1.9136967757620305e-07, + "loss": 0.0001, + "step": 30033 + }, + { + "epoch": 12.213908092720619, + "grad_norm": 0.0005362871801122404, + "learning_rate": 1.9117246116272746e-07, + "loss": 0.0, + "step": 30034 + }, + { + "epoch": 12.214314762098414, + "grad_norm": 0.022472639027422178, + "learning_rate": 1.9097534544180395e-07, + "loss": 0.0003, + "step": 30035 + }, + { + "epoch": 12.21472143147621, + "grad_norm": 0.00514948300236467, + "learning_rate": 1.9077833041545534e-07, + "loss": 0.0001, + "step": 30036 + }, + { + "epoch": 12.215128100854006, + "grad_norm": 5.333367003336771e-06, + "learning_rate": 1.9058141608570335e-07, + "loss": 0.0, + "step": 30037 + }, + { + "epoch": 12.215534770231802, + "grad_norm": 0.0028275695740265553, + "learning_rate": 1.903846024545708e-07, + "loss": 0.0, + "step": 30038 + }, + { + "epoch": 12.215941439609598, + "grad_norm": 0.0012537861797851794, + "learning_rate": 1.9018788952407718e-07, + "loss": 0.0, + "step": 30039 + }, + { + "epoch": 12.216348108987393, + "grad_norm": 0.004527799409678587, + "learning_rate": 1.899912772962409e-07, + "loss": 0.0, + "step": 30040 + }, + { + "epoch": 12.216754778365189, + "grad_norm": 0.013250317748385806, + "learning_rate": 1.8979476577308253e-07, + "loss": 0.0001, + "step": 30041 + }, + { + "epoch": 12.217161447742985, + "grad_norm": 0.002004126605282496, + "learning_rate": 1.8959835495661827e-07, + "loss": 0.0, + "step": 30042 + }, + { + "epoch": 12.21756811712078, + "grad_norm": 0.00023044758080314042, + "learning_rate": 1.8940204484886425e-07, + "loss": 0.0, + "step": 30043 + }, + { + "epoch": 12.217974786498576, + "grad_norm": 0.0020511578744261613, + "learning_rate": 1.892058354518367e-07, + "loss": 0.0, + "step": 30044 + }, + { + "epoch": 12.218381455876372, + "grad_norm": 6.887954726690405e-05, + "learning_rate": 1.8900972676754725e-07, + "loss": 0.0, + "step": 30045 + }, + { + "epoch": 12.218788125254168, + "grad_norm": 0.0083479227963433, 
+ "learning_rate": 1.8881371879801213e-07, + "loss": 0.0001, + "step": 30046 + }, + { + "epoch": 12.219194794631964, + "grad_norm": 0.0003352279258081813, + "learning_rate": 1.8861781154524083e-07, + "loss": 0.0, + "step": 30047 + }, + { + "epoch": 12.21960146400976, + "grad_norm": 0.0013165736997726306, + "learning_rate": 1.884220050112462e-07, + "loss": 0.0, + "step": 30048 + }, + { + "epoch": 12.220008133387555, + "grad_norm": 0.003613021827288072, + "learning_rate": 1.8822629919803769e-07, + "loss": 0.0, + "step": 30049 + }, + { + "epoch": 12.220414802765351, + "grad_norm": 0.00012294493305334484, + "learning_rate": 1.8803069410762488e-07, + "loss": 0.0, + "step": 30050 + }, + { + "epoch": 12.220821472143147, + "grad_norm": 6.115470576497084e-05, + "learning_rate": 1.8783518974201498e-07, + "loss": 0.0, + "step": 30051 + }, + { + "epoch": 12.221228141520944, + "grad_norm": 0.009018966183360197, + "learning_rate": 1.8763978610321533e-07, + "loss": 0.0001, + "step": 30052 + }, + { + "epoch": 12.22163481089874, + "grad_norm": 0.023169857778727813, + "learning_rate": 1.8744448319323095e-07, + "loss": 0.0003, + "step": 30053 + }, + { + "epoch": 12.222041480276536, + "grad_norm": 0.00012345900025775283, + "learning_rate": 1.8724928101406802e-07, + "loss": 0.0, + "step": 30054 + }, + { + "epoch": 12.222448149654332, + "grad_norm": 0.0049978692426477485, + "learning_rate": 1.8705417956773052e-07, + "loss": 0.0, + "step": 30055 + }, + { + "epoch": 12.222854819032127, + "grad_norm": 0.09404530031840025, + "learning_rate": 1.868591788562202e-07, + "loss": 0.0008, + "step": 30056 + }, + { + "epoch": 12.223261488409923, + "grad_norm": 0.0012648770979394514, + "learning_rate": 1.866642788815387e-07, + "loss": 0.0, + "step": 30057 + }, + { + "epoch": 12.223668157787719, + "grad_norm": 0.007615846411635428, + "learning_rate": 1.8646947964568783e-07, + "loss": 0.0, + "step": 30058 + }, + { + "epoch": 12.224074827165515, + "grad_norm": 0.010464447789449684, + "learning_rate": 1.8627478115066712e-07, + "loss": 0.0001, + "step": 30059 + }, + { + "epoch": 12.22448149654331, + "grad_norm": 0.006753919318431632, + "learning_rate": 1.8608018339847377e-07, + "loss": 0.0, + "step": 30060 + }, + { + "epoch": 12.224888165921106, + "grad_norm": 0.007355807025277129, + "learning_rate": 1.8588568639110738e-07, + "loss": 0.0, + "step": 30061 + }, + { + "epoch": 12.225294835298902, + "grad_norm": 0.0009336888423373332, + "learning_rate": 1.8569129013056408e-07, + "loss": 0.0, + "step": 30062 + }, + { + "epoch": 12.225701504676698, + "grad_norm": 0.0009215967715866889, + "learning_rate": 1.8549699461883898e-07, + "loss": 0.0, + "step": 30063 + }, + { + "epoch": 12.226108174054493, + "grad_norm": 1.8588231979835276e-05, + "learning_rate": 1.85302799857926e-07, + "loss": 0.0, + "step": 30064 + }, + { + "epoch": 12.22651484343229, + "grad_norm": 0.007893177050725089, + "learning_rate": 1.8510870584982022e-07, + "loss": 0.0001, + "step": 30065 + }, + { + "epoch": 12.226921512810085, + "grad_norm": 0.003987514531056021, + "learning_rate": 1.8491471259651229e-07, + "loss": 0.0, + "step": 30066 + }, + { + "epoch": 12.22732818218788, + "grad_norm": 0.118289517526701, + "learning_rate": 1.8472082009999614e-07, + "loss": 0.0009, + "step": 30067 + }, + { + "epoch": 12.227734851565677, + "grad_norm": 0.00036053277642876855, + "learning_rate": 1.8452702836225912e-07, + "loss": 0.0, + "step": 30068 + }, + { + "epoch": 12.228141520943472, + "grad_norm": 1.1363835962116358, + "learning_rate": 1.8433333738529402e-07, + "loss": 0.0122, 
+ "step": 30069 + }, + { + "epoch": 12.228548190321268, + "grad_norm": 0.004214660308908061, + "learning_rate": 1.8413974717108595e-07, + "loss": 0.0, + "step": 30070 + }, + { + "epoch": 12.228954859699064, + "grad_norm": 0.0029069980444700423, + "learning_rate": 1.8394625772162446e-07, + "loss": 0.0, + "step": 30071 + }, + { + "epoch": 12.229361529076861, + "grad_norm": 0.007619507751023593, + "learning_rate": 1.8375286903889455e-07, + "loss": 0.0, + "step": 30072 + }, + { + "epoch": 12.229768198454657, + "grad_norm": 0.016723384087403384, + "learning_rate": 1.8355958112488138e-07, + "loss": 0.0001, + "step": 30073 + }, + { + "epoch": 12.230174867832453, + "grad_norm": 0.005133614762034172, + "learning_rate": 1.8336639398156997e-07, + "loss": 0.0, + "step": 30074 + }, + { + "epoch": 12.230581537210249, + "grad_norm": 0.015677601022164894, + "learning_rate": 1.831733076109432e-07, + "loss": 0.0002, + "step": 30075 + }, + { + "epoch": 12.230988206588044, + "grad_norm": 0.00018127500567978428, + "learning_rate": 1.8298032201498284e-07, + "loss": 0.0, + "step": 30076 + }, + { + "epoch": 12.23139487596584, + "grad_norm": 0.000266902564533603, + "learning_rate": 1.827874371956706e-07, + "loss": 0.0, + "step": 30077 + }, + { + "epoch": 12.231801545343636, + "grad_norm": 0.021321124731654276, + "learning_rate": 1.8259465315498603e-07, + "loss": 0.0002, + "step": 30078 + }, + { + "epoch": 12.232208214721432, + "grad_norm": 0.23667486068653118, + "learning_rate": 1.8240196989490867e-07, + "loss": 0.0027, + "step": 30079 + }, + { + "epoch": 12.232614884099227, + "grad_norm": 0.006736833783937494, + "learning_rate": 1.8220938741741579e-07, + "loss": 0.0, + "step": 30080 + }, + { + "epoch": 12.233021553477023, + "grad_norm": 0.005895689585372491, + "learning_rate": 1.820169057244847e-07, + "loss": 0.0, + "step": 30081 + }, + { + "epoch": 12.233428222854819, + "grad_norm": 0.0002571479688535451, + "learning_rate": 1.8182452481809165e-07, + "loss": 0.0, + "step": 30082 + }, + { + "epoch": 12.233834892232615, + "grad_norm": 0.004486934621560712, + "learning_rate": 1.8163224470021168e-07, + "loss": 0.0, + "step": 30083 + }, + { + "epoch": 12.23424156161041, + "grad_norm": 0.00337012595432342, + "learning_rate": 1.8144006537281767e-07, + "loss": 0.0, + "step": 30084 + }, + { + "epoch": 12.234648230988206, + "grad_norm": 1.8258237572220264e-06, + "learning_rate": 1.8124798683788357e-07, + "loss": 0.0, + "step": 30085 + }, + { + "epoch": 12.235054900366002, + "grad_norm": 0.08616136069231527, + "learning_rate": 1.8105600909737897e-07, + "loss": 0.0006, + "step": 30086 + }, + { + "epoch": 12.235461569743798, + "grad_norm": 0.020829868156680045, + "learning_rate": 1.808641321532778e-07, + "loss": 0.0002, + "step": 30087 + }, + { + "epoch": 12.235868239121594, + "grad_norm": 0.0003962591545171436, + "learning_rate": 1.806723560075474e-07, + "loss": 0.0, + "step": 30088 + }, + { + "epoch": 12.23627490849939, + "grad_norm": 2.212224360511762e-05, + "learning_rate": 1.804806806621573e-07, + "loss": 0.0, + "step": 30089 + }, + { + "epoch": 12.236681577877185, + "grad_norm": 0.0067400729061342455, + "learning_rate": 1.802891061190759e-07, + "loss": 0.0001, + "step": 30090 + }, + { + "epoch": 12.237088247254981, + "grad_norm": 0.037261436260218384, + "learning_rate": 1.8009763238026834e-07, + "loss": 0.0003, + "step": 30091 + }, + { + "epoch": 12.237494916632777, + "grad_norm": 0.05733185292949605, + "learning_rate": 1.7990625944770078e-07, + "loss": 0.0005, + "step": 30092 + }, + { + "epoch": 12.237901586010574, + 
"grad_norm": 0.006228995082766796, + "learning_rate": 1.7971498732333725e-07, + "loss": 0.0001, + "step": 30093 + }, + { + "epoch": 12.23830825538837, + "grad_norm": 0.00011363341146083249, + "learning_rate": 1.7952381600914172e-07, + "loss": 0.0, + "step": 30094 + }, + { + "epoch": 12.238714924766166, + "grad_norm": 0.006690286849416093, + "learning_rate": 1.7933274550707814e-07, + "loss": 0.0001, + "step": 30095 + }, + { + "epoch": 12.239121594143962, + "grad_norm": 0.004473580067769786, + "learning_rate": 1.7914177581910607e-07, + "loss": 0.0, + "step": 30096 + }, + { + "epoch": 12.239528263521757, + "grad_norm": 0.03780897464299536, + "learning_rate": 1.789509069471862e-07, + "loss": 0.0001, + "step": 30097 + }, + { + "epoch": 12.239934932899553, + "grad_norm": 0.045901699202716, + "learning_rate": 1.7876013889327803e-07, + "loss": 0.0003, + "step": 30098 + }, + { + "epoch": 12.240341602277349, + "grad_norm": 0.008898136960094873, + "learning_rate": 1.7856947165934003e-07, + "loss": 0.0001, + "step": 30099 + }, + { + "epoch": 12.240748271655145, + "grad_norm": 0.05310582887489385, + "learning_rate": 1.7837890524732948e-07, + "loss": 0.0003, + "step": 30100 + }, + { + "epoch": 12.24115494103294, + "grad_norm": 0.0008215584080710413, + "learning_rate": 1.7818843965920263e-07, + "loss": 0.0, + "step": 30101 + }, + { + "epoch": 12.241561610410736, + "grad_norm": 0.008433182874924401, + "learning_rate": 1.7799807489691457e-07, + "loss": 0.0001, + "step": 30102 + }, + { + "epoch": 12.241968279788532, + "grad_norm": 0.0016577332986898482, + "learning_rate": 1.7780781096241927e-07, + "loss": 0.0, + "step": 30103 + }, + { + "epoch": 12.242374949166328, + "grad_norm": 0.003737610542469099, + "learning_rate": 1.7761764785767077e-07, + "loss": 0.0, + "step": 30104 + }, + { + "epoch": 12.242781618544123, + "grad_norm": 0.005239829942081217, + "learning_rate": 1.774275855846208e-07, + "loss": 0.0, + "step": 30105 + }, + { + "epoch": 12.24318828792192, + "grad_norm": 0.004073323963367807, + "learning_rate": 1.7723762414521896e-07, + "loss": 0.0, + "step": 30106 + }, + { + "epoch": 12.243594957299715, + "grad_norm": 0.013118066965932772, + "learning_rate": 1.7704776354141695e-07, + "loss": 0.0001, + "step": 30107 + }, + { + "epoch": 12.24400162667751, + "grad_norm": 0.019432577521273732, + "learning_rate": 1.7685800377516436e-07, + "loss": 0.0002, + "step": 30108 + }, + { + "epoch": 12.244408296055306, + "grad_norm": 0.10063434576477044, + "learning_rate": 1.7666834484840856e-07, + "loss": 0.0008, + "step": 30109 + }, + { + "epoch": 12.244814965433102, + "grad_norm": 1.3128838809902283e-05, + "learning_rate": 1.764787867630957e-07, + "loss": 0.0, + "step": 30110 + }, + { + "epoch": 12.245221634810898, + "grad_norm": 0.01063136740604364, + "learning_rate": 1.7628932952117207e-07, + "loss": 0.0001, + "step": 30111 + }, + { + "epoch": 12.245628304188694, + "grad_norm": 0.007678604041570393, + "learning_rate": 1.760999731245816e-07, + "loss": 0.0001, + "step": 30112 + }, + { + "epoch": 12.246034973566491, + "grad_norm": 0.00014404620293019727, + "learning_rate": 1.7591071757526944e-07, + "loss": 0.0, + "step": 30113 + }, + { + "epoch": 12.246441642944287, + "grad_norm": 0.0006054379235412797, + "learning_rate": 1.757215628751796e-07, + "loss": 0.0, + "step": 30114 + }, + { + "epoch": 12.246848312322083, + "grad_norm": 0.017467848983989585, + "learning_rate": 1.7553250902625162e-07, + "loss": 0.0001, + "step": 30115 + }, + { + "epoch": 12.247254981699879, + "grad_norm": 0.04393353699488095, + 
"learning_rate": 1.7534355603042618e-07, + "loss": 0.0002, + "step": 30116 + }, + { + "epoch": 12.247661651077674, + "grad_norm": 0.012771312092246733, + "learning_rate": 1.7515470388964507e-07, + "loss": 0.0001, + "step": 30117 + }, + { + "epoch": 12.24806832045547, + "grad_norm": 0.0016533342243264806, + "learning_rate": 1.749659526058456e-07, + "loss": 0.0, + "step": 30118 + }, + { + "epoch": 12.248474989833266, + "grad_norm": 0.015780722542520765, + "learning_rate": 1.74777302180964e-07, + "loss": 0.0002, + "step": 30119 + }, + { + "epoch": 12.248881659211062, + "grad_norm": 0.051177199138752724, + "learning_rate": 1.7458875261693986e-07, + "loss": 0.0004, + "step": 30120 + }, + { + "epoch": 12.249288328588857, + "grad_norm": 0.002591799209121151, + "learning_rate": 1.7440030391570605e-07, + "loss": 0.0, + "step": 30121 + }, + { + "epoch": 12.249694997966653, + "grad_norm": 0.0021747320898625427, + "learning_rate": 1.7421195607919882e-07, + "loss": 0.0, + "step": 30122 + }, + { + "epoch": 12.250101667344449, + "grad_norm": 0.00014896081244743041, + "learning_rate": 1.740237091093522e-07, + "loss": 0.0, + "step": 30123 + }, + { + "epoch": 12.250508336722245, + "grad_norm": 0.00010038635583180232, + "learning_rate": 1.7383556300809567e-07, + "loss": 0.0, + "step": 30124 + }, + { + "epoch": 12.25091500610004, + "grad_norm": 0.0034906893635759132, + "learning_rate": 1.7364751777736334e-07, + "loss": 0.0, + "step": 30125 + }, + { + "epoch": 12.251321675477836, + "grad_norm": 0.0004270668737591632, + "learning_rate": 1.734595734190847e-07, + "loss": 0.0, + "step": 30126 + }, + { + "epoch": 12.251728344855632, + "grad_norm": 0.00017646126132477247, + "learning_rate": 1.7327172993518937e-07, + "loss": 0.0, + "step": 30127 + }, + { + "epoch": 12.252135014233428, + "grad_norm": 0.00816965425385983, + "learning_rate": 1.7308398732760468e-07, + "loss": 0.0, + "step": 30128 + }, + { + "epoch": 12.252541683611224, + "grad_norm": 0.0012080181153737053, + "learning_rate": 1.728963455982602e-07, + "loss": 0.0, + "step": 30129 + }, + { + "epoch": 12.25294835298902, + "grad_norm": 0.040791023533544385, + "learning_rate": 1.727088047490777e-07, + "loss": 0.0004, + "step": 30130 + }, + { + "epoch": 12.253355022366815, + "grad_norm": 0.000920004083290894, + "learning_rate": 1.725213647819879e-07, + "loss": 0.0, + "step": 30131 + }, + { + "epoch": 12.25376169174461, + "grad_norm": 0.004721595159269055, + "learning_rate": 1.7233402569891145e-07, + "loss": 0.0, + "step": 30132 + }, + { + "epoch": 12.254168361122407, + "grad_norm": 7.441178351716974e-06, + "learning_rate": 1.7214678750177237e-07, + "loss": 0.0, + "step": 30133 + }, + { + "epoch": 12.254575030500204, + "grad_norm": 0.03214415904973734, + "learning_rate": 1.719596501924936e-07, + "loss": 0.0001, + "step": 30134 + }, + { + "epoch": 12.254981699878, + "grad_norm": 0.016878099940506052, + "learning_rate": 1.717726137729947e-07, + "loss": 0.0002, + "step": 30135 + }, + { + "epoch": 12.255388369255796, + "grad_norm": 0.00356221502358944, + "learning_rate": 1.7158567824519635e-07, + "loss": 0.0, + "step": 30136 + }, + { + "epoch": 12.255795038633591, + "grad_norm": 0.006520357279062373, + "learning_rate": 1.71398843611017e-07, + "loss": 0.0, + "step": 30137 + }, + { + "epoch": 12.256201708011387, + "grad_norm": 0.03684182707560462, + "learning_rate": 1.7121210987237512e-07, + "loss": 0.0003, + "step": 30138 + }, + { + "epoch": 12.256608377389183, + "grad_norm": 0.003046951516031587, + "learning_rate": 1.710254770311881e-07, + "loss": 0.0, + "step": 
30139 + }, + { + "epoch": 12.257015046766979, + "grad_norm": 0.05735837057210615, + "learning_rate": 1.7083894508937216e-07, + "loss": 0.0004, + "step": 30140 + }, + { + "epoch": 12.257421716144774, + "grad_norm": 0.04352692901111667, + "learning_rate": 1.706525140488402e-07, + "loss": 0.0005, + "step": 30141 + }, + { + "epoch": 12.25782838552257, + "grad_norm": 0.12002644843901221, + "learning_rate": 1.7046618391150847e-07, + "loss": 0.0007, + "step": 30142 + }, + { + "epoch": 12.258235054900366, + "grad_norm": 0.0007424936260114956, + "learning_rate": 1.702799546792866e-07, + "loss": 0.0, + "step": 30143 + }, + { + "epoch": 12.258641724278162, + "grad_norm": 0.033514877576909685, + "learning_rate": 1.7009382635408967e-07, + "loss": 0.0003, + "step": 30144 + }, + { + "epoch": 12.259048393655958, + "grad_norm": 0.016717564810846425, + "learning_rate": 1.6990779893782617e-07, + "loss": 0.0002, + "step": 30145 + }, + { + "epoch": 12.259455063033753, + "grad_norm": 0.0005794112432985181, + "learning_rate": 1.6972187243240677e-07, + "loss": 0.0, + "step": 30146 + }, + { + "epoch": 12.259861732411549, + "grad_norm": 2.732947630752365e-05, + "learning_rate": 1.6953604683974002e-07, + "loss": 0.0, + "step": 30147 + }, + { + "epoch": 12.260268401789345, + "grad_norm": 0.00011894623901361013, + "learning_rate": 1.693503221617332e-07, + "loss": 0.0, + "step": 30148 + }, + { + "epoch": 12.26067507116714, + "grad_norm": 0.0015300083620817964, + "learning_rate": 1.691646984002937e-07, + "loss": 0.0, + "step": 30149 + }, + { + "epoch": 12.261081740544936, + "grad_norm": 0.0013908018134375238, + "learning_rate": 1.6897917555732446e-07, + "loss": 0.0, + "step": 30150 + }, + { + "epoch": 12.261488409922732, + "grad_norm": 0.003301021525300924, + "learning_rate": 1.6879375363473282e-07, + "loss": 0.0, + "step": 30151 + }, + { + "epoch": 12.261895079300528, + "grad_norm": 0.0037764122277234935, + "learning_rate": 1.6860843263442172e-07, + "loss": 0.0, + "step": 30152 + }, + { + "epoch": 12.262301748678324, + "grad_norm": 0.0007466142261906239, + "learning_rate": 1.6842321255829297e-07, + "loss": 0.0, + "step": 30153 + }, + { + "epoch": 12.262708418056121, + "grad_norm": 0.018420687358964943, + "learning_rate": 1.6823809340824727e-07, + "loss": 0.0001, + "step": 30154 + }, + { + "epoch": 12.263115087433917, + "grad_norm": 0.05069478061370358, + "learning_rate": 1.680530751861864e-07, + "loss": 0.0005, + "step": 30155 + }, + { + "epoch": 12.263521756811713, + "grad_norm": 6.663997811180138e-06, + "learning_rate": 1.678681578940089e-07, + "loss": 0.0, + "step": 30156 + }, + { + "epoch": 12.263928426189509, + "grad_norm": 0.016525593154824443, + "learning_rate": 1.676833415336132e-07, + "loss": 0.0001, + "step": 30157 + }, + { + "epoch": 12.264335095567304, + "grad_norm": 0.002718368867949577, + "learning_rate": 1.674986261068967e-07, + "loss": 0.0, + "step": 30158 + }, + { + "epoch": 12.2647417649451, + "grad_norm": 0.0002773153319418205, + "learning_rate": 1.6731401161575566e-07, + "loss": 0.0, + "step": 30159 + }, + { + "epoch": 12.265148434322896, + "grad_norm": 8.965704067437211e-05, + "learning_rate": 1.6712949806208412e-07, + "loss": 0.0, + "step": 30160 + }, + { + "epoch": 12.265555103700692, + "grad_norm": 0.00013745224269680442, + "learning_rate": 1.669450854477772e-07, + "loss": 0.0, + "step": 30161 + }, + { + "epoch": 12.265961773078487, + "grad_norm": 0.006632116296233653, + "learning_rate": 1.66760773774729e-07, + "loss": 0.0001, + "step": 30162 + }, + { + "epoch": 12.266368442456283, + 
"grad_norm": 0.0807414450951147, + "learning_rate": 1.6657656304482906e-07, + "loss": 0.0006, + "step": 30163 + }, + { + "epoch": 12.266775111834079, + "grad_norm": 0.007635878404281201, + "learning_rate": 1.6639245325997032e-07, + "loss": 0.0001, + "step": 30164 + }, + { + "epoch": 12.267181781211875, + "grad_norm": 0.0031380551659912206, + "learning_rate": 1.6620844442204242e-07, + "loss": 0.0, + "step": 30165 + }, + { + "epoch": 12.26758845058967, + "grad_norm": 0.0008038961687695852, + "learning_rate": 1.6602453653293382e-07, + "loss": 0.0, + "step": 30166 + }, + { + "epoch": 12.267995119967466, + "grad_norm": 0.0061914459390757836, + "learning_rate": 1.658407295945319e-07, + "loss": 0.0001, + "step": 30167 + }, + { + "epoch": 12.268401789345262, + "grad_norm": 0.014076263004336592, + "learning_rate": 1.6565702360872514e-07, + "loss": 0.0001, + "step": 30168 + }, + { + "epoch": 12.268808458723058, + "grad_norm": 0.005379692014616594, + "learning_rate": 1.6547341857739875e-07, + "loss": 0.0, + "step": 30169 + }, + { + "epoch": 12.269215128100853, + "grad_norm": 0.0038670699959505533, + "learning_rate": 1.652899145024367e-07, + "loss": 0.0, + "step": 30170 + }, + { + "epoch": 12.26962179747865, + "grad_norm": 0.013989068878014958, + "learning_rate": 1.651065113857242e-07, + "loss": 0.0002, + "step": 30171 + }, + { + "epoch": 12.270028466856445, + "grad_norm": 0.006558940711403703, + "learning_rate": 1.6492320922914308e-07, + "loss": 0.0, + "step": 30172 + }, + { + "epoch": 12.27043513623424, + "grad_norm": 0.0009158290345530184, + "learning_rate": 1.64740008034574e-07, + "loss": 0.0, + "step": 30173 + }, + { + "epoch": 12.270841805612037, + "grad_norm": 1.3970705966004734e-05, + "learning_rate": 1.6455690780389998e-07, + "loss": 0.0, + "step": 30174 + }, + { + "epoch": 12.271248474989834, + "grad_norm": 0.018496626231030395, + "learning_rate": 1.643739085389995e-07, + "loss": 0.0001, + "step": 30175 + }, + { + "epoch": 12.27165514436763, + "grad_norm": 1.4484476615759011e-05, + "learning_rate": 1.6419101024174876e-07, + "loss": 0.0, + "step": 30176 + }, + { + "epoch": 12.272061813745426, + "grad_norm": 0.0011830211481134746, + "learning_rate": 1.6400821291402968e-07, + "loss": 0.0, + "step": 30177 + }, + { + "epoch": 12.272468483123221, + "grad_norm": 0.0033410009410740583, + "learning_rate": 1.638255165577163e-07, + "loss": 0.0, + "step": 30178 + }, + { + "epoch": 12.272875152501017, + "grad_norm": 0.0019780658764113196, + "learning_rate": 1.6364292117468483e-07, + "loss": 0.0, + "step": 30179 + }, + { + "epoch": 12.273281821878813, + "grad_norm": 0.04922696332656352, + "learning_rate": 1.6346042676680828e-07, + "loss": 0.0004, + "step": 30180 + }, + { + "epoch": 12.273688491256609, + "grad_norm": 0.00724162027636649, + "learning_rate": 1.632780333359618e-07, + "loss": 0.0001, + "step": 30181 + }, + { + "epoch": 12.274095160634404, + "grad_norm": 0.0007138907404646643, + "learning_rate": 1.6309574088401613e-07, + "loss": 0.0, + "step": 30182 + }, + { + "epoch": 12.2745018300122, + "grad_norm": 0.00014018460564801474, + "learning_rate": 1.629135494128442e-07, + "loss": 0.0, + "step": 30183 + }, + { + "epoch": 12.274908499389996, + "grad_norm": 0.006713241185722048, + "learning_rate": 1.6273145892431562e-07, + "loss": 0.0001, + "step": 30184 + }, + { + "epoch": 12.275315168767792, + "grad_norm": 0.1764725536511114, + "learning_rate": 1.6254946942030004e-07, + "loss": 0.0016, + "step": 30185 + }, + { + "epoch": 12.275721838145587, + "grad_norm": 0.0016594429902759653, + "learning_rate": 
1.623675809026648e-07, + "loss": 0.0, + "step": 30186 + }, + { + "epoch": 12.276128507523383, + "grad_norm": 0.05365525005990134, + "learning_rate": 1.6218579337327735e-07, + "loss": 0.0005, + "step": 30187 + }, + { + "epoch": 12.276535176901179, + "grad_norm": 0.00011367903494512165, + "learning_rate": 1.6200410683400393e-07, + "loss": 0.0, + "step": 30188 + }, + { + "epoch": 12.276941846278975, + "grad_norm": 0.005307377289083212, + "learning_rate": 1.618225212867097e-07, + "loss": 0.0001, + "step": 30189 + }, + { + "epoch": 12.27734851565677, + "grad_norm": 0.054677854195010395, + "learning_rate": 1.616410367332588e-07, + "loss": 0.0005, + "step": 30190 + }, + { + "epoch": 12.277755185034566, + "grad_norm": 0.03932631528959321, + "learning_rate": 1.6145965317551415e-07, + "loss": 0.0003, + "step": 30191 + }, + { + "epoch": 12.278161854412362, + "grad_norm": 0.00807774566994854, + "learning_rate": 1.612783706153387e-07, + "loss": 0.0001, + "step": 30192 + }, + { + "epoch": 12.278568523790158, + "grad_norm": 0.027013792067369386, + "learning_rate": 1.6109718905459204e-07, + "loss": 0.0002, + "step": 30193 + }, + { + "epoch": 12.278975193167954, + "grad_norm": 8.788059817792587e-05, + "learning_rate": 1.6091610849513383e-07, + "loss": 0.0, + "step": 30194 + }, + { + "epoch": 12.279381862545751, + "grad_norm": 0.0002814368638945765, + "learning_rate": 1.6073512893882371e-07, + "loss": 0.0, + "step": 30195 + }, + { + "epoch": 12.279788531923547, + "grad_norm": 0.02482810615074822, + "learning_rate": 1.6055425038752014e-07, + "loss": 0.0002, + "step": 30196 + }, + { + "epoch": 12.280195201301343, + "grad_norm": 0.012106239098493952, + "learning_rate": 1.6037347284307836e-07, + "loss": 0.0001, + "step": 30197 + }, + { + "epoch": 12.280601870679138, + "grad_norm": 0.002943205936979218, + "learning_rate": 1.6019279630735574e-07, + "loss": 0.0, + "step": 30198 + }, + { + "epoch": 12.281008540056934, + "grad_norm": 0.0004268745072089761, + "learning_rate": 1.6001222078220747e-07, + "loss": 0.0, + "step": 30199 + }, + { + "epoch": 12.28141520943473, + "grad_norm": 0.00013327084476083125, + "learning_rate": 1.5983174626948428e-07, + "loss": 0.0, + "step": 30200 + }, + { + "epoch": 12.281821878812526, + "grad_norm": 0.001570460618678209, + "learning_rate": 1.5965137277104136e-07, + "loss": 0.0, + "step": 30201 + }, + { + "epoch": 12.282228548190322, + "grad_norm": 0.004283427561373119, + "learning_rate": 1.5947110028872948e-07, + "loss": 0.0, + "step": 30202 + }, + { + "epoch": 12.282635217568117, + "grad_norm": 0.013068283333494907, + "learning_rate": 1.5929092882439935e-07, + "loss": 0.0002, + "step": 30203 + }, + { + "epoch": 12.283041886945913, + "grad_norm": 0.00011034344815606156, + "learning_rate": 1.591108583799017e-07, + "loss": 0.0, + "step": 30204 + }, + { + "epoch": 12.283448556323709, + "grad_norm": 0.003830487138074148, + "learning_rate": 1.5893088895708286e-07, + "loss": 0.0, + "step": 30205 + }, + { + "epoch": 12.283855225701505, + "grad_norm": 0.005655788872212489, + "learning_rate": 1.5875102055779135e-07, + "loss": 0.0, + "step": 30206 + }, + { + "epoch": 12.2842618950793, + "grad_norm": 0.0032026259762428237, + "learning_rate": 1.5857125318387345e-07, + "loss": 0.0, + "step": 30207 + }, + { + "epoch": 12.284668564457096, + "grad_norm": 0.003215989756808487, + "learning_rate": 1.5839158683717548e-07, + "loss": 0.0, + "step": 30208 + }, + { + "epoch": 12.285075233834892, + "grad_norm": 0.02590044142329453, + "learning_rate": 1.582120215195404e-07, + "loss": 0.0001, + "step": 30209 + 
}, + { + "epoch": 12.285481903212688, + "grad_norm": 0.0029590902033275494, + "learning_rate": 1.580325572328123e-07, + "loss": 0.0, + "step": 30210 + }, + { + "epoch": 12.285888572590483, + "grad_norm": 0.008653082182031344, + "learning_rate": 1.5785319397883304e-07, + "loss": 0.0, + "step": 30211 + }, + { + "epoch": 12.28629524196828, + "grad_norm": 0.003530453621703667, + "learning_rate": 1.5767393175944445e-07, + "loss": 0.0, + "step": 30212 + }, + { + "epoch": 12.286701911346075, + "grad_norm": 0.004202127103264435, + "learning_rate": 1.5749477057648732e-07, + "loss": 0.0, + "step": 30213 + }, + { + "epoch": 12.28710858072387, + "grad_norm": 0.0015393308268450922, + "learning_rate": 1.5731571043179906e-07, + "loss": 0.0, + "step": 30214 + }, + { + "epoch": 12.287515250101666, + "grad_norm": 0.0051485103205021285, + "learning_rate": 1.5713675132721818e-07, + "loss": 0.0, + "step": 30215 + }, + { + "epoch": 12.287921919479464, + "grad_norm": 0.1670004625372, + "learning_rate": 1.569578932645832e-07, + "loss": 0.0013, + "step": 30216 + }, + { + "epoch": 12.28832858885726, + "grad_norm": 0.013161645264870698, + "learning_rate": 1.567791362457294e-07, + "loss": 0.0001, + "step": 30217 + }, + { + "epoch": 12.288735258235056, + "grad_norm": 0.006811567138336398, + "learning_rate": 1.5660048027249186e-07, + "loss": 0.0001, + "step": 30218 + }, + { + "epoch": 12.289141927612851, + "grad_norm": 0.00028475036386086663, + "learning_rate": 1.5642192534670476e-07, + "loss": 0.0, + "step": 30219 + }, + { + "epoch": 12.289548596990647, + "grad_norm": 0.0002445318307143824, + "learning_rate": 1.5624347147019992e-07, + "loss": 0.0, + "step": 30220 + }, + { + "epoch": 12.289955266368443, + "grad_norm": 0.005350589944421047, + "learning_rate": 1.5606511864481145e-07, + "loss": 0.0, + "step": 30221 + }, + { + "epoch": 12.290361935746239, + "grad_norm": 0.014589077716093137, + "learning_rate": 1.5588686687236676e-07, + "loss": 0.0001, + "step": 30222 + }, + { + "epoch": 12.290768605124034, + "grad_norm": 0.0007405587600814227, + "learning_rate": 1.5570871615469996e-07, + "loss": 0.0, + "step": 30223 + }, + { + "epoch": 12.29117527450183, + "grad_norm": 0.015438249335603452, + "learning_rate": 1.5553066649363624e-07, + "loss": 0.0001, + "step": 30224 + }, + { + "epoch": 12.291581943879626, + "grad_norm": 0.002792283020566714, + "learning_rate": 1.5535271789100638e-07, + "loss": 0.0, + "step": 30225 + }, + { + "epoch": 12.291988613257422, + "grad_norm": 0.01062724371879009, + "learning_rate": 1.5517487034863444e-07, + "loss": 0.0, + "step": 30226 + }, + { + "epoch": 12.292395282635217, + "grad_norm": 0.0005936697446391154, + "learning_rate": 1.5499712386834675e-07, + "loss": 0.0, + "step": 30227 + }, + { + "epoch": 12.292801952013013, + "grad_norm": 0.0015758471996386028, + "learning_rate": 1.5481947845196854e-07, + "loss": 0.0, + "step": 30228 + }, + { + "epoch": 12.293208621390809, + "grad_norm": 0.005898416849733178, + "learning_rate": 1.546419341013239e-07, + "loss": 0.0, + "step": 30229 + }, + { + "epoch": 12.293615290768605, + "grad_norm": 0.17044798758267402, + "learning_rate": 1.544644908182358e-07, + "loss": 0.0016, + "step": 30230 + }, + { + "epoch": 12.2940219601464, + "grad_norm": 0.007655715390949244, + "learning_rate": 1.5428714860452389e-07, + "loss": 0.0, + "step": 30231 + }, + { + "epoch": 12.294428629524196, + "grad_norm": 0.005284039996336123, + "learning_rate": 1.5410990746201005e-07, + "loss": 0.0, + "step": 30232 + }, + { + "epoch": 12.294835298901992, + "grad_norm": 
0.0036330233703432235, + "learning_rate": 1.5393276739251174e-07, + "loss": 0.0, + "step": 30233 + }, + { + "epoch": 12.295241968279788, + "grad_norm": 0.01638755898349217, + "learning_rate": 1.537557283978508e-07, + "loss": 0.0002, + "step": 30234 + }, + { + "epoch": 12.295648637657584, + "grad_norm": 0.034132869639329926, + "learning_rate": 1.5357879047984137e-07, + "loss": 0.0003, + "step": 30235 + }, + { + "epoch": 12.296055307035381, + "grad_norm": 0.001187964553721177, + "learning_rate": 1.5340195364030307e-07, + "loss": 0.0, + "step": 30236 + }, + { + "epoch": 12.296461976413177, + "grad_norm": 0.00048283544183548983, + "learning_rate": 1.5322521788104782e-07, + "loss": 0.0, + "step": 30237 + }, + { + "epoch": 12.296868645790973, + "grad_norm": 0.009649157840751424, + "learning_rate": 1.5304858320389304e-07, + "loss": 0.0, + "step": 30238 + }, + { + "epoch": 12.297275315168768, + "grad_norm": 0.00015983796687550064, + "learning_rate": 1.528720496106495e-07, + "loss": 0.0, + "step": 30239 + }, + { + "epoch": 12.297681984546564, + "grad_norm": 0.017054942566382642, + "learning_rate": 1.526956171031291e-07, + "loss": 0.0002, + "step": 30240 + }, + { + "epoch": 12.29808865392436, + "grad_norm": 0.0024295798476247606, + "learning_rate": 1.5251928568314588e-07, + "loss": 0.0, + "step": 30241 + }, + { + "epoch": 12.298495323302156, + "grad_norm": 3.835710458031806e-05, + "learning_rate": 1.5234305535250847e-07, + "loss": 0.0, + "step": 30242 + }, + { + "epoch": 12.298901992679951, + "grad_norm": 0.008693188666678214, + "learning_rate": 1.521669261130243e-07, + "loss": 0.0001, + "step": 30243 + }, + { + "epoch": 12.299308662057747, + "grad_norm": 0.0300346627917276, + "learning_rate": 1.5199089796650414e-07, + "loss": 0.0003, + "step": 30244 + }, + { + "epoch": 12.299715331435543, + "grad_norm": 0.0007602944091788132, + "learning_rate": 1.5181497091475427e-07, + "loss": 0.0, + "step": 30245 + }, + { + "epoch": 12.300122000813339, + "grad_norm": 0.0019959874859624667, + "learning_rate": 1.5163914495957887e-07, + "loss": 0.0, + "step": 30246 + }, + { + "epoch": 12.300528670191134, + "grad_norm": 0.005721011212610949, + "learning_rate": 1.5146342010278424e-07, + "loss": 0.0, + "step": 30247 + }, + { + "epoch": 12.30093533956893, + "grad_norm": 0.002736932269758126, + "learning_rate": 1.5128779634617564e-07, + "loss": 0.0, + "step": 30248 + }, + { + "epoch": 12.301342008946726, + "grad_norm": 0.00577711455557783, + "learning_rate": 1.511122736915538e-07, + "loss": 0.0, + "step": 30249 + }, + { + "epoch": 12.301748678324522, + "grad_norm": 0.010246192905721836, + "learning_rate": 1.5093685214072173e-07, + "loss": 0.0001, + "step": 30250 + }, + { + "epoch": 12.302155347702318, + "grad_norm": 0.0011544339745681842, + "learning_rate": 1.5076153169547914e-07, + "loss": 0.0, + "step": 30251 + }, + { + "epoch": 12.302562017080113, + "grad_norm": 0.008980490811005987, + "learning_rate": 1.5058631235762678e-07, + "loss": 0.0001, + "step": 30252 + }, + { + "epoch": 12.302968686457909, + "grad_norm": 6.8908930392395735, + "learning_rate": 1.5041119412896211e-07, + "loss": 0.1267, + "step": 30253 + }, + { + "epoch": 12.303375355835705, + "grad_norm": 0.009187693046224566, + "learning_rate": 1.5023617701128478e-07, + "loss": 0.0001, + "step": 30254 + }, + { + "epoch": 12.3037820252135, + "grad_norm": 0.011648467342270832, + "learning_rate": 1.5006126100639007e-07, + "loss": 0.0002, + "step": 30255 + }, + { + "epoch": 12.304188694591296, + "grad_norm": 0.00183475522739941, + "learning_rate": 
1.4988644611607427e-07, + "loss": 0.0, + "step": 30256 + }, + { + "epoch": 12.304595363969094, + "grad_norm": 0.005069199639422309, + "learning_rate": 1.4971173234213155e-07, + "loss": 0.0, + "step": 30257 + }, + { + "epoch": 12.30500203334689, + "grad_norm": 0.0006468969101894013, + "learning_rate": 1.4953711968635488e-07, + "loss": 0.0, + "step": 30258 + }, + { + "epoch": 12.305408702724685, + "grad_norm": 0.07557901885084273, + "learning_rate": 1.4936260815053728e-07, + "loss": 0.0003, + "step": 30259 + }, + { + "epoch": 12.305815372102481, + "grad_norm": 0.14471416519757677, + "learning_rate": 1.4918819773647064e-07, + "loss": 0.0012, + "step": 30260 + }, + { + "epoch": 12.306222041480277, + "grad_norm": 0.0028841900907769216, + "learning_rate": 1.4901388844594467e-07, + "loss": 0.0, + "step": 30261 + }, + { + "epoch": 12.306628710858073, + "grad_norm": 0.011483071558917942, + "learning_rate": 1.4883968028075013e-07, + "loss": 0.0001, + "step": 30262 + }, + { + "epoch": 12.307035380235869, + "grad_norm": 0.0018781391962844282, + "learning_rate": 1.4866557324267338e-07, + "loss": 0.0, + "step": 30263 + }, + { + "epoch": 12.307442049613664, + "grad_norm": 0.0012959448537845646, + "learning_rate": 1.4849156733350302e-07, + "loss": 0.0, + "step": 30264 + }, + { + "epoch": 12.30784871899146, + "grad_norm": 0.08983953278003204, + "learning_rate": 1.4831766255502422e-07, + "loss": 0.0008, + "step": 30265 + }, + { + "epoch": 12.308255388369256, + "grad_norm": 0.10966939178159205, + "learning_rate": 1.4814385890902228e-07, + "loss": 0.0004, + "step": 30266 + }, + { + "epoch": 12.308662057747052, + "grad_norm": 0.005259882575038519, + "learning_rate": 1.479701563972824e-07, + "loss": 0.0, + "step": 30267 + }, + { + "epoch": 12.309068727124847, + "grad_norm": 0.0018247860349539942, + "learning_rate": 1.4779655502158874e-07, + "loss": 0.0, + "step": 30268 + }, + { + "epoch": 12.309475396502643, + "grad_norm": 0.0007841770680027278, + "learning_rate": 1.4762305478372098e-07, + "loss": 0.0, + "step": 30269 + }, + { + "epoch": 12.309882065880439, + "grad_norm": 0.00017323989138098362, + "learning_rate": 1.47449655685461e-07, + "loss": 0.0, + "step": 30270 + }, + { + "epoch": 12.310288735258235, + "grad_norm": 0.005664295264441813, + "learning_rate": 1.472763577285885e-07, + "loss": 0.0, + "step": 30271 + }, + { + "epoch": 12.31069540463603, + "grad_norm": 0.060851566790506455, + "learning_rate": 1.4710316091488319e-07, + "loss": 0.0005, + "step": 30272 + }, + { + "epoch": 12.311102074013826, + "grad_norm": 0.002858537949282537, + "learning_rate": 1.4693006524612364e-07, + "loss": 0.0, + "step": 30273 + }, + { + "epoch": 12.311508743391622, + "grad_norm": 0.026108893433027303, + "learning_rate": 1.4675707072408507e-07, + "loss": 0.0003, + "step": 30274 + }, + { + "epoch": 12.311915412769418, + "grad_norm": 0.001778066839123787, + "learning_rate": 1.4658417735054497e-07, + "loss": 0.0, + "step": 30275 + }, + { + "epoch": 12.312322082147213, + "grad_norm": 0.05653215233465612, + "learning_rate": 1.4641138512727638e-07, + "loss": 0.0004, + "step": 30276 + }, + { + "epoch": 12.312728751525011, + "grad_norm": 0.008734892343186033, + "learning_rate": 1.4623869405605451e-07, + "loss": 0.0001, + "step": 30277 + }, + { + "epoch": 12.313135420902807, + "grad_norm": 0.0007083539416452904, + "learning_rate": 1.460661041386524e-07, + "loss": 0.0, + "step": 30278 + }, + { + "epoch": 12.313542090280603, + "grad_norm": 0.0018881828672719322, + "learning_rate": 1.4589361537683977e-07, + "loss": 0.0, + "step": 30279 + 
}, + { + "epoch": 12.313948759658398, + "grad_norm": 0.015359554832354723, + "learning_rate": 1.457212277723896e-07, + "loss": 0.0002, + "step": 30280 + }, + { + "epoch": 12.314355429036194, + "grad_norm": 0.0024425493717000687, + "learning_rate": 1.4554894132707055e-07, + "loss": 0.0, + "step": 30281 + }, + { + "epoch": 12.31476209841399, + "grad_norm": 0.00436010553783575, + "learning_rate": 1.4537675604265222e-07, + "loss": 0.0, + "step": 30282 + }, + { + "epoch": 12.315168767791786, + "grad_norm": 0.057186501655204426, + "learning_rate": 1.4520467192089994e-07, + "loss": 0.0004, + "step": 30283 + }, + { + "epoch": 12.315575437169581, + "grad_norm": 0.0013962885907650795, + "learning_rate": 1.4503268896358225e-07, + "loss": 0.0, + "step": 30284 + }, + { + "epoch": 12.315982106547377, + "grad_norm": 0.018769305645480395, + "learning_rate": 1.448608071724622e-07, + "loss": 0.0001, + "step": 30285 + }, + { + "epoch": 12.316388775925173, + "grad_norm": 0.037183290278257335, + "learning_rate": 1.4468902654930727e-07, + "loss": 0.0003, + "step": 30286 + }, + { + "epoch": 12.316795445302969, + "grad_norm": 0.04292667337659592, + "learning_rate": 1.445173470958794e-07, + "loss": 0.0002, + "step": 30287 + }, + { + "epoch": 12.317202114680764, + "grad_norm": 0.021278576784083073, + "learning_rate": 1.4434576881394159e-07, + "loss": 0.0002, + "step": 30288 + }, + { + "epoch": 12.31760878405856, + "grad_norm": 0.08557347978149417, + "learning_rate": 1.4417429170525464e-07, + "loss": 0.0009, + "step": 30289 + }, + { + "epoch": 12.318015453436356, + "grad_norm": 5.35460934504391e-05, + "learning_rate": 1.4400291577157833e-07, + "loss": 0.0, + "step": 30290 + }, + { + "epoch": 12.318422122814152, + "grad_norm": 0.011326943955272914, + "learning_rate": 1.438316410146734e-07, + "loss": 0.0001, + "step": 30291 + }, + { + "epoch": 12.318828792191947, + "grad_norm": 0.0003248581832017641, + "learning_rate": 1.4366046743629625e-07, + "loss": 0.0, + "step": 30292 + }, + { + "epoch": 12.319235461569743, + "grad_norm": 0.019300126236620443, + "learning_rate": 1.4348939503820436e-07, + "loss": 0.0002, + "step": 30293 + }, + { + "epoch": 12.319642130947539, + "grad_norm": 0.020444536561713234, + "learning_rate": 1.4331842382215632e-07, + "loss": 0.0001, + "step": 30294 + }, + { + "epoch": 12.320048800325335, + "grad_norm": 0.0005468184343656623, + "learning_rate": 1.431475537899041e-07, + "loss": 0.0, + "step": 30295 + }, + { + "epoch": 12.32045546970313, + "grad_norm": 1.5473508238242266e-05, + "learning_rate": 1.42976784943204e-07, + "loss": 0.0, + "step": 30296 + }, + { + "epoch": 12.320862139080926, + "grad_norm": 0.0009092639451478819, + "learning_rate": 1.42806117283808e-07, + "loss": 0.0, + "step": 30297 + }, + { + "epoch": 12.321268808458724, + "grad_norm": 0.0016052460827443312, + "learning_rate": 1.4263555081346693e-07, + "loss": 0.0, + "step": 30298 + }, + { + "epoch": 12.32167547783652, + "grad_norm": 0.010921440985662842, + "learning_rate": 1.424650855339338e-07, + "loss": 0.0001, + "step": 30299 + }, + { + "epoch": 12.322082147214315, + "grad_norm": 0.03053631912654453, + "learning_rate": 1.4229472144695833e-07, + "loss": 0.0003, + "step": 30300 + }, + { + "epoch": 12.322488816592111, + "grad_norm": 0.001106969695911205, + "learning_rate": 1.4212445855428913e-07, + "loss": 0.0, + "step": 30301 + }, + { + "epoch": 12.322895485969907, + "grad_norm": 0.020553748001735834, + "learning_rate": 1.419542968576726e-07, + "loss": 0.0001, + "step": 30302 + }, + { + "epoch": 12.323302155347703, + "grad_norm": 
0.04350180697258242, + "learning_rate": 1.417842363588573e-07, + "loss": 0.0003, + "step": 30303 + }, + { + "epoch": 12.323708824725498, + "grad_norm": 0.013536864500379213, + "learning_rate": 1.4161427705958852e-07, + "loss": 0.0001, + "step": 30304 + }, + { + "epoch": 12.324115494103294, + "grad_norm": 0.0023514511262316324, + "learning_rate": 1.414444189616093e-07, + "loss": 0.0, + "step": 30305 + }, + { + "epoch": 12.32452216348109, + "grad_norm": 0.018712287256882093, + "learning_rate": 1.4127466206666717e-07, + "loss": 0.0001, + "step": 30306 + }, + { + "epoch": 12.324928832858886, + "grad_norm": 4.42429373700897e-06, + "learning_rate": 1.411050063765007e-07, + "loss": 0.0, + "step": 30307 + }, + { + "epoch": 12.325335502236682, + "grad_norm": 0.008706321772974935, + "learning_rate": 1.4093545189285407e-07, + "loss": 0.0001, + "step": 30308 + }, + { + "epoch": 12.325742171614477, + "grad_norm": 0.06704106393939305, + "learning_rate": 1.4076599861746698e-07, + "loss": 0.0002, + "step": 30309 + }, + { + "epoch": 12.326148840992273, + "grad_norm": 0.0004377695183584841, + "learning_rate": 1.4059664655207917e-07, + "loss": 0.0, + "step": 30310 + }, + { + "epoch": 12.326555510370069, + "grad_norm": 0.014725878006061172, + "learning_rate": 1.404273956984281e-07, + "loss": 0.0001, + "step": 30311 + }, + { + "epoch": 12.326962179747865, + "grad_norm": 0.0053916323615590305, + "learning_rate": 1.4025824605825133e-07, + "loss": 0.0, + "step": 30312 + }, + { + "epoch": 12.32736884912566, + "grad_norm": 0.016813303157457358, + "learning_rate": 1.4008919763328744e-07, + "loss": 0.0001, + "step": 30313 + }, + { + "epoch": 12.327775518503456, + "grad_norm": 0.00034602748674289924, + "learning_rate": 1.3992025042526947e-07, + "loss": 0.0, + "step": 30314 + }, + { + "epoch": 12.328182187881252, + "grad_norm": 0.020315115803150218, + "learning_rate": 1.3975140443593382e-07, + "loss": 0.0002, + "step": 30315 + }, + { + "epoch": 12.328588857259048, + "grad_norm": 0.000554209321354134, + "learning_rate": 1.3958265966701133e-07, + "loss": 0.0, + "step": 30316 + }, + { + "epoch": 12.328995526636843, + "grad_norm": 0.056121443233413865, + "learning_rate": 1.394140161202362e-07, + "loss": 0.0005, + "step": 30317 + }, + { + "epoch": 12.329402196014641, + "grad_norm": 0.10390463440116238, + "learning_rate": 1.392454737973381e-07, + "loss": 0.0009, + "step": 30318 + }, + { + "epoch": 12.329808865392437, + "grad_norm": 0.016195871871633058, + "learning_rate": 1.3907703270004902e-07, + "loss": 0.0001, + "step": 30319 + }, + { + "epoch": 12.330215534770232, + "grad_norm": 6.376323731454202e-05, + "learning_rate": 1.3890869283009646e-07, + "loss": 0.0, + "step": 30320 + }, + { + "epoch": 12.330622204148028, + "grad_norm": 0.04785113182314254, + "learning_rate": 1.3874045418920902e-07, + "loss": 0.0004, + "step": 30321 + }, + { + "epoch": 12.331028873525824, + "grad_norm": 0.00011038833865780563, + "learning_rate": 1.3857231677911532e-07, + "loss": 0.0, + "step": 30322 + }, + { + "epoch": 12.33143554290362, + "grad_norm": 0.004812815678400845, + "learning_rate": 1.3840428060153843e-07, + "loss": 0.0001, + "step": 30323 + }, + { + "epoch": 12.331842212281416, + "grad_norm": 0.00042761787395163863, + "learning_rate": 1.3823634565820588e-07, + "loss": 0.0, + "step": 30324 + }, + { + "epoch": 12.332248881659211, + "grad_norm": 0.0008788355193182294, + "learning_rate": 1.3806851195083958e-07, + "loss": 0.0, + "step": 30325 + }, + { + "epoch": 12.332655551037007, + "grad_norm": 0.14117648595242532, + "learning_rate": 
1.3790077948116377e-07, + "loss": 0.0012, + "step": 30326 + }, + { + "epoch": 12.333062220414803, + "grad_norm": 0.0026175528411411606, + "learning_rate": 1.3773314825090033e-07, + "loss": 0.0, + "step": 30327 + }, + { + "epoch": 12.333468889792599, + "grad_norm": 0.006056415999930508, + "learning_rate": 1.3756561826176907e-07, + "loss": 0.0001, + "step": 30328 + }, + { + "epoch": 12.333875559170394, + "grad_norm": 0.004385298916310505, + "learning_rate": 1.3739818951549077e-07, + "loss": 0.0, + "step": 30329 + }, + { + "epoch": 12.33428222854819, + "grad_norm": 0.14757023296801444, + "learning_rate": 1.37230862013783e-07, + "loss": 0.0016, + "step": 30330 + }, + { + "epoch": 12.334688897925986, + "grad_norm": 0.04636990007511622, + "learning_rate": 1.3706363575836545e-07, + "loss": 0.0006, + "step": 30331 + }, + { + "epoch": 12.335095567303782, + "grad_norm": 0.0054459529510929765, + "learning_rate": 1.3689651075095234e-07, + "loss": 0.0001, + "step": 30332 + }, + { + "epoch": 12.335502236681577, + "grad_norm": 0.018519691759624406, + "learning_rate": 1.3672948699326228e-07, + "loss": 0.0002, + "step": 30333 + }, + { + "epoch": 12.335908906059373, + "grad_norm": 9.707744675163454e-05, + "learning_rate": 1.3656256448700721e-07, + "loss": 0.0, + "step": 30334 + }, + { + "epoch": 12.336315575437169, + "grad_norm": 0.0004789093537923066, + "learning_rate": 1.3639574323390137e-07, + "loss": 0.0, + "step": 30335 + }, + { + "epoch": 12.336722244814965, + "grad_norm": 0.0026618901161042535, + "learning_rate": 1.362290232356567e-07, + "loss": 0.0, + "step": 30336 + }, + { + "epoch": 12.33712891419276, + "grad_norm": 1.1836487446961306e-06, + "learning_rate": 1.3606240449398623e-07, + "loss": 0.0, + "step": 30337 + }, + { + "epoch": 12.337535583570556, + "grad_norm": 0.020246483708421106, + "learning_rate": 1.3589588701059975e-07, + "loss": 0.0002, + "step": 30338 + }, + { + "epoch": 12.337942252948354, + "grad_norm": 0.07897934123536848, + "learning_rate": 1.35729470787207e-07, + "loss": 0.0006, + "step": 30339 + }, + { + "epoch": 12.33834892232615, + "grad_norm": 0.10323228140667895, + "learning_rate": 1.355631558255155e-07, + "loss": 0.0008, + "step": 30340 + }, + { + "epoch": 12.338755591703945, + "grad_norm": 0.0018187794768907241, + "learning_rate": 1.3539694212723163e-07, + "loss": 0.0, + "step": 30341 + }, + { + "epoch": 12.339162261081741, + "grad_norm": 0.0006347581214936213, + "learning_rate": 1.3523082969406408e-07, + "loss": 0.0, + "step": 30342 + }, + { + "epoch": 12.339568930459537, + "grad_norm": 0.03364853427591333, + "learning_rate": 1.35064818527717e-07, + "loss": 0.0003, + "step": 30343 + }, + { + "epoch": 12.339975599837333, + "grad_norm": 0.01519169041132555, + "learning_rate": 1.3489890862989352e-07, + "loss": 0.0001, + "step": 30344 + }, + { + "epoch": 12.340382269215128, + "grad_norm": 0.0042387284979646875, + "learning_rate": 1.3473310000229889e-07, + "loss": 0.0, + "step": 30345 + }, + { + "epoch": 12.340788938592924, + "grad_norm": 0.012130799673559926, + "learning_rate": 1.34567392646634e-07, + "loss": 0.0001, + "step": 30346 + }, + { + "epoch": 12.34119560797072, + "grad_norm": 0.009848867839833955, + "learning_rate": 1.344017865645997e-07, + "loss": 0.0001, + "step": 30347 + }, + { + "epoch": 12.341602277348516, + "grad_norm": 0.0015575112038690548, + "learning_rate": 1.3423628175789572e-07, + "loss": 0.0, + "step": 30348 + }, + { + "epoch": 12.342008946726311, + "grad_norm": 0.012082475708134918, + "learning_rate": 1.3407087822822185e-07, + "loss": 0.0001, + "step": 
30349 + }, + { + "epoch": 12.342415616104107, + "grad_norm": 0.7302523468989236, + "learning_rate": 1.339055759772756e-07, + "loss": 0.0065, + "step": 30350 + }, + { + "epoch": 12.342822285481903, + "grad_norm": 0.004092517189322673, + "learning_rate": 1.3374037500675452e-07, + "loss": 0.0, + "step": 30351 + }, + { + "epoch": 12.343228954859699, + "grad_norm": 0.10731033370605664, + "learning_rate": 1.3357527531835503e-07, + "loss": 0.0011, + "step": 30352 + }, + { + "epoch": 12.343635624237494, + "grad_norm": 0.029823713291482902, + "learning_rate": 1.3341027691377017e-07, + "loss": 0.0002, + "step": 30353 + }, + { + "epoch": 12.34404229361529, + "grad_norm": 0.002792207909130529, + "learning_rate": 1.332453797946942e-07, + "loss": 0.0, + "step": 30354 + }, + { + "epoch": 12.344448962993086, + "grad_norm": 0.022775211926560637, + "learning_rate": 1.3308058396282131e-07, + "loss": 0.0002, + "step": 30355 + }, + { + "epoch": 12.344855632370882, + "grad_norm": 0.011330615754485582, + "learning_rate": 1.3291588941984013e-07, + "loss": 0.0001, + "step": 30356 + }, + { + "epoch": 12.345262301748678, + "grad_norm": 0.0002679293936289586, + "learning_rate": 1.3275129616744487e-07, + "loss": 0.0, + "step": 30357 + }, + { + "epoch": 12.345668971126473, + "grad_norm": 0.001483503014809818, + "learning_rate": 1.3258680420732418e-07, + "loss": 0.0, + "step": 30358 + }, + { + "epoch": 12.34607564050427, + "grad_norm": 0.006713697979666179, + "learning_rate": 1.3242241354116446e-07, + "loss": 0.0, + "step": 30359 + }, + { + "epoch": 12.346482309882067, + "grad_norm": 2.4567505864813004e-05, + "learning_rate": 1.3225812417065664e-07, + "loss": 0.0, + "step": 30360 + }, + { + "epoch": 12.346888979259862, + "grad_norm": 0.0021685025917382063, + "learning_rate": 1.320939360974849e-07, + "loss": 0.0, + "step": 30361 + }, + { + "epoch": 12.347295648637658, + "grad_norm": 0.00762039662366488, + "learning_rate": 1.319298493233334e-07, + "loss": 0.0, + "step": 30362 + }, + { + "epoch": 12.347702318015454, + "grad_norm": 0.41854124344394195, + "learning_rate": 1.3176586384989089e-07, + "loss": 0.0036, + "step": 30363 + }, + { + "epoch": 12.34810898739325, + "grad_norm": 0.003757000487093956, + "learning_rate": 1.3160197967883814e-07, + "loss": 0.0, + "step": 30364 + }, + { + "epoch": 12.348515656771045, + "grad_norm": 0.018522901832920685, + "learning_rate": 1.3143819681185722e-07, + "loss": 0.0001, + "step": 30365 + }, + { + "epoch": 12.348922326148841, + "grad_norm": 0.0004491155671076038, + "learning_rate": 1.3127451525063006e-07, + "loss": 0.0, + "step": 30366 + }, + { + "epoch": 12.349328995526637, + "grad_norm": 0.0043703741421523955, + "learning_rate": 1.3111093499683648e-07, + "loss": 0.0, + "step": 30367 + }, + { + "epoch": 12.349735664904433, + "grad_norm": 0.025274001047480687, + "learning_rate": 1.309474560521562e-07, + "loss": 0.0002, + "step": 30368 + }, + { + "epoch": 12.350142334282229, + "grad_norm": 0.001100254392358605, + "learning_rate": 1.3078407841826678e-07, + "loss": 0.0, + "step": 30369 + }, + { + "epoch": 12.350549003660024, + "grad_norm": 0.0050927645510039955, + "learning_rate": 1.306208020968469e-07, + "loss": 0.0, + "step": 30370 + }, + { + "epoch": 12.35095567303782, + "grad_norm": 0.009818971152822573, + "learning_rate": 1.3045762708957078e-07, + "loss": 0.0001, + "step": 30371 + }, + { + "epoch": 12.351362342415616, + "grad_norm": 0.0001335506963949365, + "learning_rate": 1.302945533981148e-07, + "loss": 0.0, + "step": 30372 + }, + { + "epoch": 12.351769011793412, + "grad_norm": 
0.0022037636242926923, + "learning_rate": 1.3013158102415323e-07, + "loss": 0.0, + "step": 30373 + }, + { + "epoch": 12.352175681171207, + "grad_norm": 0.03772100377067043, + "learning_rate": 1.2996870996935805e-07, + "loss": 0.0003, + "step": 30374 + }, + { + "epoch": 12.352582350549003, + "grad_norm": 0.0008042316488167301, + "learning_rate": 1.2980594023540016e-07, + "loss": 0.0, + "step": 30375 + }, + { + "epoch": 12.352989019926799, + "grad_norm": 0.004240978187910852, + "learning_rate": 1.2964327182395264e-07, + "loss": 0.0, + "step": 30376 + }, + { + "epoch": 12.353395689304595, + "grad_norm": 0.01256881768838272, + "learning_rate": 1.2948070473668528e-07, + "loss": 0.0001, + "step": 30377 + }, + { + "epoch": 12.35380235868239, + "grad_norm": 0.12903559132801223, + "learning_rate": 1.2931823897526562e-07, + "loss": 0.0013, + "step": 30378 + }, + { + "epoch": 12.354209028060186, + "grad_norm": 0.00235070930542865, + "learning_rate": 1.2915587454136236e-07, + "loss": 0.0, + "step": 30379 + }, + { + "epoch": 12.354615697437984, + "grad_norm": 0.0019812613389422785, + "learning_rate": 1.289936114366419e-07, + "loss": 0.0, + "step": 30380 + }, + { + "epoch": 12.35502236681578, + "grad_norm": 0.009521555267003004, + "learning_rate": 1.2883144966276962e-07, + "loss": 0.0001, + "step": 30381 + }, + { + "epoch": 12.355429036193575, + "grad_norm": 0.0004428475250337409, + "learning_rate": 1.2866938922141082e-07, + "loss": 0.0, + "step": 30382 + }, + { + "epoch": 12.355835705571371, + "grad_norm": 0.01838856787363238, + "learning_rate": 1.285074301142286e-07, + "loss": 0.0001, + "step": 30383 + }, + { + "epoch": 12.356242374949167, + "grad_norm": 0.0027160142194490953, + "learning_rate": 1.2834557234288615e-07, + "loss": 0.0, + "step": 30384 + }, + { + "epoch": 12.356649044326963, + "grad_norm": 0.00019082135017962643, + "learning_rate": 1.2818381590904428e-07, + "loss": 0.0, + "step": 30385 + }, + { + "epoch": 12.357055713704758, + "grad_norm": 0.0786866271127924, + "learning_rate": 1.2802216081436504e-07, + "loss": 0.0007, + "step": 30386 + }, + { + "epoch": 12.357462383082554, + "grad_norm": 0.013462178440304085, + "learning_rate": 1.2786060706050596e-07, + "loss": 0.0001, + "step": 30387 + }, + { + "epoch": 12.35786905246035, + "grad_norm": 0.019306213652914666, + "learning_rate": 1.2769915464912575e-07, + "loss": 0.0001, + "step": 30388 + }, + { + "epoch": 12.358275721838146, + "grad_norm": 0.0011317182763375111, + "learning_rate": 1.2753780358188306e-07, + "loss": 0.0, + "step": 30389 + }, + { + "epoch": 12.358682391215941, + "grad_norm": 0.011383118796249788, + "learning_rate": 1.2737655386043323e-07, + "loss": 0.0001, + "step": 30390 + }, + { + "epoch": 12.359089060593737, + "grad_norm": 0.012477033614244909, + "learning_rate": 1.2721540548643274e-07, + "loss": 0.0001, + "step": 30391 + }, + { + "epoch": 12.359495729971533, + "grad_norm": 0.014237662531035701, + "learning_rate": 1.2705435846153358e-07, + "loss": 0.0002, + "step": 30392 + }, + { + "epoch": 12.359902399349329, + "grad_norm": 0.001170136465503925, + "learning_rate": 1.268934127873911e-07, + "loss": 0.0, + "step": 30393 + }, + { + "epoch": 12.360309068727124, + "grad_norm": 2.548504195678758e-06, + "learning_rate": 1.2673256846565728e-07, + "loss": 0.0, + "step": 30394 + }, + { + "epoch": 12.36071573810492, + "grad_norm": 0.05432222907486629, + "learning_rate": 1.2657182549798198e-07, + "loss": 0.0003, + "step": 30395 + }, + { + "epoch": 12.361122407482716, + "grad_norm": 0.002042308805335233, + "learning_rate": 
1.2641118388601603e-07, + "loss": 0.0, + "step": 30396 + }, + { + "epoch": 12.361529076860512, + "grad_norm": 0.02837940717031566, + "learning_rate": 1.2625064363140926e-07, + "loss": 0.0002, + "step": 30397 + }, + { + "epoch": 12.361935746238307, + "grad_norm": 0.0017987484820578981, + "learning_rate": 1.2609020473580814e-07, + "loss": 0.0, + "step": 30398 + }, + { + "epoch": 12.362342415616103, + "grad_norm": 0.0004566315904318896, + "learning_rate": 1.259298672008613e-07, + "loss": 0.0, + "step": 30399 + }, + { + "epoch": 12.3627490849939, + "grad_norm": 0.00016647253769919623, + "learning_rate": 1.2576963102821306e-07, + "loss": 0.0, + "step": 30400 + }, + { + "epoch": 12.363155754371697, + "grad_norm": 4.206929311703767e-05, + "learning_rate": 1.256094962195098e-07, + "loss": 0.0, + "step": 30401 + }, + { + "epoch": 12.363562423749492, + "grad_norm": 0.13717207509610282, + "learning_rate": 1.254494627763947e-07, + "loss": 0.0019, + "step": 30402 + }, + { + "epoch": 12.363969093127288, + "grad_norm": 0.0015128266676450893, + "learning_rate": 1.2528953070050974e-07, + "loss": 0.0, + "step": 30403 + }, + { + "epoch": 12.364375762505084, + "grad_norm": 0.007627944646471391, + "learning_rate": 1.2512969999349921e-07, + "loss": 0.0001, + "step": 30404 + }, + { + "epoch": 12.36478243188288, + "grad_norm": 0.0012990717310056071, + "learning_rate": 1.2496997065700067e-07, + "loss": 0.0, + "step": 30405 + }, + { + "epoch": 12.365189101260675, + "grad_norm": 0.0159400588505088, + "learning_rate": 1.2481034269265612e-07, + "loss": 0.0001, + "step": 30406 + }, + { + "epoch": 12.365595770638471, + "grad_norm": 0.0013038043664566567, + "learning_rate": 1.2465081610210428e-07, + "loss": 0.0, + "step": 30407 + }, + { + "epoch": 12.366002440016267, + "grad_norm": 0.00015202545346805715, + "learning_rate": 1.244913908869805e-07, + "loss": 0.0, + "step": 30408 + }, + { + "epoch": 12.366409109394063, + "grad_norm": 0.0010798484909739952, + "learning_rate": 1.2433206704892341e-07, + "loss": 0.0, + "step": 30409 + }, + { + "epoch": 12.366815778771858, + "grad_norm": 0.0040424638207646, + "learning_rate": 1.2417284458956847e-07, + "loss": 0.0001, + "step": 30410 + }, + { + "epoch": 12.367222448149654, + "grad_norm": 0.08747090841550485, + "learning_rate": 1.2401372351054875e-07, + "loss": 0.0007, + "step": 30411 + }, + { + "epoch": 12.36762911752745, + "grad_norm": 0.007293773701182439, + "learning_rate": 1.2385470381349963e-07, + "loss": 0.0001, + "step": 30412 + }, + { + "epoch": 12.368035786905246, + "grad_norm": 0.0024821239087609932, + "learning_rate": 1.23695785500052e-07, + "loss": 0.0, + "step": 30413 + }, + { + "epoch": 12.368442456283042, + "grad_norm": 0.015406316659910295, + "learning_rate": 1.2353696857183794e-07, + "loss": 0.0, + "step": 30414 + }, + { + "epoch": 12.368849125660837, + "grad_norm": 0.11353944517917582, + "learning_rate": 1.2337825303048612e-07, + "loss": 0.0008, + "step": 30415 + }, + { + "epoch": 12.369255795038633, + "grad_norm": 0.02988946331121623, + "learning_rate": 1.2321963887762965e-07, + "loss": 0.0002, + "step": 30416 + }, + { + "epoch": 12.369662464416429, + "grad_norm": 0.00494562504596356, + "learning_rate": 1.2306112611489285e-07, + "loss": 0.0, + "step": 30417 + }, + { + "epoch": 12.370069133794225, + "grad_norm": 0.0005638330402012808, + "learning_rate": 1.2290271474390546e-07, + "loss": 0.0, + "step": 30418 + }, + { + "epoch": 12.37047580317202, + "grad_norm": 0.042768638110737674, + "learning_rate": 1.227444047662929e-07, + "loss": 0.0003, + "step": 30419 + }, 
+ { + "epoch": 12.370882472549816, + "grad_norm": 0.08393488227380733, + "learning_rate": 1.225861961836794e-07, + "loss": 0.001, + "step": 30420 + }, + { + "epoch": 12.371289141927614, + "grad_norm": 0.10868297336829706, + "learning_rate": 1.224280889976892e-07, + "loss": 0.001, + "step": 30421 + }, + { + "epoch": 12.37169581130541, + "grad_norm": 0.3292437776268502, + "learning_rate": 1.222700832099477e-07, + "loss": 0.0031, + "step": 30422 + }, + { + "epoch": 12.372102480683205, + "grad_norm": 3.6508562638992554e-07, + "learning_rate": 1.221121788220747e-07, + "loss": 0.0, + "step": 30423 + }, + { + "epoch": 12.372509150061001, + "grad_norm": 0.00963633254294975, + "learning_rate": 1.2195437583569114e-07, + "loss": 0.0, + "step": 30424 + }, + { + "epoch": 12.372915819438797, + "grad_norm": 0.0007540950162978286, + "learning_rate": 1.2179667425241793e-07, + "loss": 0.0, + "step": 30425 + }, + { + "epoch": 12.373322488816592, + "grad_norm": 0.006701036156006552, + "learning_rate": 1.2163907407387265e-07, + "loss": 0.0001, + "step": 30426 + }, + { + "epoch": 12.373729158194388, + "grad_norm": 0.0007967832853759246, + "learning_rate": 1.2148157530167404e-07, + "loss": 0.0, + "step": 30427 + }, + { + "epoch": 12.374135827572184, + "grad_norm": 0.01832078636392597, + "learning_rate": 1.2132417793743967e-07, + "loss": 0.0001, + "step": 30428 + }, + { + "epoch": 12.37454249694998, + "grad_norm": 0.3626488236740062, + "learning_rate": 1.211668819827838e-07, + "loss": 0.0032, + "step": 30429 + }, + { + "epoch": 12.374949166327776, + "grad_norm": 0.016090770447665714, + "learning_rate": 1.2100968743932184e-07, + "loss": 0.0001, + "step": 30430 + }, + { + "epoch": 12.375355835705571, + "grad_norm": 0.003945752959501126, + "learning_rate": 1.2085259430866803e-07, + "loss": 0.0, + "step": 30431 + }, + { + "epoch": 12.375762505083367, + "grad_norm": 0.1518639202162745, + "learning_rate": 1.206956025924333e-07, + "loss": 0.0015, + "step": 30432 + }, + { + "epoch": 12.376169174461163, + "grad_norm": 0.007062887343807428, + "learning_rate": 1.205387122922308e-07, + "loss": 0.0001, + "step": 30433 + }, + { + "epoch": 12.376575843838959, + "grad_norm": 0.004610835529744976, + "learning_rate": 1.2038192340967035e-07, + "loss": 0.0, + "step": 30434 + }, + { + "epoch": 12.376982513216754, + "grad_norm": 0.0018689663450327035, + "learning_rate": 1.2022523594636292e-07, + "loss": 0.0, + "step": 30435 + }, + { + "epoch": 12.37738918259455, + "grad_norm": 0.0002522014112557146, + "learning_rate": 1.2006864990391498e-07, + "loss": 0.0, + "step": 30436 + }, + { + "epoch": 12.377795851972346, + "grad_norm": 0.011056400016430403, + "learning_rate": 1.1991216528393522e-07, + "loss": 0.0001, + "step": 30437 + }, + { + "epoch": 12.378202521350142, + "grad_norm": 0.005255130859844624, + "learning_rate": 1.1975578208802907e-07, + "loss": 0.0, + "step": 30438 + }, + { + "epoch": 12.378609190727937, + "grad_norm": 0.0018542815385478534, + "learning_rate": 1.1959950031780186e-07, + "loss": 0.0, + "step": 30439 + }, + { + "epoch": 12.379015860105733, + "grad_norm": 0.013978153036342192, + "learning_rate": 1.1944331997486013e-07, + "loss": 0.0001, + "step": 30440 + }, + { + "epoch": 12.37942252948353, + "grad_norm": 0.0020850139455209143, + "learning_rate": 1.192872410608048e-07, + "loss": 0.0, + "step": 30441 + }, + { + "epoch": 12.379829198861326, + "grad_norm": 0.0037353557951837002, + "learning_rate": 1.1913126357723792e-07, + "loss": 0.0, + "step": 30442 + }, + { + "epoch": 12.380235868239122, + "grad_norm": 
0.012853093975209935, + "learning_rate": 1.1897538752576265e-07, + "loss": 0.0002, + "step": 30443 + }, + { + "epoch": 12.380642537616918, + "grad_norm": 0.08987716234897325, + "learning_rate": 1.1881961290797772e-07, + "loss": 0.001, + "step": 30444 + }, + { + "epoch": 12.381049206994714, + "grad_norm": 0.00043978732698799996, + "learning_rate": 1.1866393972548295e-07, + "loss": 0.0, + "step": 30445 + }, + { + "epoch": 12.38145587637251, + "grad_norm": 0.05135701743550598, + "learning_rate": 1.1850836797987486e-07, + "loss": 0.0001, + "step": 30446 + }, + { + "epoch": 12.381862545750305, + "grad_norm": 0.010800973580010153, + "learning_rate": 1.1835289767275216e-07, + "loss": 0.0001, + "step": 30447 + }, + { + "epoch": 12.382269215128101, + "grad_norm": 0.008129079195432216, + "learning_rate": 1.1819752880571134e-07, + "loss": 0.0, + "step": 30448 + }, + { + "epoch": 12.382675884505897, + "grad_norm": 5.829861129124833e-05, + "learning_rate": 1.1804226138034559e-07, + "loss": 0.0, + "step": 30449 + }, + { + "epoch": 12.383082553883693, + "grad_norm": 0.0001283321034521078, + "learning_rate": 1.1788709539824917e-07, + "loss": 0.0, + "step": 30450 + }, + { + "epoch": 12.383489223261488, + "grad_norm": 0.007249324375088958, + "learning_rate": 1.1773203086101525e-07, + "loss": 0.0001, + "step": 30451 + }, + { + "epoch": 12.383895892639284, + "grad_norm": 0.00011347342026317042, + "learning_rate": 1.1757706777023592e-07, + "loss": 0.0, + "step": 30452 + }, + { + "epoch": 12.38430256201708, + "grad_norm": 0.0005765442234603469, + "learning_rate": 1.174222061275021e-07, + "loss": 0.0, + "step": 30453 + }, + { + "epoch": 12.384709231394876, + "grad_norm": 0.03775785716766184, + "learning_rate": 1.1726744593440253e-07, + "loss": 0.0002, + "step": 30454 + }, + { + "epoch": 12.385115900772671, + "grad_norm": 6.474392815021741e-06, + "learning_rate": 1.1711278719252706e-07, + "loss": 0.0, + "step": 30455 + }, + { + "epoch": 12.385522570150467, + "grad_norm": 0.0003395954088507089, + "learning_rate": 1.1695822990346217e-07, + "loss": 0.0, + "step": 30456 + }, + { + "epoch": 12.385929239528263, + "grad_norm": 0.000533244358873211, + "learning_rate": 1.168037740687955e-07, + "loss": 0.0, + "step": 30457 + }, + { + "epoch": 12.386335908906059, + "grad_norm": 0.05054025555579788, + "learning_rate": 1.1664941969011246e-07, + "loss": 0.0006, + "step": 30458 + }, + { + "epoch": 12.386742578283854, + "grad_norm": 0.06801367972757612, + "learning_rate": 1.1649516676899619e-07, + "loss": 0.0006, + "step": 30459 + }, + { + "epoch": 12.38714924766165, + "grad_norm": 0.013617752383819902, + "learning_rate": 1.1634101530703213e-07, + "loss": 0.0001, + "step": 30460 + }, + { + "epoch": 12.387555917039446, + "grad_norm": 0.0007330119246994163, + "learning_rate": 1.1618696530580121e-07, + "loss": 0.0, + "step": 30461 + }, + { + "epoch": 12.387962586417244, + "grad_norm": 0.00242040535895368, + "learning_rate": 1.160330167668866e-07, + "loss": 0.0, + "step": 30462 + }, + { + "epoch": 12.38836925579504, + "grad_norm": 3.910486416401691e-05, + "learning_rate": 1.1587916969186708e-07, + "loss": 0.0, + "step": 30463 + }, + { + "epoch": 12.388775925172835, + "grad_norm": 0.004105386390089333, + "learning_rate": 1.1572542408232246e-07, + "loss": 0.0, + "step": 30464 + }, + { + "epoch": 12.38918259455063, + "grad_norm": 0.0017195297658157429, + "learning_rate": 1.1557177993983037e-07, + "loss": 0.0, + "step": 30465 + }, + { + "epoch": 12.389589263928427, + "grad_norm": 0.004474749373447617, + "learning_rate": 
1.1541823726596957e-07, + "loss": 0.0, + "step": 30466 + }, + { + "epoch": 12.389995933306222, + "grad_norm": 0.00047844604272889046, + "learning_rate": 1.1526479606231544e-07, + "loss": 0.0, + "step": 30467 + }, + { + "epoch": 12.390402602684018, + "grad_norm": 0.02983581158533063, + "learning_rate": 1.151114563304423e-07, + "loss": 0.0003, + "step": 30468 + }, + { + "epoch": 12.390809272061814, + "grad_norm": 0.002216747835587123, + "learning_rate": 1.1495821807192554e-07, + "loss": 0.0, + "step": 30469 + }, + { + "epoch": 12.39121594143961, + "grad_norm": 0.027805116572113455, + "learning_rate": 1.1480508128833834e-07, + "loss": 0.0003, + "step": 30470 + }, + { + "epoch": 12.391622610817405, + "grad_norm": 0.06321103798754552, + "learning_rate": 1.1465204598125058e-07, + "loss": 0.0005, + "step": 30471 + }, + { + "epoch": 12.392029280195201, + "grad_norm": 0.00701448381658212, + "learning_rate": 1.1449911215223541e-07, + "loss": 0.0, + "step": 30472 + }, + { + "epoch": 12.392435949572997, + "grad_norm": 0.011187615061670859, + "learning_rate": 1.1434627980286272e-07, + "loss": 0.0001, + "step": 30473 + }, + { + "epoch": 12.392842618950793, + "grad_norm": 0.009449457525333197, + "learning_rate": 1.1419354893470014e-07, + "loss": 0.0001, + "step": 30474 + }, + { + "epoch": 12.393249288328589, + "grad_norm": 0.0016841487061683316, + "learning_rate": 1.140409195493164e-07, + "loss": 0.0, + "step": 30475 + }, + { + "epoch": 12.393655957706384, + "grad_norm": 0.026884286269227422, + "learning_rate": 1.1388839164827803e-07, + "loss": 0.0001, + "step": 30476 + }, + { + "epoch": 12.39406262708418, + "grad_norm": 0.0033570972120265784, + "learning_rate": 1.1373596523315156e-07, + "loss": 0.0, + "step": 30477 + }, + { + "epoch": 12.394469296461976, + "grad_norm": 0.0050704502451760355, + "learning_rate": 1.1358364030550018e-07, + "loss": 0.0, + "step": 30478 + }, + { + "epoch": 12.394875965839772, + "grad_norm": 0.0003384902506941452, + "learning_rate": 1.1343141686688819e-07, + "loss": 0.0, + "step": 30479 + }, + { + "epoch": 12.395282635217567, + "grad_norm": 0.0017579958862485115, + "learning_rate": 1.132792949188799e-07, + "loss": 0.0, + "step": 30480 + }, + { + "epoch": 12.395689304595363, + "grad_norm": 0.009575319275349686, + "learning_rate": 1.1312727446303406e-07, + "loss": 0.0001, + "step": 30481 + }, + { + "epoch": 12.39609597397316, + "grad_norm": 0.029135192047913677, + "learning_rate": 1.1297535550091388e-07, + "loss": 0.0002, + "step": 30482 + }, + { + "epoch": 12.396502643350956, + "grad_norm": 0.000795586859886482, + "learning_rate": 1.1282353803407698e-07, + "loss": 0.0, + "step": 30483 + }, + { + "epoch": 12.396909312728752, + "grad_norm": 0.020529532334386192, + "learning_rate": 1.1267182206408322e-07, + "loss": 0.0002, + "step": 30484 + }, + { + "epoch": 12.397315982106548, + "grad_norm": 0.024992409978838098, + "learning_rate": 1.1252020759248805e-07, + "loss": 0.0004, + "step": 30485 + }, + { + "epoch": 12.397722651484344, + "grad_norm": 0.11781053141308435, + "learning_rate": 1.1236869462085021e-07, + "loss": 0.001, + "step": 30486 + }, + { + "epoch": 12.39812932086214, + "grad_norm": 0.04151146640788903, + "learning_rate": 1.1221728315072399e-07, + "loss": 0.0003, + "step": 30487 + }, + { + "epoch": 12.398535990239935, + "grad_norm": 0.0006231197214070352, + "learning_rate": 1.1206597318366485e-07, + "loss": 0.0, + "step": 30488 + }, + { + "epoch": 12.398942659617731, + "grad_norm": 0.0038775412744504285, + "learning_rate": 1.1191476472122375e-07, + "loss": 0.0, + "step": 
30489 + }, + { + "epoch": 12.399349328995527, + "grad_norm": 0.002559623999396161, + "learning_rate": 1.117636577649539e-07, + "loss": 0.0, + "step": 30490 + }, + { + "epoch": 12.399755998373323, + "grad_norm": 0.017863912278001114, + "learning_rate": 1.1161265231640739e-07, + "loss": 0.0001, + "step": 30491 + }, + { + "epoch": 12.400162667751118, + "grad_norm": 0.005923051385767723, + "learning_rate": 1.1146174837713297e-07, + "loss": 0.0001, + "step": 30492 + }, + { + "epoch": 12.400569337128914, + "grad_norm": 0.008787790333262532, + "learning_rate": 1.1131094594868163e-07, + "loss": 0.0, + "step": 30493 + }, + { + "epoch": 12.40097600650671, + "grad_norm": 0.0013345594860762657, + "learning_rate": 1.1116024503259993e-07, + "loss": 0.0, + "step": 30494 + }, + { + "epoch": 12.401382675884506, + "grad_norm": 0.0027256491167808594, + "learning_rate": 1.1100964563043438e-07, + "loss": 0.0, + "step": 30495 + }, + { + "epoch": 12.401789345262301, + "grad_norm": 0.0015406521120275773, + "learning_rate": 1.1085914774373262e-07, + "loss": 0.0, + "step": 30496 + }, + { + "epoch": 12.402196014640097, + "grad_norm": 0.0226670382145329, + "learning_rate": 1.1070875137403903e-07, + "loss": 0.0002, + "step": 30497 + }, + { + "epoch": 12.402602684017893, + "grad_norm": 0.02522461665506648, + "learning_rate": 1.1055845652289676e-07, + "loss": 0.0002, + "step": 30498 + }, + { + "epoch": 12.403009353395689, + "grad_norm": 0.005525917075467306, + "learning_rate": 1.1040826319184905e-07, + "loss": 0.0, + "step": 30499 + }, + { + "epoch": 12.403416022773484, + "grad_norm": 0.0003047553069493586, + "learning_rate": 1.10258171382438e-07, + "loss": 0.0, + "step": 30500 + }, + { + "epoch": 12.40382269215128, + "grad_norm": 3.525682058035176e-05, + "learning_rate": 1.1010818109620458e-07, + "loss": 0.0, + "step": 30501 + }, + { + "epoch": 12.404229361529076, + "grad_norm": 0.0004589384170742706, + "learning_rate": 1.0995829233468868e-07, + "loss": 0.0, + "step": 30502 + }, + { + "epoch": 12.404636030906874, + "grad_norm": 0.05613661058032478, + "learning_rate": 1.0980850509942798e-07, + "loss": 0.0005, + "step": 30503 + }, + { + "epoch": 12.40504270028467, + "grad_norm": 0.00047946574099243217, + "learning_rate": 1.09658819391959e-07, + "loss": 0.0, + "step": 30504 + }, + { + "epoch": 12.405449369662465, + "grad_norm": 0.00701631083234439, + "learning_rate": 1.0950923521382162e-07, + "loss": 0.0001, + "step": 30505 + }, + { + "epoch": 12.40585603904026, + "grad_norm": 0.0001288164220052208, + "learning_rate": 1.0935975256655018e-07, + "loss": 0.0, + "step": 30506 + }, + { + "epoch": 12.406262708418057, + "grad_norm": 0.00755605791687758, + "learning_rate": 1.092103714516779e-07, + "loss": 0.0, + "step": 30507 + }, + { + "epoch": 12.406669377795852, + "grad_norm": 0.012862288008948038, + "learning_rate": 1.090610918707391e-07, + "loss": 0.0001, + "step": 30508 + }, + { + "epoch": 12.407076047173648, + "grad_norm": 0.0004034353854370726, + "learning_rate": 1.0891191382526589e-07, + "loss": 0.0, + "step": 30509 + }, + { + "epoch": 12.407482716551444, + "grad_norm": 0.00790323686520989, + "learning_rate": 1.0876283731679039e-07, + "loss": 0.0001, + "step": 30510 + }, + { + "epoch": 12.40788938592924, + "grad_norm": 0.1454498934070575, + "learning_rate": 1.0861386234684245e-07, + "loss": 0.0015, + "step": 30511 + }, + { + "epoch": 12.408296055307035, + "grad_norm": 6.412496051411831e-05, + "learning_rate": 1.08464988916952e-07, + "loss": 0.0, + "step": 30512 + }, + { + "epoch": 12.408702724684831, + "grad_norm": 
0.0027499332165696767, + "learning_rate": 1.0831621702864559e-07, + "loss": 0.0, + "step": 30513 + }, + { + "epoch": 12.409109394062627, + "grad_norm": 0.03747829924721733, + "learning_rate": 1.0816754668345197e-07, + "loss": 0.0003, + "step": 30514 + }, + { + "epoch": 12.409516063440423, + "grad_norm": 0.07949260755248723, + "learning_rate": 1.0801897788289773e-07, + "loss": 0.0007, + "step": 30515 + }, + { + "epoch": 12.409922732818218, + "grad_norm": 0.004001040126257233, + "learning_rate": 1.0787051062850718e-07, + "loss": 0.0, + "step": 30516 + }, + { + "epoch": 12.410329402196014, + "grad_norm": 0.0021606541748751917, + "learning_rate": 1.0772214492180355e-07, + "loss": 0.0, + "step": 30517 + }, + { + "epoch": 12.41073607157381, + "grad_norm": 0.00010826951592078047, + "learning_rate": 1.0757388076431119e-07, + "loss": 0.0, + "step": 30518 + }, + { + "epoch": 12.411142740951606, + "grad_norm": 0.00037890055168378654, + "learning_rate": 1.074257181575511e-07, + "loss": 0.0, + "step": 30519 + }, + { + "epoch": 12.411549410329402, + "grad_norm": 0.005225754043554956, + "learning_rate": 1.0727765710304539e-07, + "loss": 0.0, + "step": 30520 + }, + { + "epoch": 12.411956079707197, + "grad_norm": 0.007229056474262002, + "learning_rate": 1.0712969760231395e-07, + "loss": 0.0, + "step": 30521 + }, + { + "epoch": 12.412362749084993, + "grad_norm": 0.008365175829820971, + "learning_rate": 1.0698183965687558e-07, + "loss": 0.0001, + "step": 30522 + }, + { + "epoch": 12.41276941846279, + "grad_norm": 0.0006044220536825853, + "learning_rate": 1.0683408326824574e-07, + "loss": 0.0, + "step": 30523 + }, + { + "epoch": 12.413176087840586, + "grad_norm": 0.00020844852641694185, + "learning_rate": 1.066864284379443e-07, + "loss": 0.0, + "step": 30524 + }, + { + "epoch": 12.413582757218382, + "grad_norm": 0.007732617433016517, + "learning_rate": 1.0653887516748562e-07, + "loss": 0.0001, + "step": 30525 + }, + { + "epoch": 12.413989426596178, + "grad_norm": 0.004273914994756085, + "learning_rate": 1.0639142345838516e-07, + "loss": 0.0001, + "step": 30526 + }, + { + "epoch": 12.414396095973974, + "grad_norm": 0.06729210421267921, + "learning_rate": 1.0624407331215613e-07, + "loss": 0.0006, + "step": 30527 + }, + { + "epoch": 12.41480276535177, + "grad_norm": 0.005969683602691644, + "learning_rate": 1.0609682473031069e-07, + "loss": 0.0, + "step": 30528 + }, + { + "epoch": 12.415209434729565, + "grad_norm": 0.013583935287818408, + "learning_rate": 1.0594967771435982e-07, + "loss": 0.0001, + "step": 30529 + }, + { + "epoch": 12.415616104107361, + "grad_norm": 0.14250502254004815, + "learning_rate": 1.0580263226581678e-07, + "loss": 0.0012, + "step": 30530 + }, + { + "epoch": 12.416022773485157, + "grad_norm": 6.610672660721823e-05, + "learning_rate": 1.0565568838618811e-07, + "loss": 0.0, + "step": 30531 + }, + { + "epoch": 12.416429442862952, + "grad_norm": 4.333351421417871e-05, + "learning_rate": 1.0550884607698486e-07, + "loss": 0.0, + "step": 30532 + }, + { + "epoch": 12.416836112240748, + "grad_norm": 0.05862569295654222, + "learning_rate": 1.0536210533971247e-07, + "loss": 0.0005, + "step": 30533 + }, + { + "epoch": 12.417242781618544, + "grad_norm": 0.0011375686345176104, + "learning_rate": 1.0521546617587753e-07, + "loss": 0.0, + "step": 30534 + }, + { + "epoch": 12.41764945099634, + "grad_norm": 0.012618230151704133, + "learning_rate": 1.0506892858698658e-07, + "loss": 0.0001, + "step": 30535 + }, + { + "epoch": 12.418056120374136, + "grad_norm": 0.020561780720333416, + "learning_rate": 
1.0492249257454179e-07, + "loss": 0.0002, + "step": 30536 + }, + { + "epoch": 12.418462789751931, + "grad_norm": 1.3360888701908151e-05, + "learning_rate": 1.047761581400486e-07, + "loss": 0.0, + "step": 30537 + }, + { + "epoch": 12.418869459129727, + "grad_norm": 0.01150502489518874, + "learning_rate": 1.0462992528500804e-07, + "loss": 0.0001, + "step": 30538 + }, + { + "epoch": 12.419276128507523, + "grad_norm": 0.001441340106140634, + "learning_rate": 1.0448379401092223e-07, + "loss": 0.0, + "step": 30539 + }, + { + "epoch": 12.419682797885319, + "grad_norm": 0.0005485107058194472, + "learning_rate": 1.0433776431928999e-07, + "loss": 0.0, + "step": 30540 + }, + { + "epoch": 12.420089467263114, + "grad_norm": 0.009311968614418511, + "learning_rate": 1.0419183621161122e-07, + "loss": 0.0001, + "step": 30541 + }, + { + "epoch": 12.42049613664091, + "grad_norm": 0.0035341292619089477, + "learning_rate": 1.040460096893825e-07, + "loss": 0.0, + "step": 30542 + }, + { + "epoch": 12.420902806018706, + "grad_norm": 0.0005117779086973119, + "learning_rate": 1.0390028475410374e-07, + "loss": 0.0, + "step": 30543 + }, + { + "epoch": 12.421309475396503, + "grad_norm": 0.018863400378688156, + "learning_rate": 1.037546614072682e-07, + "loss": 0.0001, + "step": 30544 + }, + { + "epoch": 12.4217161447743, + "grad_norm": 0.005452864247002631, + "learning_rate": 1.0360913965037245e-07, + "loss": 0.0, + "step": 30545 + }, + { + "epoch": 12.422122814152095, + "grad_norm": 0.021098475900380775, + "learning_rate": 1.0346371948490863e-07, + "loss": 0.0002, + "step": 30546 + }, + { + "epoch": 12.42252948352989, + "grad_norm": 0.014993908676784495, + "learning_rate": 1.033184009123711e-07, + "loss": 0.0001, + "step": 30547 + }, + { + "epoch": 12.422936152907686, + "grad_norm": 0.0013428565889511425, + "learning_rate": 1.031731839342509e-07, + "loss": 0.0, + "step": 30548 + }, + { + "epoch": 12.423342822285482, + "grad_norm": 0.014826162890300938, + "learning_rate": 1.0302806855203907e-07, + "loss": 0.0001, + "step": 30549 + }, + { + "epoch": 12.423749491663278, + "grad_norm": 0.002031925607477681, + "learning_rate": 1.0288305476722548e-07, + "loss": 0.0, + "step": 30550 + }, + { + "epoch": 12.424156161041074, + "grad_norm": 0.0009602375998900468, + "learning_rate": 1.0273814258129788e-07, + "loss": 0.0, + "step": 30551 + }, + { + "epoch": 12.42456283041887, + "grad_norm": 0.0023557928484766854, + "learning_rate": 1.0259333199574506e-07, + "loss": 0.0, + "step": 30552 + }, + { + "epoch": 12.424969499796665, + "grad_norm": 0.0042457856000189635, + "learning_rate": 1.024486230120525e-07, + "loss": 0.0, + "step": 30553 + }, + { + "epoch": 12.425376169174461, + "grad_norm": 0.009396536734538753, + "learning_rate": 1.0230401563170677e-07, + "loss": 0.0001, + "step": 30554 + }, + { + "epoch": 12.425782838552257, + "grad_norm": 0.005735008154236151, + "learning_rate": 1.0215950985619005e-07, + "loss": 0.0, + "step": 30555 + }, + { + "epoch": 12.426189507930053, + "grad_norm": 0.006233199234493037, + "learning_rate": 1.0201510568698892e-07, + "loss": 0.0, + "step": 30556 + }, + { + "epoch": 12.426596177307848, + "grad_norm": 0.0014153307695582706, + "learning_rate": 1.018708031255844e-07, + "loss": 0.0, + "step": 30557 + }, + { + "epoch": 12.427002846685644, + "grad_norm": 0.10898718031628608, + "learning_rate": 1.0172660217345753e-07, + "loss": 0.001, + "step": 30558 + }, + { + "epoch": 12.42740951606344, + "grad_norm": 0.03569768700954928, + "learning_rate": 1.0158250283208825e-07, + "loss": 0.0004, + "step": 30559 + 
}, + { + "epoch": 12.427816185441236, + "grad_norm": 0.0001306712557404645, + "learning_rate": 1.014385051029565e-07, + "loss": 0.0, + "step": 30560 + }, + { + "epoch": 12.428222854819031, + "grad_norm": 0.011697588278413188, + "learning_rate": 1.0129460898754106e-07, + "loss": 0.0001, + "step": 30561 + }, + { + "epoch": 12.428629524196827, + "grad_norm": 0.2473624593451748, + "learning_rate": 1.0115081448731634e-07, + "loss": 0.0026, + "step": 30562 + }, + { + "epoch": 12.429036193574623, + "grad_norm": 0.04551066212610465, + "learning_rate": 1.0100712160376225e-07, + "loss": 0.0003, + "step": 30563 + }, + { + "epoch": 12.42944286295242, + "grad_norm": 0.0116726398943797, + "learning_rate": 1.0086353033835206e-07, + "loss": 0.0, + "step": 30564 + }, + { + "epoch": 12.429849532330216, + "grad_norm": 2.3378906452722993e-05, + "learning_rate": 1.0072004069256014e-07, + "loss": 0.0, + "step": 30565 + }, + { + "epoch": 12.430256201708012, + "grad_norm": 0.003576086043717519, + "learning_rate": 1.0057665266785865e-07, + "loss": 0.0, + "step": 30566 + }, + { + "epoch": 12.430662871085808, + "grad_norm": 0.03504611702421173, + "learning_rate": 1.0043336626571976e-07, + "loss": 0.0003, + "step": 30567 + }, + { + "epoch": 12.431069540463604, + "grad_norm": 0.0010271091286521814, + "learning_rate": 1.0029018148761449e-07, + "loss": 0.0, + "step": 30568 + }, + { + "epoch": 12.4314762098414, + "grad_norm": 0.00014209189195425902, + "learning_rate": 1.0014709833501391e-07, + "loss": 0.0, + "step": 30569 + }, + { + "epoch": 12.431882879219195, + "grad_norm": 0.0011803329522244491, + "learning_rate": 1.000041168093857e-07, + "loss": 0.0, + "step": 30570 + }, + { + "epoch": 12.43228954859699, + "grad_norm": 0.13996336138798957, + "learning_rate": 9.986123691219762e-08, + "loss": 0.0008, + "step": 30571 + }, + { + "epoch": 12.432696217974787, + "grad_norm": 0.024395857040383474, + "learning_rate": 9.971845864491625e-08, + "loss": 0.0001, + "step": 30572 + }, + { + "epoch": 12.433102887352582, + "grad_norm": 0.004872472245352285, + "learning_rate": 9.957578200900819e-08, + "loss": 0.0, + "step": 30573 + }, + { + "epoch": 12.433509556730378, + "grad_norm": 0.0028410804867073933, + "learning_rate": 9.943320700593673e-08, + "loss": 0.0, + "step": 30574 + }, + { + "epoch": 12.433916226108174, + "grad_norm": 0.027123090723113225, + "learning_rate": 9.929073363716624e-08, + "loss": 0.0001, + "step": 30575 + }, + { + "epoch": 12.43432289548597, + "grad_norm": 0.039786678525746036, + "learning_rate": 9.914836190416e-08, + "loss": 0.0003, + "step": 30576 + }, + { + "epoch": 12.434729564863765, + "grad_norm": 0.011469800390340883, + "learning_rate": 9.900609180837905e-08, + "loss": 0.0001, + "step": 30577 + }, + { + "epoch": 12.435136234241561, + "grad_norm": 0.034486172204738014, + "learning_rate": 9.886392335128336e-08, + "loss": 0.0001, + "step": 30578 + }, + { + "epoch": 12.435542903619357, + "grad_norm": 0.00470755977865476, + "learning_rate": 9.872185653433286e-08, + "loss": 0.0, + "step": 30579 + }, + { + "epoch": 12.435949572997153, + "grad_norm": 0.002534138384299515, + "learning_rate": 9.857989135898527e-08, + "loss": 0.0, + "step": 30580 + }, + { + "epoch": 12.436356242374949, + "grad_norm": 0.0011022907365257715, + "learning_rate": 9.84380278266972e-08, + "loss": 0.0, + "step": 30581 + }, + { + "epoch": 12.436762911752744, + "grad_norm": 0.00024918448051859795, + "learning_rate": 9.829626593892749e-08, + "loss": 0.0, + "step": 30582 + }, + { + "epoch": 12.43716958113054, + "grad_norm": 
5.736181189403263e-05, + "learning_rate": 9.815460569713054e-08, + "loss": 0.0, + "step": 30583 + }, + { + "epoch": 12.437576250508336, + "grad_norm": 0.029847535405878055, + "learning_rate": 9.80130471027596e-08, + "loss": 0.0002, + "step": 30584 + }, + { + "epoch": 12.437982919886133, + "grad_norm": 5.0693821534275427e-05, + "learning_rate": 9.7871590157268e-08, + "loss": 0.0, + "step": 30585 + }, + { + "epoch": 12.43838958926393, + "grad_norm": 0.005647276512061899, + "learning_rate": 9.773023486210786e-08, + "loss": 0.0, + "step": 30586 + }, + { + "epoch": 12.438796258641725, + "grad_norm": 0.0012848755456007156, + "learning_rate": 9.75889812187314e-08, + "loss": 0.0, + "step": 30587 + }, + { + "epoch": 12.43920292801952, + "grad_norm": 0.01183580598695886, + "learning_rate": 9.744782922858631e-08, + "loss": 0.0001, + "step": 30588 + }, + { + "epoch": 12.439609597397316, + "grad_norm": 0.0008727790172497364, + "learning_rate": 9.730677889312479e-08, + "loss": 0.0, + "step": 30589 + }, + { + "epoch": 12.440016266775112, + "grad_norm": 0.02432487169182322, + "learning_rate": 9.716583021379234e-08, + "loss": 0.0002, + "step": 30590 + }, + { + "epoch": 12.440422936152908, + "grad_norm": 0.017485673708044734, + "learning_rate": 9.702498319203668e-08, + "loss": 0.0001, + "step": 30591 + }, + { + "epoch": 12.440829605530704, + "grad_norm": 0.0061645425370802365, + "learning_rate": 9.688423782930445e-08, + "loss": 0.0001, + "step": 30592 + }, + { + "epoch": 12.4412362749085, + "grad_norm": 0.01662663584074795, + "learning_rate": 9.674359412703893e-08, + "loss": 0.0002, + "step": 30593 + }, + { + "epoch": 12.441642944286295, + "grad_norm": 0.089033396995634, + "learning_rate": 9.660305208668342e-08, + "loss": 0.001, + "step": 30594 + }, + { + "epoch": 12.442049613664091, + "grad_norm": 0.00013750300323884477, + "learning_rate": 9.64626117096834e-08, + "loss": 0.0, + "step": 30595 + }, + { + "epoch": 12.442456283041887, + "grad_norm": 0.03474665411400114, + "learning_rate": 9.632227299747888e-08, + "loss": 0.0005, + "step": 30596 + }, + { + "epoch": 12.442862952419683, + "grad_norm": 0.00046096804503719324, + "learning_rate": 9.618203595151088e-08, + "loss": 0.0, + "step": 30597 + }, + { + "epoch": 12.443269621797478, + "grad_norm": 1.748554571679223e-05, + "learning_rate": 9.604190057321827e-08, + "loss": 0.0, + "step": 30598 + }, + { + "epoch": 12.443676291175274, + "grad_norm": 0.00674111245602569, + "learning_rate": 9.590186686403991e-08, + "loss": 0.0001, + "step": 30599 + }, + { + "epoch": 12.44408296055307, + "grad_norm": 1.2217514661588681e-05, + "learning_rate": 9.576193482541462e-08, + "loss": 0.0, + "step": 30600 + }, + { + "epoch": 12.444489629930866, + "grad_norm": 0.0013189215517564416, + "learning_rate": 9.562210445877573e-08, + "loss": 0.0, + "step": 30601 + }, + { + "epoch": 12.444896299308661, + "grad_norm": 0.0031556550097301727, + "learning_rate": 9.548237576556208e-08, + "loss": 0.0, + "step": 30602 + }, + { + "epoch": 12.445302968686457, + "grad_norm": 0.01286077476792878, + "learning_rate": 9.534274874720695e-08, + "loss": 0.0001, + "step": 30603 + }, + { + "epoch": 12.445709638064255, + "grad_norm": 0.01920345856551467, + "learning_rate": 9.520322340514254e-08, + "loss": 0.0001, + "step": 30604 + }, + { + "epoch": 12.44611630744205, + "grad_norm": 0.0010844139255962381, + "learning_rate": 9.506379974080326e-08, + "loss": 0.0, + "step": 30605 + }, + { + "epoch": 12.446522976819846, + "grad_norm": 0.09397737988172813, + "learning_rate": 9.492447775561797e-08, + "loss": 
0.0004, + "step": 30606 + }, + { + "epoch": 12.446929646197642, + "grad_norm": 0.00039499165095964873, + "learning_rate": 9.478525745101774e-08, + "loss": 0.0, + "step": 30607 + }, + { + "epoch": 12.447336315575438, + "grad_norm": 0.004998820334934031, + "learning_rate": 9.464613882843254e-08, + "loss": 0.0, + "step": 30608 + }, + { + "epoch": 12.447742984953234, + "grad_norm": 0.004604349140736922, + "learning_rate": 9.450712188929013e-08, + "loss": 0.0, + "step": 30609 + }, + { + "epoch": 12.44814965433103, + "grad_norm": 0.0008867339876640894, + "learning_rate": 9.436820663501823e-08, + "loss": 0.0, + "step": 30610 + }, + { + "epoch": 12.448556323708825, + "grad_norm": 5.023589307004153e-05, + "learning_rate": 9.422939306704127e-08, + "loss": 0.0, + "step": 30611 + }, + { + "epoch": 12.44896299308662, + "grad_norm": 0.0012630853710243913, + "learning_rate": 9.409068118678477e-08, + "loss": 0.0, + "step": 30612 + }, + { + "epoch": 12.449369662464417, + "grad_norm": 0.0004227112069542068, + "learning_rate": 9.395207099567316e-08, + "loss": 0.0, + "step": 30613 + }, + { + "epoch": 12.449776331842212, + "grad_norm": 0.006797841724806427, + "learning_rate": 9.381356249512974e-08, + "loss": 0.0001, + "step": 30614 + }, + { + "epoch": 12.450183001220008, + "grad_norm": 0.011185133074892896, + "learning_rate": 9.367515568657448e-08, + "loss": 0.0001, + "step": 30615 + }, + { + "epoch": 12.450589670597804, + "grad_norm": 0.060020449090082716, + "learning_rate": 9.35368505714307e-08, + "loss": 0.0007, + "step": 30616 + }, + { + "epoch": 12.4509963399756, + "grad_norm": 0.0007779932887080705, + "learning_rate": 9.339864715111723e-08, + "loss": 0.0, + "step": 30617 + }, + { + "epoch": 12.451403009353395, + "grad_norm": 0.8789895290418055, + "learning_rate": 9.326054542705188e-08, + "loss": 0.0082, + "step": 30618 + }, + { + "epoch": 12.451809678731191, + "grad_norm": 0.0022749446375594606, + "learning_rate": 9.312254540065235e-08, + "loss": 0.0, + "step": 30619 + }, + { + "epoch": 12.452216348108987, + "grad_norm": 0.005154600507415644, + "learning_rate": 9.298464707333643e-08, + "loss": 0.0, + "step": 30620 + }, + { + "epoch": 12.452623017486783, + "grad_norm": 0.00709074644666016, + "learning_rate": 9.284685044651965e-08, + "loss": 0.0001, + "step": 30621 + }, + { + "epoch": 12.453029686864578, + "grad_norm": 0.008020722882743967, + "learning_rate": 9.270915552161641e-08, + "loss": 0.0001, + "step": 30622 + }, + { + "epoch": 12.453436356242374, + "grad_norm": 0.0038053810084431707, + "learning_rate": 9.257156230003894e-08, + "loss": 0.0, + "step": 30623 + }, + { + "epoch": 12.45384302562017, + "grad_norm": 0.0015717449303336978, + "learning_rate": 9.243407078320165e-08, + "loss": 0.0, + "step": 30624 + }, + { + "epoch": 12.454249694997966, + "grad_norm": 0.026577791258033848, + "learning_rate": 9.229668097251454e-08, + "loss": 0.0003, + "step": 30625 + }, + { + "epoch": 12.454656364375763, + "grad_norm": 0.028712829553146735, + "learning_rate": 9.215939286938758e-08, + "loss": 0.0001, + "step": 30626 + }, + { + "epoch": 12.455063033753559, + "grad_norm": 0.0010588433031554296, + "learning_rate": 9.202220647523185e-08, + "loss": 0.0, + "step": 30627 + }, + { + "epoch": 12.455469703131355, + "grad_norm": 6.308919263069608e-05, + "learning_rate": 9.188512179145514e-08, + "loss": 0.0, + "step": 30628 + }, + { + "epoch": 12.45587637250915, + "grad_norm": 0.0005533432192778267, + "learning_rate": 9.17481388194641e-08, + "loss": 0.0, + "step": 30629 + }, + { + "epoch": 12.456283041886946, + "grad_norm": 
0.026115034598789704, + "learning_rate": 9.161125756066424e-08, + "loss": 0.0003, + "step": 30630 + }, + { + "epoch": 12.456689711264742, + "grad_norm": 0.015939942377670484, + "learning_rate": 9.147447801646226e-08, + "loss": 0.0002, + "step": 30631 + }, + { + "epoch": 12.457096380642538, + "grad_norm": 0.0006338717472598179, + "learning_rate": 9.133780018826143e-08, + "loss": 0.0, + "step": 30632 + }, + { + "epoch": 12.457503050020334, + "grad_norm": 0.0006443513399222087, + "learning_rate": 9.120122407746512e-08, + "loss": 0.0, + "step": 30633 + }, + { + "epoch": 12.45790971939813, + "grad_norm": 0.008984747240593087, + "learning_rate": 9.106474968547551e-08, + "loss": 0.0001, + "step": 30634 + }, + { + "epoch": 12.458316388775925, + "grad_norm": 0.004219225212826302, + "learning_rate": 9.092837701369261e-08, + "loss": 0.0, + "step": 30635 + }, + { + "epoch": 12.458723058153721, + "grad_norm": 0.009300861364877663, + "learning_rate": 9.079210606351752e-08, + "loss": 0.0001, + "step": 30636 + }, + { + "epoch": 12.459129727531517, + "grad_norm": 0.006206473659490576, + "learning_rate": 9.06559368363491e-08, + "loss": 0.0, + "step": 30637 + }, + { + "epoch": 12.459536396909312, + "grad_norm": 0.08059147791011331, + "learning_rate": 9.051986933358403e-08, + "loss": 0.0005, + "step": 30638 + }, + { + "epoch": 12.459943066287108, + "grad_norm": 0.019356459173239655, + "learning_rate": 9.038390355662008e-08, + "loss": 0.0002, + "step": 30639 + }, + { + "epoch": 12.460349735664904, + "grad_norm": 0.0013273896960493477, + "learning_rate": 9.024803950685279e-08, + "loss": 0.0, + "step": 30640 + }, + { + "epoch": 12.4607564050427, + "grad_norm": 0.02927039464139566, + "learning_rate": 9.011227718567662e-08, + "loss": 0.0002, + "step": 30641 + }, + { + "epoch": 12.461163074420496, + "grad_norm": 0.006663655772832602, + "learning_rate": 8.997661659448598e-08, + "loss": 0.0001, + "step": 30642 + }, + { + "epoch": 12.461569743798291, + "grad_norm": 0.0028042015716481764, + "learning_rate": 8.984105773467311e-08, + "loss": 0.0, + "step": 30643 + }, + { + "epoch": 12.461976413176087, + "grad_norm": 0.0009395737310127212, + "learning_rate": 8.970560060762912e-08, + "loss": 0.0, + "step": 30644 + }, + { + "epoch": 12.462383082553885, + "grad_norm": 0.017582032615895773, + "learning_rate": 8.9570245214744e-08, + "loss": 0.0001, + "step": 30645 + }, + { + "epoch": 12.46278975193168, + "grad_norm": 0.0020010773229130582, + "learning_rate": 8.943499155740886e-08, + "loss": 0.0, + "step": 30646 + }, + { + "epoch": 12.463196421309476, + "grad_norm": 1.7396591821721313e-05, + "learning_rate": 8.92998396370115e-08, + "loss": 0.0, + "step": 30647 + }, + { + "epoch": 12.463603090687272, + "grad_norm": 0.0032561116851857067, + "learning_rate": 8.916478945493967e-08, + "loss": 0.0, + "step": 30648 + }, + { + "epoch": 12.464009760065068, + "grad_norm": 0.00011190861351890625, + "learning_rate": 8.902984101257894e-08, + "loss": 0.0, + "step": 30649 + }, + { + "epoch": 12.464416429442863, + "grad_norm": 0.0006947815163371099, + "learning_rate": 8.889499431131487e-08, + "loss": 0.0, + "step": 30650 + }, + { + "epoch": 12.46482309882066, + "grad_norm": 0.0013645220218534168, + "learning_rate": 8.876024935253191e-08, + "loss": 0.0, + "step": 30651 + }, + { + "epoch": 12.465229768198455, + "grad_norm": 0.0002611333749314421, + "learning_rate": 8.862560613761228e-08, + "loss": 0.0, + "step": 30652 + }, + { + "epoch": 12.46563643757625, + "grad_norm": 0.04610197542094903, + "learning_rate": 8.849106466794045e-08, + "loss": 
0.0003, + "step": 30653 + }, + { + "epoch": 12.466043106954046, + "grad_norm": 0.0003693431703182055, + "learning_rate": 8.835662494489638e-08, + "loss": 0.0, + "step": 30654 + }, + { + "epoch": 12.466449776331842, + "grad_norm": 3.9291123945992095e-05, + "learning_rate": 8.822228696985901e-08, + "loss": 0.0, + "step": 30655 + }, + { + "epoch": 12.466856445709638, + "grad_norm": 0.00021210966201857728, + "learning_rate": 8.808805074420835e-08, + "loss": 0.0, + "step": 30656 + }, + { + "epoch": 12.467263115087434, + "grad_norm": 0.0017395049560505287, + "learning_rate": 8.795391626932325e-08, + "loss": 0.0, + "step": 30657 + }, + { + "epoch": 12.46766978446523, + "grad_norm": 0.016655012076271323, + "learning_rate": 8.781988354657823e-08, + "loss": 0.0002, + "step": 30658 + }, + { + "epoch": 12.468076453843025, + "grad_norm": 0.05687794544871106, + "learning_rate": 8.768595257735213e-08, + "loss": 0.0004, + "step": 30659 + }, + { + "epoch": 12.468483123220821, + "grad_norm": 0.11182044151249783, + "learning_rate": 8.755212336301832e-08, + "loss": 0.0009, + "step": 30660 + }, + { + "epoch": 12.468889792598617, + "grad_norm": 0.004041500709436346, + "learning_rate": 8.741839590495127e-08, + "loss": 0.0, + "step": 30661 + }, + { + "epoch": 12.469296461976413, + "grad_norm": 0.07561071833346095, + "learning_rate": 8.728477020452319e-08, + "loss": 0.0005, + "step": 30662 + }, + { + "epoch": 12.469703131354208, + "grad_norm": 0.0008440726046193936, + "learning_rate": 8.715124626310523e-08, + "loss": 0.0, + "step": 30663 + }, + { + "epoch": 12.470109800732004, + "grad_norm": 0.0005551970589146216, + "learning_rate": 8.70178240820685e-08, + "loss": 0.0, + "step": 30664 + }, + { + "epoch": 12.4705164701098, + "grad_norm": 0.0010592731477864828, + "learning_rate": 8.6884503662783e-08, + "loss": 0.0, + "step": 30665 + }, + { + "epoch": 12.470923139487596, + "grad_norm": 0.046036151639635536, + "learning_rate": 8.675128500661767e-08, + "loss": 0.0003, + "step": 30666 + }, + { + "epoch": 12.471329808865393, + "grad_norm": 0.00040867364690764173, + "learning_rate": 8.661816811494028e-08, + "loss": 0.0, + "step": 30667 + }, + { + "epoch": 12.471736478243189, + "grad_norm": 0.001985841084844699, + "learning_rate": 8.64851529891153e-08, + "loss": 0.0, + "step": 30668 + }, + { + "epoch": 12.472143147620985, + "grad_norm": 0.00612686595564104, + "learning_rate": 8.635223963051053e-08, + "loss": 0.0, + "step": 30669 + }, + { + "epoch": 12.47254981699878, + "grad_norm": 0.05601665334019495, + "learning_rate": 8.621942804048933e-08, + "loss": 0.0005, + "step": 30670 + }, + { + "epoch": 12.472956486376576, + "grad_norm": 0.9612005028952266, + "learning_rate": 8.608671822041504e-08, + "loss": 0.0087, + "step": 30671 + }, + { + "epoch": 12.473363155754372, + "grad_norm": 0.021357572814832266, + "learning_rate": 8.59541101716499e-08, + "loss": 0.0002, + "step": 30672 + }, + { + "epoch": 12.473769825132168, + "grad_norm": 0.006294233285883133, + "learning_rate": 8.582160389555616e-08, + "loss": 0.0, + "step": 30673 + }, + { + "epoch": 12.474176494509964, + "grad_norm": 0.006657081000740635, + "learning_rate": 8.568919939349274e-08, + "loss": 0.0, + "step": 30674 + }, + { + "epoch": 12.47458316388776, + "grad_norm": 0.0003874059130639663, + "learning_rate": 8.555689666681966e-08, + "loss": 0.0, + "step": 30675 + }, + { + "epoch": 12.474989833265555, + "grad_norm": 0.002324960178826348, + "learning_rate": 8.542469571689583e-08, + "loss": 0.0, + "step": 30676 + }, + { + "epoch": 12.47539650264335, + "grad_norm": 
0.0018579208194521367, + "learning_rate": 8.52925965450757e-08, + "loss": 0.0, + "step": 30677 + }, + { + "epoch": 12.475803172021147, + "grad_norm": 0.004412585709332507, + "learning_rate": 8.516059915271823e-08, + "loss": 0.0, + "step": 30678 + }, + { + "epoch": 12.476209841398942, + "grad_norm": 0.022898175850516494, + "learning_rate": 8.502870354117676e-08, + "loss": 0.0002, + "step": 30679 + }, + { + "epoch": 12.476616510776738, + "grad_norm": 0.0018175243100510125, + "learning_rate": 8.489690971180575e-08, + "loss": 0.0, + "step": 30680 + }, + { + "epoch": 12.477023180154534, + "grad_norm": 0.008534122084223938, + "learning_rate": 8.476521766595858e-08, + "loss": 0.0001, + "step": 30681 + }, + { + "epoch": 12.47742984953233, + "grad_norm": 0.00699103855612931, + "learning_rate": 8.463362740498637e-08, + "loss": 0.0001, + "step": 30682 + }, + { + "epoch": 12.477836518910125, + "grad_norm": 4.569498404978728e-05, + "learning_rate": 8.450213893024028e-08, + "loss": 0.0, + "step": 30683 + }, + { + "epoch": 12.478243188287921, + "grad_norm": 0.00011538648454281918, + "learning_rate": 8.437075224306923e-08, + "loss": 0.0, + "step": 30684 + }, + { + "epoch": 12.478649857665717, + "grad_norm": 0.13496605806223394, + "learning_rate": 8.423946734482324e-08, + "loss": 0.0012, + "step": 30685 + }, + { + "epoch": 12.479056527043515, + "grad_norm": 0.000817043925102476, + "learning_rate": 8.410828423685013e-08, + "loss": 0.0, + "step": 30686 + }, + { + "epoch": 12.47946319642131, + "grad_norm": 0.02036179992464678, + "learning_rate": 8.397720292049661e-08, + "loss": 0.0001, + "step": 30687 + }, + { + "epoch": 12.479869865799106, + "grad_norm": 0.0011358360904530364, + "learning_rate": 8.384622339710713e-08, + "loss": 0.0, + "step": 30688 + }, + { + "epoch": 12.480276535176902, + "grad_norm": 0.021676593847630918, + "learning_rate": 8.371534566802619e-08, + "loss": 0.0002, + "step": 30689 + }, + { + "epoch": 12.480683204554698, + "grad_norm": 0.006602551782793865, + "learning_rate": 8.358456973459938e-08, + "loss": 0.0, + "step": 30690 + }, + { + "epoch": 12.481089873932493, + "grad_norm": 0.00042484863086008555, + "learning_rate": 8.345389559816563e-08, + "loss": 0.0, + "step": 30691 + }, + { + "epoch": 12.48149654331029, + "grad_norm": 0.009179641360219313, + "learning_rate": 8.332332326007053e-08, + "loss": 0.0001, + "step": 30692 + }, + { + "epoch": 12.481903212688085, + "grad_norm": 0.0008958707407270679, + "learning_rate": 8.319285272165189e-08, + "loss": 0.0, + "step": 30693 + }, + { + "epoch": 12.48230988206588, + "grad_norm": 0.0502887802461709, + "learning_rate": 8.306248398424977e-08, + "loss": 0.0004, + "step": 30694 + }, + { + "epoch": 12.482716551443676, + "grad_norm": 0.02059215174173842, + "learning_rate": 8.293221704920196e-08, + "loss": 0.0001, + "step": 30695 + }, + { + "epoch": 12.483123220821472, + "grad_norm": 0.0021817117710214036, + "learning_rate": 8.28020519178463e-08, + "loss": 0.0, + "step": 30696 + }, + { + "epoch": 12.483529890199268, + "grad_norm": 0.0015723101084089395, + "learning_rate": 8.267198859151837e-08, + "loss": 0.0, + "step": 30697 + }, + { + "epoch": 12.483936559577064, + "grad_norm": 0.018729994113714184, + "learning_rate": 8.254202707155379e-08, + "loss": 0.0002, + "step": 30698 + }, + { + "epoch": 12.48434322895486, + "grad_norm": 0.0023088364079661004, + "learning_rate": 8.241216735928703e-08, + "loss": 0.0, + "step": 30699 + }, + { + "epoch": 12.484749898332655, + "grad_norm": 0.004210648057619887, + "learning_rate": 8.228240945605148e-08, + "loss": 
0.0, + "step": 30700 + }, + { + "epoch": 12.485156567710451, + "grad_norm": 0.0006954875787359804, + "learning_rate": 8.215275336317829e-08, + "loss": 0.0, + "step": 30701 + }, + { + "epoch": 12.485563237088247, + "grad_norm": 0.022893977738163377, + "learning_rate": 8.202319908199751e-08, + "loss": 0.0001, + "step": 30702 + }, + { + "epoch": 12.485969906466043, + "grad_norm": 0.06641275360012154, + "learning_rate": 8.189374661384141e-08, + "loss": 0.0006, + "step": 30703 + }, + { + "epoch": 12.486376575843838, + "grad_norm": 0.15931699237379393, + "learning_rate": 8.176439596003672e-08, + "loss": 0.0013, + "step": 30704 + }, + { + "epoch": 12.486783245221634, + "grad_norm": 0.000817599484558573, + "learning_rate": 8.163514712191233e-08, + "loss": 0.0, + "step": 30705 + }, + { + "epoch": 12.48718991459943, + "grad_norm": 0.2637885071393824, + "learning_rate": 8.150600010079502e-08, + "loss": 0.002, + "step": 30706 + }, + { + "epoch": 12.487596583977226, + "grad_norm": 0.0018104652176476395, + "learning_rate": 8.137695489801145e-08, + "loss": 0.0, + "step": 30707 + }, + { + "epoch": 12.488003253355023, + "grad_norm": 0.0008218937193775607, + "learning_rate": 8.124801151488505e-08, + "loss": 0.0, + "step": 30708 + }, + { + "epoch": 12.488409922732819, + "grad_norm": 0.07502171525182902, + "learning_rate": 8.111916995273917e-08, + "loss": 0.0008, + "step": 30709 + }, + { + "epoch": 12.488816592110615, + "grad_norm": 0.5000372923594906, + "learning_rate": 8.09904302128972e-08, + "loss": 0.0047, + "step": 30710 + }, + { + "epoch": 12.48922326148841, + "grad_norm": 0.00042230463857261463, + "learning_rate": 8.086179229668034e-08, + "loss": 0.0, + "step": 30711 + }, + { + "epoch": 12.489629930866206, + "grad_norm": 0.009724219074457795, + "learning_rate": 8.073325620540973e-08, + "loss": 0.0001, + "step": 30712 + }, + { + "epoch": 12.490036600244002, + "grad_norm": 0.006282351397265113, + "learning_rate": 8.060482194040431e-08, + "loss": 0.0, + "step": 30713 + }, + { + "epoch": 12.490443269621798, + "grad_norm": 0.0033840735968009897, + "learning_rate": 8.047648950298304e-08, + "loss": 0.0, + "step": 30714 + }, + { + "epoch": 12.490849938999594, + "grad_norm": 0.0010145917699940458, + "learning_rate": 8.034825889446374e-08, + "loss": 0.0, + "step": 30715 + }, + { + "epoch": 12.49125660837739, + "grad_norm": 0.005877842202066237, + "learning_rate": 8.022013011616092e-08, + "loss": 0.0001, + "step": 30716 + }, + { + "epoch": 12.491663277755185, + "grad_norm": 7.932085151089816e-06, + "learning_rate": 8.00921031693913e-08, + "loss": 0.0, + "step": 30717 + }, + { + "epoch": 12.49206994713298, + "grad_norm": 0.0005660490175633106, + "learning_rate": 7.99641780554683e-08, + "loss": 0.0, + "step": 30718 + }, + { + "epoch": 12.492476616510777, + "grad_norm": 0.0042858886795930275, + "learning_rate": 7.98363547757064e-08, + "loss": 0.0, + "step": 30719 + }, + { + "epoch": 12.492883285888572, + "grad_norm": 0.0063434554520551654, + "learning_rate": 7.970863333141676e-08, + "loss": 0.0001, + "step": 30720 + }, + { + "epoch": 12.493289955266368, + "grad_norm": 0.02384080905603996, + "learning_rate": 7.958101372391169e-08, + "loss": 0.0003, + "step": 30721 + }, + { + "epoch": 12.493696624644164, + "grad_norm": 0.00684481083064829, + "learning_rate": 7.945349595449902e-08, + "loss": 0.0001, + "step": 30722 + }, + { + "epoch": 12.49410329402196, + "grad_norm": 0.0011575575450259062, + "learning_rate": 7.932608002448994e-08, + "loss": 0.0, + "step": 30723 + }, + { + "epoch": 12.494509963399755, + "grad_norm": 
0.00028074372070122794, + "learning_rate": 7.919876593519116e-08, + "loss": 0.0, + "step": 30724 + }, + { + "epoch": 12.494916632777551, + "grad_norm": 0.026496934675746826, + "learning_rate": 7.907155368791053e-08, + "loss": 0.0002, + "step": 30725 + }, + { + "epoch": 12.495323302155347, + "grad_norm": 0.06474069085847205, + "learning_rate": 7.894444328395256e-08, + "loss": 0.0008, + "step": 30726 + }, + { + "epoch": 12.495729971533144, + "grad_norm": 0.0015524835495368103, + "learning_rate": 7.8817434724624e-08, + "loss": 0.0, + "step": 30727 + }, + { + "epoch": 12.49613664091094, + "grad_norm": 4.1964900771932406e-05, + "learning_rate": 7.869052801122711e-08, + "loss": 0.0, + "step": 30728 + }, + { + "epoch": 12.496543310288736, + "grad_norm": 0.032711239922166346, + "learning_rate": 7.856372314506533e-08, + "loss": 0.0003, + "step": 30729 + }, + { + "epoch": 12.496949979666532, + "grad_norm": 0.0031610913132669428, + "learning_rate": 7.843702012744092e-08, + "loss": 0.0, + "step": 30730 + }, + { + "epoch": 12.497356649044328, + "grad_norm": 5.189675278791354e-05, + "learning_rate": 7.831041895965286e-08, + "loss": 0.0, + "step": 30731 + }, + { + "epoch": 12.497763318422123, + "grad_norm": 3.217578513927157e-05, + "learning_rate": 7.818391964300232e-08, + "loss": 0.0, + "step": 30732 + }, + { + "epoch": 12.498169987799919, + "grad_norm": 0.00013997197242604468, + "learning_rate": 7.805752217878826e-08, + "loss": 0.0, + "step": 30733 + }, + { + "epoch": 12.498576657177715, + "grad_norm": 0.0005059956523755912, + "learning_rate": 7.793122656830631e-08, + "loss": 0.0, + "step": 30734 + }, + { + "epoch": 12.49898332655551, + "grad_norm": 0.00436186418751614, + "learning_rate": 7.780503281285323e-08, + "loss": 0.0, + "step": 30735 + }, + { + "epoch": 12.499389995933306, + "grad_norm": 0.0006980584588300765, + "learning_rate": 7.767894091372685e-08, + "loss": 0.0, + "step": 30736 + }, + { + "epoch": 12.499796665311102, + "grad_norm": 4.461609026675725e-05, + "learning_rate": 7.755295087221948e-08, + "loss": 0.0, + "step": 30737 + }, + { + "epoch": 12.500203334688898, + "grad_norm": 0.13474716013337956, + "learning_rate": 7.742706268962452e-08, + "loss": 0.0019, + "step": 30738 + }, + { + "epoch": 12.500610004066694, + "grad_norm": 0.0012915725655154172, + "learning_rate": 7.730127636723539e-08, + "loss": 0.0, + "step": 30739 + }, + { + "epoch": 12.50101667344449, + "grad_norm": 0.014789816125254799, + "learning_rate": 7.717559190634217e-08, + "loss": 0.0001, + "step": 30740 + }, + { + "epoch": 12.501423342822285, + "grad_norm": 0.0007335385011581962, + "learning_rate": 7.705000930823603e-08, + "loss": 0.0, + "step": 30741 + }, + { + "epoch": 12.501830012200081, + "grad_norm": 0.002117348453160739, + "learning_rate": 7.692452857420484e-08, + "loss": 0.0, + "step": 30742 + }, + { + "epoch": 12.502236681577877, + "grad_norm": 0.0037315627835412383, + "learning_rate": 7.679914970553759e-08, + "loss": 0.0, + "step": 30743 + }, + { + "epoch": 12.502643350955672, + "grad_norm": 0.0027843381748697063, + "learning_rate": 7.667387270352211e-08, + "loss": 0.0, + "step": 30744 + }, + { + "epoch": 12.503050020333468, + "grad_norm": 0.003773257426548542, + "learning_rate": 7.654869756944294e-08, + "loss": 0.0, + "step": 30745 + }, + { + "epoch": 12.503456689711264, + "grad_norm": 0.0011900783826795463, + "learning_rate": 7.642362430458572e-08, + "loss": 0.0, + "step": 30746 + }, + { + "epoch": 12.50386335908906, + "grad_norm": 7.444213975840045e-05, + "learning_rate": 7.629865291023497e-08, + "loss": 0.0, 
+ "step": 30747 + }, + { + "epoch": 12.504270028466856, + "grad_norm": 0.0003864210906175111, + "learning_rate": 7.617378338767189e-08, + "loss": 0.0, + "step": 30748 + }, + { + "epoch": 12.504676697844653, + "grad_norm": 0.00010959212676125834, + "learning_rate": 7.604901573818102e-08, + "loss": 0.0, + "step": 30749 + }, + { + "epoch": 12.505083367222449, + "grad_norm": 0.013376052198609788, + "learning_rate": 7.592434996304132e-08, + "loss": 0.0001, + "step": 30750 + }, + { + "epoch": 12.505490036600245, + "grad_norm": 0.00659512725381742, + "learning_rate": 7.579978606353178e-08, + "loss": 0.0, + "step": 30751 + }, + { + "epoch": 12.50589670597804, + "grad_norm": 0.01032512282129539, + "learning_rate": 7.567532404093359e-08, + "loss": 0.0001, + "step": 30752 + }, + { + "epoch": 12.506303375355836, + "grad_norm": 0.0007591178668125089, + "learning_rate": 7.555096389652239e-08, + "loss": 0.0, + "step": 30753 + }, + { + "epoch": 12.506710044733632, + "grad_norm": 0.006466234710996646, + "learning_rate": 7.542670563157494e-08, + "loss": 0.0001, + "step": 30754 + }, + { + "epoch": 12.507116714111428, + "grad_norm": 0.00019586455104530188, + "learning_rate": 7.530254924736691e-08, + "loss": 0.0, + "step": 30755 + }, + { + "epoch": 12.507523383489223, + "grad_norm": 0.0048651124272435795, + "learning_rate": 7.517849474517502e-08, + "loss": 0.0, + "step": 30756 + }, + { + "epoch": 12.50793005286702, + "grad_norm": 0.022017292185419812, + "learning_rate": 7.50545421262694e-08, + "loss": 0.0002, + "step": 30757 + }, + { + "epoch": 12.508336722244815, + "grad_norm": 0.0004813670402735711, + "learning_rate": 7.493069139192454e-08, + "loss": 0.0, + "step": 30758 + }, + { + "epoch": 12.50874339162261, + "grad_norm": 0.0036158829579616535, + "learning_rate": 7.48069425434117e-08, + "loss": 0.0, + "step": 30759 + }, + { + "epoch": 12.509150061000406, + "grad_norm": 0.036070295862762856, + "learning_rate": 7.468329558199982e-08, + "loss": 0.0003, + "step": 30760 + }, + { + "epoch": 12.509556730378202, + "grad_norm": 0.0002463595922658677, + "learning_rate": 7.455975050896014e-08, + "loss": 0.0, + "step": 30761 + }, + { + "epoch": 12.509963399755998, + "grad_norm": 0.06416138635486372, + "learning_rate": 7.44363073255594e-08, + "loss": 0.0006, + "step": 30762 + }, + { + "epoch": 12.510370069133794, + "grad_norm": 0.01164427067129198, + "learning_rate": 7.431296603306549e-08, + "loss": 0.0001, + "step": 30763 + }, + { + "epoch": 12.51077673851159, + "grad_norm": 0.00016878328479511863, + "learning_rate": 7.418972663274516e-08, + "loss": 0.0, + "step": 30764 + }, + { + "epoch": 12.511183407889385, + "grad_norm": 0.0005522657141404112, + "learning_rate": 7.406658912586296e-08, + "loss": 0.0, + "step": 30765 + }, + { + "epoch": 12.511590077267181, + "grad_norm": 0.012862080036197428, + "learning_rate": 7.394355351368121e-08, + "loss": 0.0001, + "step": 30766 + }, + { + "epoch": 12.511996746644977, + "grad_norm": 0.00016793241667060747, + "learning_rate": 7.382061979746669e-08, + "loss": 0.0, + "step": 30767 + }, + { + "epoch": 12.512403416022774, + "grad_norm": 0.0004844619470026668, + "learning_rate": 7.369778797847726e-08, + "loss": 0.0, + "step": 30768 + }, + { + "epoch": 12.51281008540057, + "grad_norm": 4.2380413666199294e-05, + "learning_rate": 7.357505805797638e-08, + "loss": 0.0, + "step": 30769 + }, + { + "epoch": 12.513216754778366, + "grad_norm": 0.0006576378706269129, + "learning_rate": 7.345243003722413e-08, + "loss": 0.0, + "step": 30770 + }, + { + "epoch": 12.513623424156162, + "grad_norm": 
0.007870132624115019, + "learning_rate": 7.33299039174784e-08, + "loss": 0.0001, + "step": 30771 + }, + { + "epoch": 12.514030093533957, + "grad_norm": 0.017338608804422832, + "learning_rate": 7.320747969999709e-08, + "loss": 0.0001, + "step": 30772 + }, + { + "epoch": 12.514436762911753, + "grad_norm": 0.004848774696787552, + "learning_rate": 7.308515738603694e-08, + "loss": 0.0, + "step": 30773 + }, + { + "epoch": 12.514843432289549, + "grad_norm": 0.00016833274403232077, + "learning_rate": 7.296293697685363e-08, + "loss": 0.0, + "step": 30774 + }, + { + "epoch": 12.515250101667345, + "grad_norm": 0.00022788189143142692, + "learning_rate": 7.284081847370172e-08, + "loss": 0.0, + "step": 30775 + }, + { + "epoch": 12.51565677104514, + "grad_norm": 0.03268423660304281, + "learning_rate": 7.271880187783576e-08, + "loss": 0.0002, + "step": 30776 + }, + { + "epoch": 12.516063440422936, + "grad_norm": 0.0002624418502562234, + "learning_rate": 7.259688719050695e-08, + "loss": 0.0, + "step": 30777 + }, + { + "epoch": 12.516470109800732, + "grad_norm": 0.011816454336621846, + "learning_rate": 7.247507441296875e-08, + "loss": 0.0001, + "step": 30778 + }, + { + "epoch": 12.516876779178528, + "grad_norm": 0.07312546524629408, + "learning_rate": 7.235336354646904e-08, + "loss": 0.0005, + "step": 30779 + }, + { + "epoch": 12.517283448556324, + "grad_norm": 0.020726524448300143, + "learning_rate": 7.223175459225906e-08, + "loss": 0.0002, + "step": 30780 + }, + { + "epoch": 12.51769011793412, + "grad_norm": 0.024561726293170157, + "learning_rate": 7.211024755158558e-08, + "loss": 0.0001, + "step": 30781 + }, + { + "epoch": 12.518096787311915, + "grad_norm": 0.0040162272730515135, + "learning_rate": 7.19888424256987e-08, + "loss": 0.0, + "step": 30782 + }, + { + "epoch": 12.51850345668971, + "grad_norm": 0.046099204382894755, + "learning_rate": 7.186753921584189e-08, + "loss": 0.0006, + "step": 30783 + }, + { + "epoch": 12.518910126067507, + "grad_norm": 0.024498028031533448, + "learning_rate": 7.17463379232619e-08, + "loss": 0.0002, + "step": 30784 + }, + { + "epoch": 12.519316795445302, + "grad_norm": 0.015116088591114811, + "learning_rate": 7.162523854920222e-08, + "loss": 0.0001, + "step": 30785 + }, + { + "epoch": 12.519723464823098, + "grad_norm": 0.03593345632803601, + "learning_rate": 7.150424109490739e-08, + "loss": 0.0003, + "step": 30786 + }, + { + "epoch": 12.520130134200894, + "grad_norm": 8.260215028083099e-05, + "learning_rate": 7.138334556161641e-08, + "loss": 0.0, + "step": 30787 + }, + { + "epoch": 12.52053680357869, + "grad_norm": 0.0009099346370631669, + "learning_rate": 7.126255195057385e-08, + "loss": 0.0, + "step": 30788 + }, + { + "epoch": 12.520943472956485, + "grad_norm": 0.0012536048166075524, + "learning_rate": 7.114186026301761e-08, + "loss": 0.0, + "step": 30789 + }, + { + "epoch": 12.521350142334283, + "grad_norm": 0.003860053788320971, + "learning_rate": 7.102127050018782e-08, + "loss": 0.0, + "step": 30790 + }, + { + "epoch": 12.521756811712079, + "grad_norm": 0.007713283111887545, + "learning_rate": 7.090078266332234e-08, + "loss": 0.0, + "step": 30791 + }, + { + "epoch": 12.522163481089875, + "grad_norm": 0.015911701034088056, + "learning_rate": 7.078039675365689e-08, + "loss": 0.0001, + "step": 30792 + }, + { + "epoch": 12.52257015046767, + "grad_norm": 0.0011973406353919453, + "learning_rate": 7.066011277242713e-08, + "loss": 0.0, + "step": 30793 + }, + { + "epoch": 12.522976819845466, + "grad_norm": 8.26763251885644e-05, + "learning_rate": 7.053993072086873e-08, + 
"loss": 0.0, + "step": 30794 + }, + { + "epoch": 12.523383489223262, + "grad_norm": 0.0011254504848754686, + "learning_rate": 7.041985060021627e-08, + "loss": 0.0, + "step": 30795 + }, + { + "epoch": 12.523790158601058, + "grad_norm": 0.032387607197981436, + "learning_rate": 7.0299872411701e-08, + "loss": 0.0002, + "step": 30796 + }, + { + "epoch": 12.524196827978853, + "grad_norm": 0.01218873391630933, + "learning_rate": 7.017999615655635e-08, + "loss": 0.0002, + "step": 30797 + }, + { + "epoch": 12.52460349735665, + "grad_norm": 0.00036529763623847043, + "learning_rate": 7.006022183601024e-08, + "loss": 0.0, + "step": 30798 + }, + { + "epoch": 12.525010166734445, + "grad_norm": 0.01128910250936866, + "learning_rate": 6.994054945129391e-08, + "loss": 0.0001, + "step": 30799 + }, + { + "epoch": 12.52541683611224, + "grad_norm": 0.022544164408442705, + "learning_rate": 6.982097900363527e-08, + "loss": 0.0002, + "step": 30800 + }, + { + "epoch": 12.525823505490036, + "grad_norm": 0.0008283998066892571, + "learning_rate": 6.970151049426222e-08, + "loss": 0.0, + "step": 30801 + }, + { + "epoch": 12.526230174867832, + "grad_norm": 0.0006954244191327337, + "learning_rate": 6.958214392440043e-08, + "loss": 0.0, + "step": 30802 + }, + { + "epoch": 12.526636844245628, + "grad_norm": 0.026043922996776575, + "learning_rate": 6.946287929527673e-08, + "loss": 0.0001, + "step": 30803 + }, + { + "epoch": 12.527043513623424, + "grad_norm": 0.007046818503319045, + "learning_rate": 6.934371660811456e-08, + "loss": 0.0001, + "step": 30804 + }, + { + "epoch": 12.52745018300122, + "grad_norm": 0.0002223260778716275, + "learning_rate": 6.92246558641363e-08, + "loss": 0.0, + "step": 30805 + }, + { + "epoch": 12.527856852379015, + "grad_norm": 0.01212691902275226, + "learning_rate": 6.910569706456537e-08, + "loss": 0.0001, + "step": 30806 + }, + { + "epoch": 12.528263521756811, + "grad_norm": 0.03471464944173434, + "learning_rate": 6.898684021062308e-08, + "loss": 0.0003, + "step": 30807 + }, + { + "epoch": 12.528670191134607, + "grad_norm": 0.005966640125343559, + "learning_rate": 6.88680853035284e-08, + "loss": 0.0, + "step": 30808 + }, + { + "epoch": 12.529076860512404, + "grad_norm": 0.11109414019835007, + "learning_rate": 6.874943234450149e-08, + "loss": 0.0011, + "step": 30809 + }, + { + "epoch": 12.5294835298902, + "grad_norm": 0.043492357632300686, + "learning_rate": 6.863088133475915e-08, + "loss": 0.0004, + "step": 30810 + }, + { + "epoch": 12.529890199267996, + "grad_norm": 0.2838823511463301, + "learning_rate": 6.85124322755204e-08, + "loss": 0.0026, + "step": 30811 + }, + { + "epoch": 12.530296868645792, + "grad_norm": 0.004481176515754579, + "learning_rate": 6.839408516799873e-08, + "loss": 0.0, + "step": 30812 + }, + { + "epoch": 12.530703538023587, + "grad_norm": 0.001409335490824121, + "learning_rate": 6.827584001341092e-08, + "loss": 0.0, + "step": 30813 + }, + { + "epoch": 12.531110207401383, + "grad_norm": 0.021910988431224823, + "learning_rate": 6.815769681297046e-08, + "loss": 0.0002, + "step": 30814 + }, + { + "epoch": 12.531516876779179, + "grad_norm": 0.004570737748766555, + "learning_rate": 6.80396555678886e-08, + "loss": 0.0, + "step": 30815 + }, + { + "epoch": 12.531923546156975, + "grad_norm": 0.057987117753923796, + "learning_rate": 6.792171627937994e-08, + "loss": 0.0005, + "step": 30816 + }, + { + "epoch": 12.53233021553477, + "grad_norm": 0.018815195073715, + "learning_rate": 6.780387894865237e-08, + "loss": 0.0002, + "step": 30817 + }, + { + "epoch": 12.532736884912566, + 
"grad_norm": 0.0017643000195539807, + "learning_rate": 6.768614357691716e-08, + "loss": 0.0, + "step": 30818 + }, + { + "epoch": 12.533143554290362, + "grad_norm": 0.14795075078307138, + "learning_rate": 6.756851016538224e-08, + "loss": 0.0004, + "step": 30819 + }, + { + "epoch": 12.533550223668158, + "grad_norm": 0.005176599110084445, + "learning_rate": 6.745097871525552e-08, + "loss": 0.0, + "step": 30820 + }, + { + "epoch": 12.533956893045954, + "grad_norm": 0.00042714253154003343, + "learning_rate": 6.733354922774493e-08, + "loss": 0.0, + "step": 30821 + }, + { + "epoch": 12.53436356242375, + "grad_norm": 4.529387794497352e-05, + "learning_rate": 6.721622170405284e-08, + "loss": 0.0, + "step": 30822 + }, + { + "epoch": 12.534770231801545, + "grad_norm": 0.017226706855864925, + "learning_rate": 6.709899614538607e-08, + "loss": 0.0001, + "step": 30823 + }, + { + "epoch": 12.53517690117934, + "grad_norm": 0.009412062792967505, + "learning_rate": 6.698187255294697e-08, + "loss": 0.0001, + "step": 30824 + }, + { + "epoch": 12.535583570557137, + "grad_norm": 2.110703072938451e-06, + "learning_rate": 6.686485092793682e-08, + "loss": 0.0, + "step": 30825 + }, + { + "epoch": 12.535990239934932, + "grad_norm": 0.01594155403983752, + "learning_rate": 6.67479312715602e-08, + "loss": 0.0001, + "step": 30826 + }, + { + "epoch": 12.536396909312728, + "grad_norm": 0.006070856842531175, + "learning_rate": 6.663111358501395e-08, + "loss": 0.0, + "step": 30827 + }, + { + "epoch": 12.536803578690524, + "grad_norm": 0.00025162194594818626, + "learning_rate": 6.651439786949931e-08, + "loss": 0.0, + "step": 30828 + }, + { + "epoch": 12.53721024806832, + "grad_norm": 0.0030075675118381114, + "learning_rate": 6.639778412621423e-08, + "loss": 0.0, + "step": 30829 + }, + { + "epoch": 12.537616917446115, + "grad_norm": 0.031179649968214606, + "learning_rate": 6.62812723563544e-08, + "loss": 0.0003, + "step": 30830 + }, + { + "epoch": 12.538023586823913, + "grad_norm": 0.024659814390657658, + "learning_rate": 6.616486256111776e-08, + "loss": 0.0001, + "step": 30831 + }, + { + "epoch": 12.538430256201709, + "grad_norm": 0.0005350352970342326, + "learning_rate": 6.60485547416978e-08, + "loss": 0.0, + "step": 30832 + }, + { + "epoch": 12.538836925579504, + "grad_norm": 0.001436571386442609, + "learning_rate": 6.593234889928913e-08, + "loss": 0.0, + "step": 30833 + }, + { + "epoch": 12.5392435949573, + "grad_norm": 0.006364696853551709, + "learning_rate": 6.581624503508522e-08, + "loss": 0.0001, + "step": 30834 + }, + { + "epoch": 12.539650264335096, + "grad_norm": 0.24520245623681622, + "learning_rate": 6.570024315027624e-08, + "loss": 0.0021, + "step": 30835 + }, + { + "epoch": 12.540056933712892, + "grad_norm": 0.008620612880774191, + "learning_rate": 6.558434324605456e-08, + "loss": 0.0001, + "step": 30836 + }, + { + "epoch": 12.540463603090688, + "grad_norm": 0.001297001737702512, + "learning_rate": 6.546854532361036e-08, + "loss": 0.0, + "step": 30837 + }, + { + "epoch": 12.540870272468483, + "grad_norm": 0.01232515113207644, + "learning_rate": 6.535284938413045e-08, + "loss": 0.0001, + "step": 30838 + }, + { + "epoch": 12.541276941846279, + "grad_norm": 0.0066252886698788685, + "learning_rate": 6.523725542880388e-08, + "loss": 0.0, + "step": 30839 + }, + { + "epoch": 12.541683611224075, + "grad_norm": 0.0031702640877500927, + "learning_rate": 6.512176345881638e-08, + "loss": 0.0, + "step": 30840 + }, + { + "epoch": 12.54209028060187, + "grad_norm": 0.00016851570986500865, + "learning_rate": 
6.500637347535477e-08, + "loss": 0.0, + "step": 30841 + }, + { + "epoch": 12.542496949979666, + "grad_norm": 0.0034255373571093557, + "learning_rate": 6.489108547960255e-08, + "loss": 0.0, + "step": 30842 + }, + { + "epoch": 12.542903619357462, + "grad_norm": 0.0882153423219584, + "learning_rate": 6.477589947274432e-08, + "loss": 0.0008, + "step": 30843 + }, + { + "epoch": 12.543310288735258, + "grad_norm": 0.014519203758035224, + "learning_rate": 6.466081545596137e-08, + "loss": 0.0001, + "step": 30844 + }, + { + "epoch": 12.543716958113054, + "grad_norm": 0.00030509542477176016, + "learning_rate": 6.454583343043496e-08, + "loss": 0.0, + "step": 30845 + }, + { + "epoch": 12.54412362749085, + "grad_norm": 0.0007019133565700589, + "learning_rate": 6.44309533973464e-08, + "loss": 0.0, + "step": 30846 + }, + { + "epoch": 12.544530296868645, + "grad_norm": 0.014807512259226477, + "learning_rate": 6.431617535787471e-08, + "loss": 0.0001, + "step": 30847 + }, + { + "epoch": 12.544936966246441, + "grad_norm": 0.012986658757293584, + "learning_rate": 6.420149931319786e-08, + "loss": 0.0002, + "step": 30848 + }, + { + "epoch": 12.545343635624237, + "grad_norm": 0.0206533577081234, + "learning_rate": 6.408692526449267e-08, + "loss": 0.0001, + "step": 30849 + }, + { + "epoch": 12.545750305002034, + "grad_norm": 0.002473012416890839, + "learning_rate": 6.39724532129371e-08, + "loss": 0.0, + "step": 30850 + }, + { + "epoch": 12.54615697437983, + "grad_norm": 0.0015170129569593246, + "learning_rate": 6.385808315970355e-08, + "loss": 0.0, + "step": 30851 + }, + { + "epoch": 12.546563643757626, + "grad_norm": 0.005515162264017719, + "learning_rate": 6.374381510596883e-08, + "loss": 0.0, + "step": 30852 + }, + { + "epoch": 12.546970313135422, + "grad_norm": 0.008393915434248507, + "learning_rate": 6.362964905290425e-08, + "loss": 0.0001, + "step": 30853 + }, + { + "epoch": 12.547376982513217, + "grad_norm": 0.0009623469556949755, + "learning_rate": 6.35155850016811e-08, + "loss": 0.0, + "step": 30854 + }, + { + "epoch": 12.547783651891013, + "grad_norm": 0.01872547873376108, + "learning_rate": 6.340162295347285e-08, + "loss": 0.0002, + "step": 30855 + }, + { + "epoch": 12.548190321268809, + "grad_norm": 0.0021897381908296872, + "learning_rate": 6.32877629094475e-08, + "loss": 0.0, + "step": 30856 + }, + { + "epoch": 12.548596990646605, + "grad_norm": 0.09292156911109385, + "learning_rate": 6.317400487077407e-08, + "loss": 0.0008, + "step": 30857 + }, + { + "epoch": 12.5490036600244, + "grad_norm": 0.09960012785241705, + "learning_rate": 6.306034883862055e-08, + "loss": 0.001, + "step": 30858 + }, + { + "epoch": 12.549410329402196, + "grad_norm": 0.0475800979994574, + "learning_rate": 6.294679481415378e-08, + "loss": 0.0005, + "step": 30859 + }, + { + "epoch": 12.549816998779992, + "grad_norm": 0.01215090881837718, + "learning_rate": 6.283334279853948e-08, + "loss": 0.0001, + "step": 30860 + }, + { + "epoch": 12.550223668157788, + "grad_norm": 0.001260559023291063, + "learning_rate": 6.271999279294228e-08, + "loss": 0.0, + "step": 30861 + }, + { + "epoch": 12.550630337535583, + "grad_norm": 0.046752349046823044, + "learning_rate": 6.260674479852568e-08, + "loss": 0.0004, + "step": 30862 + }, + { + "epoch": 12.55103700691338, + "grad_norm": 0.0151400547138476, + "learning_rate": 6.249359881645212e-08, + "loss": 0.0002, + "step": 30863 + }, + { + "epoch": 12.551443676291175, + "grad_norm": 0.0009336674533892891, + "learning_rate": 6.238055484788286e-08, + "loss": 0.0, + "step": 30864 + }, + { + "epoch": 
12.55185034566897, + "grad_norm": 0.007308334606002593, + "learning_rate": 6.22676128939792e-08, + "loss": 0.0001, + "step": 30865 + }, + { + "epoch": 12.552257015046766, + "grad_norm": 0.000851450413450257, + "learning_rate": 6.215477295590022e-08, + "loss": 0.0, + "step": 30866 + }, + { + "epoch": 12.552663684424562, + "grad_norm": 0.00051784134653923, + "learning_rate": 6.204203503480388e-08, + "loss": 0.0, + "step": 30867 + }, + { + "epoch": 12.553070353802358, + "grad_norm": 0.016649479772258183, + "learning_rate": 6.192939913184703e-08, + "loss": 0.0002, + "step": 30868 + }, + { + "epoch": 12.553477023180154, + "grad_norm": 0.03977231233721362, + "learning_rate": 6.181686524818765e-08, + "loss": 0.0003, + "step": 30869 + }, + { + "epoch": 12.55388369255795, + "grad_norm": 0.00018332621720719514, + "learning_rate": 6.170443338497922e-08, + "loss": 0.0, + "step": 30870 + }, + { + "epoch": 12.554290361935745, + "grad_norm": 0.003727754747609628, + "learning_rate": 6.159210354337753e-08, + "loss": 0.0, + "step": 30871 + }, + { + "epoch": 12.554697031313543, + "grad_norm": 0.00013153693278665667, + "learning_rate": 6.147987572453384e-08, + "loss": 0.0, + "step": 30872 + }, + { + "epoch": 12.555103700691339, + "grad_norm": 0.010948213783631517, + "learning_rate": 6.13677499296017e-08, + "loss": 0.0001, + "step": 30873 + }, + { + "epoch": 12.555510370069134, + "grad_norm": 0.04252227084605891, + "learning_rate": 6.12557261597313e-08, + "loss": 0.0004, + "step": 30874 + }, + { + "epoch": 12.55591703944693, + "grad_norm": 0.008405858953324734, + "learning_rate": 6.114380441607281e-08, + "loss": 0.0001, + "step": 30875 + }, + { + "epoch": 12.556323708824726, + "grad_norm": 0.003107266473356562, + "learning_rate": 6.103198469977645e-08, + "loss": 0.0, + "step": 30876 + }, + { + "epoch": 12.556730378202522, + "grad_norm": 2.2292887640061882e-05, + "learning_rate": 6.092026701198683e-08, + "loss": 0.0, + "step": 30877 + }, + { + "epoch": 12.557137047580317, + "grad_norm": 0.002264047516281121, + "learning_rate": 6.080865135385416e-08, + "loss": 0.0, + "step": 30878 + }, + { + "epoch": 12.557543716958113, + "grad_norm": 0.0001237930816069748, + "learning_rate": 6.069713772652308e-08, + "loss": 0.0, + "step": 30879 + }, + { + "epoch": 12.557950386335909, + "grad_norm": 0.05821059225596925, + "learning_rate": 6.058572613113822e-08, + "loss": 0.0005, + "step": 30880 + }, + { + "epoch": 12.558357055713705, + "grad_norm": 0.023897032593564356, + "learning_rate": 6.047441656884312e-08, + "loss": 0.0002, + "step": 30881 + }, + { + "epoch": 12.5587637250915, + "grad_norm": 0.020613515747529, + "learning_rate": 6.03632090407813e-08, + "loss": 0.0001, + "step": 30882 + }, + { + "epoch": 12.559170394469296, + "grad_norm": 0.025450755828197502, + "learning_rate": 6.025210354809296e-08, + "loss": 0.0002, + "step": 30883 + }, + { + "epoch": 12.559577063847092, + "grad_norm": 0.011382972815042528, + "learning_rate": 6.014110009191831e-08, + "loss": 0.0001, + "step": 30884 + }, + { + "epoch": 12.559983733224888, + "grad_norm": 0.00017300592394069462, + "learning_rate": 6.003019867339976e-08, + "loss": 0.0, + "step": 30885 + }, + { + "epoch": 12.560390402602684, + "grad_norm": 0.00028830498392330494, + "learning_rate": 5.991939929367307e-08, + "loss": 0.0, + "step": 30886 + }, + { + "epoch": 12.56079707198048, + "grad_norm": 0.004412798403760185, + "learning_rate": 5.980870195387623e-08, + "loss": 0.0, + "step": 30887 + }, + { + "epoch": 12.561203741358275, + "grad_norm": 0.004417479308463183, + "learning_rate": 
5.969810665514609e-08, + "loss": 0.0, + "step": 30888 + }, + { + "epoch": 12.56161041073607, + "grad_norm": 0.6702432323920596, + "learning_rate": 5.9587613398618406e-08, + "loss": 0.0056, + "step": 30889 + }, + { + "epoch": 12.562017080113867, + "grad_norm": 0.11537669033503223, + "learning_rate": 5.947722218542562e-08, + "loss": 0.0008, + "step": 30890 + }, + { + "epoch": 12.562423749491664, + "grad_norm": 0.0014688369146608382, + "learning_rate": 5.936693301670238e-08, + "loss": 0.0, + "step": 30891 + }, + { + "epoch": 12.56283041886946, + "grad_norm": 0.0009040784410126796, + "learning_rate": 5.92567458935811e-08, + "loss": 0.0, + "step": 30892 + }, + { + "epoch": 12.563237088247256, + "grad_norm": 0.000298140192640252, + "learning_rate": 5.9146660817192e-08, + "loss": 0.0, + "step": 30893 + }, + { + "epoch": 12.563643757625051, + "grad_norm": 0.007786107022537739, + "learning_rate": 5.9036677788665284e-08, + "loss": 0.0001, + "step": 30894 + }, + { + "epoch": 12.564050427002847, + "grad_norm": 0.0004254774984235781, + "learning_rate": 5.892679680913116e-08, + "loss": 0.0, + "step": 30895 + }, + { + "epoch": 12.564457096380643, + "grad_norm": 0.0005466407718114834, + "learning_rate": 5.88170178797165e-08, + "loss": 0.0, + "step": 30896 + }, + { + "epoch": 12.564863765758439, + "grad_norm": 0.0034402030004058557, + "learning_rate": 5.870734100154707e-08, + "loss": 0.0, + "step": 30897 + }, + { + "epoch": 12.565270435136235, + "grad_norm": 0.006443803353413427, + "learning_rate": 5.859776617575197e-08, + "loss": 0.0001, + "step": 30898 + }, + { + "epoch": 12.56567710451403, + "grad_norm": 0.006483334052123593, + "learning_rate": 5.8488293403453636e-08, + "loss": 0.0001, + "step": 30899 + }, + { + "epoch": 12.566083773891826, + "grad_norm": 0.019617421282817318, + "learning_rate": 5.8378922685775607e-08, + "loss": 0.0002, + "step": 30900 + }, + { + "epoch": 12.566490443269622, + "grad_norm": 6.993956143786328e-06, + "learning_rate": 5.826965402384255e-08, + "loss": 0.0, + "step": 30901 + }, + { + "epoch": 12.566897112647418, + "grad_norm": 0.041903311780754404, + "learning_rate": 5.816048741877467e-08, + "loss": 0.0005, + "step": 30902 + }, + { + "epoch": 12.567303782025213, + "grad_norm": 5.396673219883612e-06, + "learning_rate": 5.8051422871692184e-08, + "loss": 0.0, + "step": 30903 + }, + { + "epoch": 12.56771045140301, + "grad_norm": 0.04469396658491706, + "learning_rate": 5.7942460383716425e-08, + "loss": 0.0005, + "step": 30904 + }, + { + "epoch": 12.568117120780805, + "grad_norm": 0.0018129241785053566, + "learning_rate": 5.783359995596538e-08, + "loss": 0.0, + "step": 30905 + }, + { + "epoch": 12.5685237901586, + "grad_norm": 0.0012447282419027713, + "learning_rate": 5.7724841589555936e-08, + "loss": 0.0, + "step": 30906 + }, + { + "epoch": 12.568930459536396, + "grad_norm": 0.012611969000769165, + "learning_rate": 5.761618528560387e-08, + "loss": 0.0001, + "step": 30907 + }, + { + "epoch": 12.569337128914192, + "grad_norm": 0.004739429028503923, + "learning_rate": 5.7507631045227165e-08, + "loss": 0.0, + "step": 30908 + }, + { + "epoch": 12.569743798291988, + "grad_norm": 0.0017404990869391765, + "learning_rate": 5.7399178869537164e-08, + "loss": 0.0, + "step": 30909 + }, + { + "epoch": 12.570150467669784, + "grad_norm": 0.0005417091393497518, + "learning_rate": 5.729082875964964e-08, + "loss": 0.0, + "step": 30910 + }, + { + "epoch": 12.57055713704758, + "grad_norm": 0.44669995763858544, + "learning_rate": 5.7182580716674817e-08, + "loss": 0.0047, + "step": 30911 + }, + { + 
"epoch": 12.570963806425375, + "grad_norm": 0.029150683688511016, + "learning_rate": 5.707443474172514e-08, + "loss": 0.0002, + "step": 30912 + }, + { + "epoch": 12.571370475803173, + "grad_norm": 0.002278801128397835, + "learning_rate": 5.696639083591082e-08, + "loss": 0.0, + "step": 30913 + }, + { + "epoch": 12.571777145180969, + "grad_norm": 0.0023955291636458854, + "learning_rate": 5.685844900034099e-08, + "loss": 0.0, + "step": 30914 + }, + { + "epoch": 12.572183814558764, + "grad_norm": 0.027814805789618216, + "learning_rate": 5.675060923612363e-08, + "loss": 0.0003, + "step": 30915 + }, + { + "epoch": 12.57259048393656, + "grad_norm": 0.005018340746199863, + "learning_rate": 5.664287154436454e-08, + "loss": 0.0, + "step": 30916 + }, + { + "epoch": 12.572997153314356, + "grad_norm": 0.0016431250784261632, + "learning_rate": 5.6535235926171714e-08, + "loss": 0.0, + "step": 30917 + }, + { + "epoch": 12.573403822692152, + "grad_norm": 0.003402258954536121, + "learning_rate": 5.642770238264872e-08, + "loss": 0.0, + "step": 30918 + }, + { + "epoch": 12.573810492069947, + "grad_norm": 0.004401075562573, + "learning_rate": 5.632027091490022e-08, + "loss": 0.0, + "step": 30919 + }, + { + "epoch": 12.574217161447743, + "grad_norm": 0.007407532926122803, + "learning_rate": 5.621294152402867e-08, + "loss": 0.0001, + "step": 30920 + }, + { + "epoch": 12.574623830825539, + "grad_norm": 0.00593127757187908, + "learning_rate": 5.610571421113542e-08, + "loss": 0.0001, + "step": 30921 + }, + { + "epoch": 12.575030500203335, + "grad_norm": 0.0030596111396745686, + "learning_rate": 5.59985889773218e-08, + "loss": 0.0, + "step": 30922 + }, + { + "epoch": 12.57543716958113, + "grad_norm": 0.032862923376930316, + "learning_rate": 5.589156582368804e-08, + "loss": 0.0003, + "step": 30923 + }, + { + "epoch": 12.575843838958926, + "grad_norm": 0.002751211334375812, + "learning_rate": 5.578464475133216e-08, + "loss": 0.0, + "step": 30924 + }, + { + "epoch": 12.576250508336722, + "grad_norm": 0.025022925864721136, + "learning_rate": 5.5677825761351056e-08, + "loss": 0.0002, + "step": 30925 + }, + { + "epoch": 12.576657177714518, + "grad_norm": 0.0009530428198071564, + "learning_rate": 5.557110885484163e-08, + "loss": 0.0, + "step": 30926 + }, + { + "epoch": 12.577063847092314, + "grad_norm": 8.772417662085399e-05, + "learning_rate": 5.5464494032900776e-08, + "loss": 0.0, + "step": 30927 + }, + { + "epoch": 12.57747051647011, + "grad_norm": 0.0014170162487924494, + "learning_rate": 5.535798129661984e-08, + "loss": 0.0, + "step": 30928 + }, + { + "epoch": 12.577877185847905, + "grad_norm": 0.00040015391658198814, + "learning_rate": 5.525157064709574e-08, + "loss": 0.0, + "step": 30929 + }, + { + "epoch": 12.5782838552257, + "grad_norm": 0.0012235263015866276, + "learning_rate": 5.514526208541982e-08, + "loss": 0.0, + "step": 30930 + }, + { + "epoch": 12.578690524603497, + "grad_norm": 0.014763623990485997, + "learning_rate": 5.50390556126823e-08, + "loss": 0.0001, + "step": 30931 + }, + { + "epoch": 12.579097193981294, + "grad_norm": 0.0014546974322600555, + "learning_rate": 5.4932951229974554e-08, + "loss": 0.0, + "step": 30932 + }, + { + "epoch": 12.57950386335909, + "grad_norm": 0.002010517566175647, + "learning_rate": 5.4826948938384586e-08, + "loss": 0.0, + "step": 30933 + }, + { + "epoch": 12.579910532736886, + "grad_norm": 0.0003898422126453097, + "learning_rate": 5.4721048739001524e-08, + "loss": 0.0, + "step": 30934 + }, + { + "epoch": 12.580317202114681, + "grad_norm": 7.669661752737645e-06, + 
"learning_rate": 5.4615250632912286e-08, + "loss": 0.0, + "step": 30935 + }, + { + "epoch": 12.580723871492477, + "grad_norm": 0.009653968953718544, + "learning_rate": 5.450955462120267e-08, + "loss": 0.0, + "step": 30936 + }, + { + "epoch": 12.581130540870273, + "grad_norm": 0.004861844602696374, + "learning_rate": 5.440396070495846e-08, + "loss": 0.0, + "step": 30937 + }, + { + "epoch": 12.581537210248069, + "grad_norm": 0.0494165127587452, + "learning_rate": 5.4298468885262136e-08, + "loss": 0.0005, + "step": 30938 + }, + { + "epoch": 12.581943879625864, + "grad_norm": 0.0471436623999692, + "learning_rate": 5.4193079163198384e-08, + "loss": 0.0005, + "step": 30939 + }, + { + "epoch": 12.58235054900366, + "grad_norm": 0.008288635145264856, + "learning_rate": 5.408779153984856e-08, + "loss": 0.0, + "step": 30940 + }, + { + "epoch": 12.582757218381456, + "grad_norm": 6.41113424302117e-05, + "learning_rate": 5.3982606016292905e-08, + "loss": 0.0, + "step": 30941 + }, + { + "epoch": 12.583163887759252, + "grad_norm": 0.00048082824394065905, + "learning_rate": 5.3877522593610564e-08, + "loss": 0.0, + "step": 30942 + }, + { + "epoch": 12.583570557137048, + "grad_norm": 0.0009844589464083648, + "learning_rate": 5.37725412728829e-08, + "loss": 0.0, + "step": 30943 + }, + { + "epoch": 12.583977226514843, + "grad_norm": 0.02701371809281886, + "learning_rate": 5.3667662055184586e-08, + "loss": 0.0002, + "step": 30944 + }, + { + "epoch": 12.584383895892639, + "grad_norm": 0.006325710293995445, + "learning_rate": 5.356288494159478e-08, + "loss": 0.0001, + "step": 30945 + }, + { + "epoch": 12.584790565270435, + "grad_norm": 0.0014723495519640823, + "learning_rate": 5.345820993318707e-08, + "loss": 0.0, + "step": 30946 + }, + { + "epoch": 12.58519723464823, + "grad_norm": 0.0009724681152467395, + "learning_rate": 5.335363703103724e-08, + "loss": 0.0, + "step": 30947 + }, + { + "epoch": 12.585603904026026, + "grad_norm": 0.00018640500097898426, + "learning_rate": 5.3249166236217785e-08, + "loss": 0.0, + "step": 30948 + }, + { + "epoch": 12.586010573403822, + "grad_norm": 0.000863999135063735, + "learning_rate": 5.3144797549802285e-08, + "loss": 0.0, + "step": 30949 + }, + { + "epoch": 12.586417242781618, + "grad_norm": 0.006230869095221036, + "learning_rate": 5.30405309728621e-08, + "loss": 0.0001, + "step": 30950 + }, + { + "epoch": 12.586823912159414, + "grad_norm": 0.01736745123849368, + "learning_rate": 5.2936366506466386e-08, + "loss": 0.0001, + "step": 30951 + }, + { + "epoch": 12.58723058153721, + "grad_norm": 0.022716253239861303, + "learning_rate": 5.283230415168539e-08, + "loss": 0.0002, + "step": 30952 + }, + { + "epoch": 12.587637250915005, + "grad_norm": 0.002330198908928645, + "learning_rate": 5.2728343909586033e-08, + "loss": 0.0, + "step": 30953 + }, + { + "epoch": 12.588043920292803, + "grad_norm": 0.00419057408974495, + "learning_rate": 5.262448578123747e-08, + "loss": 0.0, + "step": 30954 + }, + { + "epoch": 12.588450589670598, + "grad_norm": 0.00017346548838823156, + "learning_rate": 5.2520729767704395e-08, + "loss": 0.0, + "step": 30955 + }, + { + "epoch": 12.588857259048394, + "grad_norm": 0.004703265727938438, + "learning_rate": 5.2417075870053734e-08, + "loss": 0.0, + "step": 30956 + }, + { + "epoch": 12.58926392842619, + "grad_norm": 0.0008090317511075395, + "learning_rate": 5.231352408934687e-08, + "loss": 0.0, + "step": 30957 + }, + { + "epoch": 12.589670597803986, + "grad_norm": 0.03329463213667101, + "learning_rate": 5.22100744266496e-08, + "loss": 0.0003, + "step": 30958 + 
}, + { + "epoch": 12.590077267181782, + "grad_norm": 0.011853129155669242, + "learning_rate": 5.210672688302221e-08, + "loss": 0.0001, + "step": 30959 + }, + { + "epoch": 12.590483936559577, + "grad_norm": 0.02637978247506088, + "learning_rate": 5.200348145952494e-08, + "loss": 0.0001, + "step": 30960 + }, + { + "epoch": 12.590890605937373, + "grad_norm": 0.0012472533500205616, + "learning_rate": 5.190033815721918e-08, + "loss": 0.0, + "step": 30961 + }, + { + "epoch": 12.591297275315169, + "grad_norm": 0.002218666913407479, + "learning_rate": 5.179729697716407e-08, + "loss": 0.0, + "step": 30962 + }, + { + "epoch": 12.591703944692965, + "grad_norm": 0.011913952527114581, + "learning_rate": 5.169435792041544e-08, + "loss": 0.0001, + "step": 30963 + }, + { + "epoch": 12.59211061407076, + "grad_norm": 0.02725105236350154, + "learning_rate": 5.159152098803133e-08, + "loss": 0.0003, + "step": 30964 + }, + { + "epoch": 12.592517283448556, + "grad_norm": 0.009635065310741172, + "learning_rate": 5.148878618106756e-08, + "loss": 0.0001, + "step": 30965 + }, + { + "epoch": 12.592923952826352, + "grad_norm": 0.0006756475285247359, + "learning_rate": 5.138615350057885e-08, + "loss": 0.0, + "step": 30966 + }, + { + "epoch": 12.593330622204148, + "grad_norm": 7.552274488673244e-05, + "learning_rate": 5.128362294761657e-08, + "loss": 0.0, + "step": 30967 + }, + { + "epoch": 12.593737291581943, + "grad_norm": 0.005118979449117728, + "learning_rate": 5.118119452323655e-08, + "loss": 0.0, + "step": 30968 + }, + { + "epoch": 12.59414396095974, + "grad_norm": 0.01806103352901182, + "learning_rate": 5.107886822848795e-08, + "loss": 0.0001, + "step": 30969 + }, + { + "epoch": 12.594550630337535, + "grad_norm": 0.031256267466650486, + "learning_rate": 5.0976644064422156e-08, + "loss": 0.0003, + "step": 30970 + }, + { + "epoch": 12.59495729971533, + "grad_norm": 0.0024843699230416125, + "learning_rate": 5.087452203208831e-08, + "loss": 0.0, + "step": 30971 + }, + { + "epoch": 12.595363969093126, + "grad_norm": 0.006730407042986017, + "learning_rate": 5.07725021325356e-08, + "loss": 0.0, + "step": 30972 + }, + { + "epoch": 12.595770638470924, + "grad_norm": 0.01934474761883308, + "learning_rate": 5.0670584366809825e-08, + "loss": 0.0001, + "step": 30973 + }, + { + "epoch": 12.59617730784872, + "grad_norm": 0.013543256976217163, + "learning_rate": 5.056876873595684e-08, + "loss": 0.0001, + "step": 30974 + }, + { + "epoch": 12.596583977226516, + "grad_norm": 0.0013437236774360897, + "learning_rate": 5.0467055241024686e-08, + "loss": 0.0, + "step": 30975 + }, + { + "epoch": 12.596990646604311, + "grad_norm": 0.4835300500292217, + "learning_rate": 5.036544388305476e-08, + "loss": 0.0051, + "step": 30976 + }, + { + "epoch": 12.597397315982107, + "grad_norm": 0.05246881882669361, + "learning_rate": 5.0263934663090655e-08, + "loss": 0.0002, + "step": 30977 + }, + { + "epoch": 12.597803985359903, + "grad_norm": 0.0003711219492725587, + "learning_rate": 5.016252758217599e-08, + "loss": 0.0, + "step": 30978 + }, + { + "epoch": 12.598210654737699, + "grad_norm": 0.002475127819685672, + "learning_rate": 5.006122264134994e-08, + "loss": 0.0, + "step": 30979 + }, + { + "epoch": 12.598617324115494, + "grad_norm": 0.0005031784126522691, + "learning_rate": 4.9960019841652774e-08, + "loss": 0.0, + "step": 30980 + }, + { + "epoch": 12.59902399349329, + "grad_norm": 0.000549550673458197, + "learning_rate": 4.9858919184124774e-08, + "loss": 0.0, + "step": 30981 + }, + { + "epoch": 12.599430662871086, + "grad_norm": 
0.02062027196931598, + "learning_rate": 4.975792066980179e-08, + "loss": 0.0002, + "step": 30982 + }, + { + "epoch": 12.599837332248882, + "grad_norm": 0.10726976328414456, + "learning_rate": 4.9657024299722966e-08, + "loss": 0.001, + "step": 30983 + }, + { + "epoch": 12.600244001626677, + "grad_norm": 0.012270502503221056, + "learning_rate": 4.955623007492083e-08, + "loss": 0.0001, + "step": 30984 + }, + { + "epoch": 12.600650671004473, + "grad_norm": 0.0003791501358278862, + "learning_rate": 4.9455537996433437e-08, + "loss": 0.0, + "step": 30985 + }, + { + "epoch": 12.601057340382269, + "grad_norm": 0.004729712497127763, + "learning_rate": 4.9354948065292176e-08, + "loss": 0.0, + "step": 30986 + }, + { + "epoch": 12.601464009760065, + "grad_norm": 0.004212506385098455, + "learning_rate": 4.925446028253067e-08, + "loss": 0.0001, + "step": 30987 + }, + { + "epoch": 12.60187067913786, + "grad_norm": 0.041624800112035495, + "learning_rate": 4.915407464918032e-08, + "loss": 0.0004, + "step": 30988 + }, + { + "epoch": 12.602277348515656, + "grad_norm": 0.0037605194047644667, + "learning_rate": 4.90537911662714e-08, + "loss": 0.0, + "step": 30989 + }, + { + "epoch": 12.602684017893452, + "grad_norm": 0.0015877979809014182, + "learning_rate": 4.895360983483422e-08, + "loss": 0.0, + "step": 30990 + }, + { + "epoch": 12.603090687271248, + "grad_norm": 0.005073758151863118, + "learning_rate": 4.885353065589571e-08, + "loss": 0.0, + "step": 30991 + }, + { + "epoch": 12.603497356649044, + "grad_norm": 0.0018692510523419389, + "learning_rate": 4.875355363048395e-08, + "loss": 0.0, + "step": 30992 + }, + { + "epoch": 12.60390402602684, + "grad_norm": 0.0038668488836005087, + "learning_rate": 4.865367875962479e-08, + "loss": 0.0, + "step": 30993 + }, + { + "epoch": 12.604310695404635, + "grad_norm": 0.08230739641482863, + "learning_rate": 4.855390604434518e-08, + "loss": 0.0012, + "step": 30994 + }, + { + "epoch": 12.604717364782433, + "grad_norm": 0.0044513169594262315, + "learning_rate": 4.845423548566763e-08, + "loss": 0.0, + "step": 30995 + }, + { + "epoch": 12.605124034160228, + "grad_norm": 0.002207970386325247, + "learning_rate": 4.835466708461689e-08, + "loss": 0.0, + "step": 30996 + }, + { + "epoch": 12.605530703538024, + "grad_norm": 0.04508083681773518, + "learning_rate": 4.8255200842213246e-08, + "loss": 0.0002, + "step": 30997 + }, + { + "epoch": 12.60593737291582, + "grad_norm": 0.0031650158033175394, + "learning_rate": 4.81558367594781e-08, + "loss": 0.0, + "step": 30998 + }, + { + "epoch": 12.606344042293616, + "grad_norm": 0.015157683138735226, + "learning_rate": 4.805657483743287e-08, + "loss": 0.0001, + "step": 30999 + }, + { + "epoch": 12.606750711671411, + "grad_norm": 0.0023127486527378047, + "learning_rate": 4.7957415077094504e-08, + "loss": 0.0, + "step": 31000 + }, + { + "epoch": 12.607157381049207, + "grad_norm": 0.00012759480005754628, + "learning_rate": 4.78583574794822e-08, + "loss": 0.0, + "step": 31001 + }, + { + "epoch": 12.607564050427003, + "grad_norm": 0.06580732058914515, + "learning_rate": 4.775940204561291e-08, + "loss": 0.0007, + "step": 31002 + }, + { + "epoch": 12.607970719804799, + "grad_norm": 0.03195645555696934, + "learning_rate": 4.76605487765025e-08, + "loss": 0.0001, + "step": 31003 + }, + { + "epoch": 12.608377389182595, + "grad_norm": 0.0008435243488431645, + "learning_rate": 4.756179767316571e-08, + "loss": 0.0, + "step": 31004 + }, + { + "epoch": 12.60878405856039, + "grad_norm": 0.0033885730390575087, + "learning_rate": 4.7463148736616175e-08, + 
"loss": 0.0, + "step": 31005 + }, + { + "epoch": 12.609190727938186, + "grad_norm": 0.0064418181493385756, + "learning_rate": 4.73646019678653e-08, + "loss": 0.0001, + "step": 31006 + }, + { + "epoch": 12.609597397315982, + "grad_norm": 0.00599934061087381, + "learning_rate": 4.726615736792672e-08, + "loss": 0.0001, + "step": 31007 + }, + { + "epoch": 12.610004066693778, + "grad_norm": 0.0013579328068749827, + "learning_rate": 4.716781493780964e-08, + "loss": 0.0, + "step": 31008 + }, + { + "epoch": 12.610410736071573, + "grad_norm": 0.0014659876113611305, + "learning_rate": 4.706957467852435e-08, + "loss": 0.0, + "step": 31009 + }, + { + "epoch": 12.61081740544937, + "grad_norm": 7.707106368995372e-05, + "learning_rate": 4.6971436591078944e-08, + "loss": 0.0, + "step": 31010 + }, + { + "epoch": 12.611224074827165, + "grad_norm": 0.014602544837700524, + "learning_rate": 4.6873400676481494e-08, + "loss": 0.0001, + "step": 31011 + }, + { + "epoch": 12.61163074420496, + "grad_norm": 0.07264699203492316, + "learning_rate": 4.677546693573787e-08, + "loss": 0.0008, + "step": 31012 + }, + { + "epoch": 12.612037413582756, + "grad_norm": 0.000558754015346425, + "learning_rate": 4.6677635369852816e-08, + "loss": 0.0, + "step": 31013 + }, + { + "epoch": 12.612444082960554, + "grad_norm": 0.0036569015231066523, + "learning_rate": 4.657990597983109e-08, + "loss": 0.0, + "step": 31014 + }, + { + "epoch": 12.61285075233835, + "grad_norm": 0.10183135384117935, + "learning_rate": 4.648227876667632e-08, + "loss": 0.0014, + "step": 31015 + }, + { + "epoch": 12.613257421716146, + "grad_norm": 0.0693467310853123, + "learning_rate": 4.6384753731391065e-08, + "loss": 0.0007, + "step": 31016 + }, + { + "epoch": 12.613664091093941, + "grad_norm": 0.004846278185156056, + "learning_rate": 4.6287330874975613e-08, + "loss": 0.0, + "step": 31017 + }, + { + "epoch": 12.614070760471737, + "grad_norm": 0.02171136006623639, + "learning_rate": 4.619001019843028e-08, + "loss": 0.0002, + "step": 31018 + }, + { + "epoch": 12.614477429849533, + "grad_norm": 0.059367891393616944, + "learning_rate": 4.609279170275316e-08, + "loss": 0.0007, + "step": 31019 + }, + { + "epoch": 12.614884099227329, + "grad_norm": 0.006281982822356924, + "learning_rate": 4.599567538894456e-08, + "loss": 0.0, + "step": 31020 + }, + { + "epoch": 12.615290768605124, + "grad_norm": 0.0003972146280097741, + "learning_rate": 4.5898661258000354e-08, + "loss": 0.0, + "step": 31021 + }, + { + "epoch": 12.61569743798292, + "grad_norm": 0.07729370334682464, + "learning_rate": 4.58017493109153e-08, + "loss": 0.0009, + "step": 31022 + }, + { + "epoch": 12.616104107360716, + "grad_norm": 0.0051827866015677895, + "learning_rate": 4.570493954868527e-08, + "loss": 0.0001, + "step": 31023 + }, + { + "epoch": 12.616510776738512, + "grad_norm": 0.018062321812166195, + "learning_rate": 4.56082319723028e-08, + "loss": 0.0001, + "step": 31024 + }, + { + "epoch": 12.616917446116307, + "grad_norm": 0.0033272899168623146, + "learning_rate": 4.5511626582763754e-08, + "loss": 0.0, + "step": 31025 + }, + { + "epoch": 12.617324115494103, + "grad_norm": 0.030038088327563633, + "learning_rate": 4.541512338105625e-08, + "loss": 0.0002, + "step": 31026 + }, + { + "epoch": 12.617730784871899, + "grad_norm": 0.0822482920043959, + "learning_rate": 4.5318722368173916e-08, + "loss": 0.0004, + "step": 31027 + }, + { + "epoch": 12.618137454249695, + "grad_norm": 0.10497344977387608, + "learning_rate": 4.522242354510376e-08, + "loss": 0.0004, + "step": 31028 + }, + { + "epoch": 
12.61854412362749, + "grad_norm": 0.027553614153091526, + "learning_rate": 4.512622691283719e-08, + "loss": 0.0002, + "step": 31029 + }, + { + "epoch": 12.618950793005286, + "grad_norm": 0.019072373938159984, + "learning_rate": 4.503013247235899e-08, + "loss": 0.0002, + "step": 31030 + }, + { + "epoch": 12.619357462383082, + "grad_norm": 0.006204447247760514, + "learning_rate": 4.493414022465614e-08, + "loss": 0.0, + "step": 31031 + }, + { + "epoch": 12.619764131760878, + "grad_norm": 0.001098165030576502, + "learning_rate": 4.4838250170715634e-08, + "loss": 0.0, + "step": 31032 + }, + { + "epoch": 12.620170801138674, + "grad_norm": 0.0011010374898943339, + "learning_rate": 4.474246231152002e-08, + "loss": 0.0, + "step": 31033 + }, + { + "epoch": 12.62057747051647, + "grad_norm": 0.01309374729797079, + "learning_rate": 4.464677664805406e-08, + "loss": 0.0001, + "step": 31034 + }, + { + "epoch": 12.620984139894265, + "grad_norm": 4.723828706630742e-05, + "learning_rate": 4.455119318129919e-08, + "loss": 0.0, + "step": 31035 + }, + { + "epoch": 12.621390809272063, + "grad_norm": 0.0028505462011972687, + "learning_rate": 4.445571191223686e-08, + "loss": 0.0, + "step": 31036 + }, + { + "epoch": 12.621797478649858, + "grad_norm": 0.034960852267751474, + "learning_rate": 4.436033284184627e-08, + "loss": 0.0002, + "step": 31037 + }, + { + "epoch": 12.622204148027654, + "grad_norm": 0.006639464359675327, + "learning_rate": 4.426505597110886e-08, + "loss": 0.0001, + "step": 31038 + }, + { + "epoch": 12.62261081740545, + "grad_norm": 0.0007735856881042613, + "learning_rate": 4.4169881301000526e-08, + "loss": 0.0, + "step": 31039 + }, + { + "epoch": 12.623017486783246, + "grad_norm": 0.0062660110400803, + "learning_rate": 4.407480883249826e-08, + "loss": 0.0001, + "step": 31040 + }, + { + "epoch": 12.623424156161041, + "grad_norm": 0.05032730852849124, + "learning_rate": 4.397983856658017e-08, + "loss": 0.0004, + "step": 31041 + }, + { + "epoch": 12.623830825538837, + "grad_norm": 0.003844504823168251, + "learning_rate": 4.3884970504219916e-08, + "loss": 0.0, + "step": 31042 + }, + { + "epoch": 12.624237494916633, + "grad_norm": 0.025058912411904673, + "learning_rate": 4.3790204646390055e-08, + "loss": 0.0001, + "step": 31043 + }, + { + "epoch": 12.624644164294429, + "grad_norm": 0.0077274854912685416, + "learning_rate": 4.369554099406537e-08, + "loss": 0.0001, + "step": 31044 + }, + { + "epoch": 12.625050833672224, + "grad_norm": 0.09469179742421681, + "learning_rate": 4.3600979548216185e-08, + "loss": 0.0008, + "step": 31045 + }, + { + "epoch": 12.62545750305002, + "grad_norm": 0.08643204576683423, + "learning_rate": 4.350652030981395e-08, + "loss": 0.0006, + "step": 31046 + }, + { + "epoch": 12.625864172427816, + "grad_norm": 0.008286219797317735, + "learning_rate": 4.3412163279829e-08, + "loss": 0.0001, + "step": 31047 + }, + { + "epoch": 12.626270841805612, + "grad_norm": 0.0011340317995219677, + "learning_rate": 4.331790845922945e-08, + "loss": 0.0, + "step": 31048 + }, + { + "epoch": 12.626677511183408, + "grad_norm": 0.000563866578470562, + "learning_rate": 4.322375584898231e-08, + "loss": 0.0, + "step": 31049 + }, + { + "epoch": 12.627084180561203, + "grad_norm": 0.12359913903418471, + "learning_rate": 4.312970545005346e-08, + "loss": 0.0008, + "step": 31050 + }, + { + "epoch": 12.627490849938999, + "grad_norm": 0.0006262503698111854, + "learning_rate": 4.3035757263409916e-08, + "loss": 0.0, + "step": 31051 + }, + { + "epoch": 12.627897519316795, + "grad_norm": 0.018341306112695237, + 
"learning_rate": 4.294191129001646e-08, + "loss": 0.0001, + "step": 31052 + }, + { + "epoch": 12.62830418869459, + "grad_norm": 0.009193601643163254, + "learning_rate": 4.284816753083454e-08, + "loss": 0.0002, + "step": 31053 + }, + { + "epoch": 12.628710858072386, + "grad_norm": 0.05439976810374671, + "learning_rate": 4.275452598682783e-08, + "loss": 0.0005, + "step": 31054 + }, + { + "epoch": 12.629117527450184, + "grad_norm": 0.051192293954779734, + "learning_rate": 4.2660986658957794e-08, + "loss": 0.0005, + "step": 31055 + }, + { + "epoch": 12.62952419682798, + "grad_norm": 0.0015085542023329152, + "learning_rate": 4.256754954818365e-08, + "loss": 0.0, + "step": 31056 + }, + { + "epoch": 12.629930866205775, + "grad_norm": 0.03418309411902259, + "learning_rate": 4.247421465546464e-08, + "loss": 0.0002, + "step": 31057 + }, + { + "epoch": 12.630337535583571, + "grad_norm": 0.007633078618366614, + "learning_rate": 4.2380981981759994e-08, + "loss": 0.0001, + "step": 31058 + }, + { + "epoch": 12.630744204961367, + "grad_norm": 0.01694263880149077, + "learning_rate": 4.228785152802561e-08, + "loss": 0.0001, + "step": 31059 + }, + { + "epoch": 12.631150874339163, + "grad_norm": 6.25419732237217e-05, + "learning_rate": 4.219482329521851e-08, + "loss": 0.0, + "step": 31060 + }, + { + "epoch": 12.631557543716958, + "grad_norm": 0.006909893377123399, + "learning_rate": 4.210189728429348e-08, + "loss": 0.0, + "step": 31061 + }, + { + "epoch": 12.631964213094754, + "grad_norm": 0.00354247799143269, + "learning_rate": 4.20090734962042e-08, + "loss": 0.0, + "step": 31062 + }, + { + "epoch": 12.63237088247255, + "grad_norm": 0.005771430830378257, + "learning_rate": 4.1916351931903245e-08, + "loss": 0.0001, + "step": 31063 + }, + { + "epoch": 12.632777551850346, + "grad_norm": 0.013340821262314827, + "learning_rate": 4.182373259234207e-08, + "loss": 0.0001, + "step": 31064 + }, + { + "epoch": 12.633184221228142, + "grad_norm": 0.004231042866382724, + "learning_rate": 4.173121547847325e-08, + "loss": 0.0, + "step": 31065 + }, + { + "epoch": 12.633590890605937, + "grad_norm": 4.3256326177021435e-05, + "learning_rate": 4.163880059124492e-08, + "loss": 0.0, + "step": 31066 + }, + { + "epoch": 12.633997559983733, + "grad_norm": 0.2686978612254692, + "learning_rate": 4.1546487931605203e-08, + "loss": 0.0016, + "step": 31067 + }, + { + "epoch": 12.634404229361529, + "grad_norm": 0.00014010841589982533, + "learning_rate": 4.145427750050446e-08, + "loss": 0.0, + "step": 31068 + }, + { + "epoch": 12.634810898739325, + "grad_norm": 0.038996394519051916, + "learning_rate": 4.136216929888637e-08, + "loss": 0.0004, + "step": 31069 + }, + { + "epoch": 12.63521756811712, + "grad_norm": 0.00022052707328367872, + "learning_rate": 4.1270163327697957e-08, + "loss": 0.0, + "step": 31070 + }, + { + "epoch": 12.635624237494916, + "grad_norm": 0.0048202684020972235, + "learning_rate": 4.117825958788402e-08, + "loss": 0.0, + "step": 31071 + }, + { + "epoch": 12.636030906872712, + "grad_norm": 0.007304340792541864, + "learning_rate": 4.1086458080387134e-08, + "loss": 0.0001, + "step": 31072 + }, + { + "epoch": 12.636437576250508, + "grad_norm": 0.0029655069340778165, + "learning_rate": 4.099475880614989e-08, + "loss": 0.0, + "step": 31073 + }, + { + "epoch": 12.636844245628303, + "grad_norm": 0.005720708290002, + "learning_rate": 4.090316176611375e-08, + "loss": 0.0, + "step": 31074 + }, + { + "epoch": 12.6372509150061, + "grad_norm": 0.011823991702381412, + "learning_rate": 4.081166696121908e-08, + "loss": 0.0001, + "step": 
31075 + }, + { + "epoch": 12.637657584383895, + "grad_norm": 0.000589967859783926, + "learning_rate": 4.072027439240511e-08, + "loss": 0.0, + "step": 31076 + }, + { + "epoch": 12.638064253761693, + "grad_norm": 0.0006447275886765939, + "learning_rate": 4.062898406060889e-08, + "loss": 0.0, + "step": 31077 + }, + { + "epoch": 12.638470923139488, + "grad_norm": 0.009902010064452236, + "learning_rate": 4.0537795966769657e-08, + "loss": 0.0001, + "step": 31078 + }, + { + "epoch": 12.638877592517284, + "grad_norm": 0.002371477107084785, + "learning_rate": 4.044671011182222e-08, + "loss": 0.0, + "step": 31079 + }, + { + "epoch": 12.63928426189508, + "grad_norm": 3.2889066933469444e-05, + "learning_rate": 4.0355726496701384e-08, + "loss": 0.0, + "step": 31080 + }, + { + "epoch": 12.639690931272876, + "grad_norm": 0.0005838729858500476, + "learning_rate": 4.026484512234197e-08, + "loss": 0.0, + "step": 31081 + }, + { + "epoch": 12.640097600650671, + "grad_norm": 0.12073110491120012, + "learning_rate": 4.017406598967655e-08, + "loss": 0.0011, + "step": 31082 + }, + { + "epoch": 12.640504270028467, + "grad_norm": 0.003215488004213112, + "learning_rate": 4.0083389099637714e-08, + "loss": 0.0, + "step": 31083 + }, + { + "epoch": 12.640910939406263, + "grad_norm": 0.02749734749811636, + "learning_rate": 3.9992814453154725e-08, + "loss": 0.0002, + "step": 31084 + }, + { + "epoch": 12.641317608784059, + "grad_norm": 0.007957393748820549, + "learning_rate": 3.990234205115795e-08, + "loss": 0.0001, + "step": 31085 + }, + { + "epoch": 12.641724278161854, + "grad_norm": 0.0010080757973755581, + "learning_rate": 3.981197189457775e-08, + "loss": 0.0, + "step": 31086 + }, + { + "epoch": 12.64213094753965, + "grad_norm": 0.0009300929279229406, + "learning_rate": 3.9721703984338945e-08, + "loss": 0.0, + "step": 31087 + }, + { + "epoch": 12.642537616917446, + "grad_norm": 0.023788443335818407, + "learning_rate": 3.963153832137079e-08, + "loss": 0.0001, + "step": 31088 + }, + { + "epoch": 12.642944286295242, + "grad_norm": 0.2893889828841125, + "learning_rate": 3.95414749065981e-08, + "loss": 0.0029, + "step": 31089 + }, + { + "epoch": 12.643350955673037, + "grad_norm": 0.0002725723597006183, + "learning_rate": 3.945151374094458e-08, + "loss": 0.0, + "step": 31090 + }, + { + "epoch": 12.643757625050833, + "grad_norm": 0.002072979865389711, + "learning_rate": 3.9361654825333936e-08, + "loss": 0.0, + "step": 31091 + }, + { + "epoch": 12.644164294428629, + "grad_norm": 0.05710619188714187, + "learning_rate": 3.927189816068988e-08, + "loss": 0.0004, + "step": 31092 + }, + { + "epoch": 12.644570963806425, + "grad_norm": 0.10767078527366226, + "learning_rate": 3.918224374793278e-08, + "loss": 0.0009, + "step": 31093 + }, + { + "epoch": 12.64497763318422, + "grad_norm": 0.0026928690437633997, + "learning_rate": 3.909269158798301e-08, + "loss": 0.0, + "step": 31094 + }, + { + "epoch": 12.645384302562016, + "grad_norm": 0.008188713579479317, + "learning_rate": 3.900324168175984e-08, + "loss": 0.0, + "step": 31095 + }, + { + "epoch": 12.645790971939814, + "grad_norm": 0.1371553444533163, + "learning_rate": 3.891389403018031e-08, + "loss": 0.0009, + "step": 31096 + }, + { + "epoch": 12.64619764131761, + "grad_norm": 0.00847898406386569, + "learning_rate": 3.88246486341648e-08, + "loss": 0.0001, + "step": 31097 + }, + { + "epoch": 12.646604310695405, + "grad_norm": 0.004938742874246289, + "learning_rate": 3.8735505494627015e-08, + "loss": 0.0, + "step": 31098 + }, + { + "epoch": 12.647010980073201, + "grad_norm": 
0.017788998426393164, + "learning_rate": 3.864646461248289e-08, + "loss": 0.0001, + "step": 31099 + }, + { + "epoch": 12.647417649450997, + "grad_norm": 0.001986810168150825, + "learning_rate": 3.855752598864615e-08, + "loss": 0.0, + "step": 31100 + }, + { + "epoch": 12.647824318828793, + "grad_norm": 5.6336033365652855e-05, + "learning_rate": 3.8468689624030496e-08, + "loss": 0.0, + "step": 31101 + }, + { + "epoch": 12.648230988206588, + "grad_norm": 0.02775042103772177, + "learning_rate": 3.837995551954632e-08, + "loss": 0.0002, + "step": 31102 + }, + { + "epoch": 12.648637657584384, + "grad_norm": 0.0007531822887028574, + "learning_rate": 3.829132367610511e-08, + "loss": 0.0, + "step": 31103 + }, + { + "epoch": 12.64904432696218, + "grad_norm": 0.0034851778319639286, + "learning_rate": 3.820279409461835e-08, + "loss": 0.0, + "step": 31104 + }, + { + "epoch": 12.649450996339976, + "grad_norm": 0.0021017959766367763, + "learning_rate": 3.811436677599312e-08, + "loss": 0.0, + "step": 31105 + }, + { + "epoch": 12.649857665717771, + "grad_norm": 0.0008681822675212122, + "learning_rate": 3.802604172113755e-08, + "loss": 0.0, + "step": 31106 + }, + { + "epoch": 12.650264335095567, + "grad_norm": 0.004334634444066824, + "learning_rate": 3.793781893095761e-08, + "loss": 0.0, + "step": 31107 + }, + { + "epoch": 12.650671004473363, + "grad_norm": 0.14444257611105416, + "learning_rate": 3.784969840636144e-08, + "loss": 0.0015, + "step": 31108 + }, + { + "epoch": 12.651077673851159, + "grad_norm": 0.008225654458259377, + "learning_rate": 3.776168014825055e-08, + "loss": 0.0, + "step": 31109 + }, + { + "epoch": 12.651484343228955, + "grad_norm": 0.0006150104865425367, + "learning_rate": 3.767376415753088e-08, + "loss": 0.0, + "step": 31110 + }, + { + "epoch": 12.65189101260675, + "grad_norm": 0.05927175532489226, + "learning_rate": 3.7585950435102826e-08, + "loss": 0.0006, + "step": 31111 + }, + { + "epoch": 12.652297681984546, + "grad_norm": 0.0001793778629727845, + "learning_rate": 3.74982389818701e-08, + "loss": 0.0, + "step": 31112 + }, + { + "epoch": 12.652704351362342, + "grad_norm": 0.0017100012818595136, + "learning_rate": 3.741062979873089e-08, + "loss": 0.0, + "step": 31113 + }, + { + "epoch": 12.653111020740138, + "grad_norm": 0.007977831446365597, + "learning_rate": 3.732312288658668e-08, + "loss": 0.0001, + "step": 31114 + }, + { + "epoch": 12.653517690117933, + "grad_norm": 0.0005909510730999618, + "learning_rate": 3.723571824633454e-08, + "loss": 0.0, + "step": 31115 + }, + { + "epoch": 12.65392435949573, + "grad_norm": 0.04314950616533957, + "learning_rate": 3.7148415878871526e-08, + "loss": 0.0003, + "step": 31116 + }, + { + "epoch": 12.654331028873525, + "grad_norm": 0.0007936485217717419, + "learning_rate": 3.7061215785093586e-08, + "loss": 0.0, + "step": 31117 + }, + { + "epoch": 12.654737698251322, + "grad_norm": 0.00046465953083409545, + "learning_rate": 3.697411796589778e-08, + "loss": 0.0, + "step": 31118 + }, + { + "epoch": 12.655144367629118, + "grad_norm": 0.00011309949754434289, + "learning_rate": 3.6887122422176736e-08, + "loss": 0.0, + "step": 31119 + }, + { + "epoch": 12.655551037006914, + "grad_norm": 0.0003178501147407938, + "learning_rate": 3.6800229154823067e-08, + "loss": 0.0, + "step": 31120 + }, + { + "epoch": 12.65595770638471, + "grad_norm": 0.00048256967307007403, + "learning_rate": 3.67134381647305e-08, + "loss": 0.0, + "step": 31121 + }, + { + "epoch": 12.656364375762506, + "grad_norm": 0.00213169909920611, + "learning_rate": 3.6626749452787216e-08, + 
"loss": 0.0, + "step": 31122 + }, + { + "epoch": 12.656771045140301, + "grad_norm": 0.011171566196464993, + "learning_rate": 3.654016301988583e-08, + "loss": 0.0001, + "step": 31123 + }, + { + "epoch": 12.657177714518097, + "grad_norm": 0.03520443599205637, + "learning_rate": 3.645367886691453e-08, + "loss": 0.0002, + "step": 31124 + }, + { + "epoch": 12.657584383895893, + "grad_norm": 0.008157207531744926, + "learning_rate": 3.636729699476038e-08, + "loss": 0.0001, + "step": 31125 + }, + { + "epoch": 12.657991053273689, + "grad_norm": 0.0011854016288519967, + "learning_rate": 3.6281017404310446e-08, + "loss": 0.0, + "step": 31126 + }, + { + "epoch": 12.658397722651484, + "grad_norm": 0.0001920551468591565, + "learning_rate": 3.619484009645069e-08, + "loss": 0.0, + "step": 31127 + }, + { + "epoch": 12.65880439202928, + "grad_norm": 0.019087014587614085, + "learning_rate": 3.6108765072064846e-08, + "loss": 0.0001, + "step": 31128 + }, + { + "epoch": 12.659211061407076, + "grad_norm": 0.0018363461806400884, + "learning_rate": 3.602279233203776e-08, + "loss": 0.0, + "step": 31129 + }, + { + "epoch": 12.659617730784872, + "grad_norm": 0.0004933734691609814, + "learning_rate": 3.593692187725206e-08, + "loss": 0.0, + "step": 31130 + }, + { + "epoch": 12.660024400162667, + "grad_norm": 0.02086195550930308, + "learning_rate": 3.585115370858816e-08, + "loss": 0.0002, + "step": 31131 + }, + { + "epoch": 12.660431069540463, + "grad_norm": 0.0006912500564957581, + "learning_rate": 3.576548782692757e-08, + "loss": 0.0, + "step": 31132 + }, + { + "epoch": 12.660837738918259, + "grad_norm": 0.028445516016768035, + "learning_rate": 3.567992423314848e-08, + "loss": 0.0002, + "step": 31133 + }, + { + "epoch": 12.661244408296055, + "grad_norm": 0.021562950317096957, + "learning_rate": 3.559446292812907e-08, + "loss": 0.0002, + "step": 31134 + }, + { + "epoch": 12.66165107767385, + "grad_norm": 0.005147130204480159, + "learning_rate": 3.550910391274864e-08, + "loss": 0.0, + "step": 31135 + }, + { + "epoch": 12.662057747051646, + "grad_norm": 0.0009272345433347396, + "learning_rate": 3.542384718788205e-08, + "loss": 0.0, + "step": 31136 + }, + { + "epoch": 12.662464416429444, + "grad_norm": 0.08343599077499213, + "learning_rate": 3.5338692754405266e-08, + "loss": 0.0007, + "step": 31137 + }, + { + "epoch": 12.66287108580724, + "grad_norm": 0.015382284424700802, + "learning_rate": 3.525364061319092e-08, + "loss": 0.0002, + "step": 31138 + }, + { + "epoch": 12.663277755185035, + "grad_norm": 0.010507788924351858, + "learning_rate": 3.5168690765113866e-08, + "loss": 0.0001, + "step": 31139 + }, + { + "epoch": 12.663684424562831, + "grad_norm": 0.05318193380439417, + "learning_rate": 3.5083843211044523e-08, + "loss": 0.0002, + "step": 31140 + }, + { + "epoch": 12.664091093940627, + "grad_norm": 0.014707754167172953, + "learning_rate": 3.499909795185552e-08, + "loss": 0.0001, + "step": 31141 + }, + { + "epoch": 12.664497763318423, + "grad_norm": 0.00025217609694841236, + "learning_rate": 3.491445498841617e-08, + "loss": 0.0, + "step": 31142 + }, + { + "epoch": 12.664904432696218, + "grad_norm": 0.005274580768233635, + "learning_rate": 3.482991432159466e-08, + "loss": 0.0, + "step": 31143 + }, + { + "epoch": 12.665311102074014, + "grad_norm": 5.958731149112719e-05, + "learning_rate": 3.47454759522603e-08, + "loss": 0.0, + "step": 31144 + }, + { + "epoch": 12.66571777145181, + "grad_norm": 0.00017933404695931208, + "learning_rate": 3.466113988127795e-08, + "loss": 0.0, + "step": 31145 + }, + { + "epoch": 
12.666124440829606, + "grad_norm": 0.00024751862834267033, + "learning_rate": 3.4576906109515804e-08, + "loss": 0.0, + "step": 31146 + }, + { + "epoch": 12.666531110207401, + "grad_norm": 2.6059001428362246e-05, + "learning_rate": 3.44927746378354e-08, + "loss": 0.0, + "step": 31147 + }, + { + "epoch": 12.666937779585197, + "grad_norm": 0.0005873438668824399, + "learning_rate": 3.440874546710382e-08, + "loss": 0.0, + "step": 31148 + }, + { + "epoch": 12.667344448962993, + "grad_norm": 0.016495903277845718, + "learning_rate": 3.432481859818149e-08, + "loss": 0.0001, + "step": 31149 + }, + { + "epoch": 12.667751118340789, + "grad_norm": 0.016502651207206856, + "learning_rate": 3.424099403192993e-08, + "loss": 0.0001, + "step": 31150 + }, + { + "epoch": 12.668157787718584, + "grad_norm": 0.02134892639487606, + "learning_rate": 3.4157271769210687e-08, + "loss": 0.0002, + "step": 31151 + }, + { + "epoch": 12.66856445709638, + "grad_norm": 0.0011478743383309918, + "learning_rate": 3.4073651810883066e-08, + "loss": 0.0, + "step": 31152 + }, + { + "epoch": 12.668971126474176, + "grad_norm": 0.0038146173847857503, + "learning_rate": 3.3990134157805274e-08, + "loss": 0.0, + "step": 31153 + }, + { + "epoch": 12.669377795851972, + "grad_norm": 0.0036068728195304744, + "learning_rate": 3.39067188108333e-08, + "loss": 0.0, + "step": 31154 + }, + { + "epoch": 12.669784465229768, + "grad_norm": 0.03512171100296095, + "learning_rate": 3.3823405770826436e-08, + "loss": 0.0002, + "step": 31155 + }, + { + "epoch": 12.670191134607563, + "grad_norm": 0.022531090569833722, + "learning_rate": 3.3740195038637344e-08, + "loss": 0.0002, + "step": 31156 + }, + { + "epoch": 12.670597803985359, + "grad_norm": 0.001098445372035799, + "learning_rate": 3.365708661512091e-08, + "loss": 0.0, + "step": 31157 + }, + { + "epoch": 12.671004473363155, + "grad_norm": 0.0047909172947297705, + "learning_rate": 3.357408050113087e-08, + "loss": 0.0, + "step": 31158 + }, + { + "epoch": 12.671411142740952, + "grad_norm": 0.005705202473802722, + "learning_rate": 3.349117669751767e-08, + "loss": 0.0001, + "step": 31159 + }, + { + "epoch": 12.671817812118748, + "grad_norm": 5.145044752890641e-05, + "learning_rate": 3.340837520513507e-08, + "loss": 0.0, + "step": 31160 + }, + { + "epoch": 12.672224481496544, + "grad_norm": 0.00017677880884626752, + "learning_rate": 3.332567602483017e-08, + "loss": 0.0, + "step": 31161 + }, + { + "epoch": 12.67263115087434, + "grad_norm": 0.0036507363391639416, + "learning_rate": 3.324307915745451e-08, + "loss": 0.0, + "step": 31162 + }, + { + "epoch": 12.673037820252135, + "grad_norm": 0.010464281484345073, + "learning_rate": 3.316058460385518e-08, + "loss": 0.0001, + "step": 31163 + }, + { + "epoch": 12.673444489629931, + "grad_norm": 0.01863076235668137, + "learning_rate": 3.307819236487708e-08, + "loss": 0.0002, + "step": 31164 + }, + { + "epoch": 12.673851159007727, + "grad_norm": 0.003867652223038312, + "learning_rate": 3.299590244136841e-08, + "loss": 0.0, + "step": 31165 + }, + { + "epoch": 12.674257828385523, + "grad_norm": 0.04108395166037329, + "learning_rate": 3.291371483417405e-08, + "loss": 0.0002, + "step": 31166 + }, + { + "epoch": 12.674664497763318, + "grad_norm": 0.00024268067254342644, + "learning_rate": 3.2831629544135544e-08, + "loss": 0.0, + "step": 31167 + }, + { + "epoch": 12.675071167141114, + "grad_norm": 0.0004153190883464072, + "learning_rate": 3.274964657209667e-08, + "loss": 0.0, + "step": 31168 + }, + { + "epoch": 12.67547783651891, + "grad_norm": 0.0015129278989908167, + 
"learning_rate": 3.2667765918898974e-08, + "loss": 0.0, + "step": 31169 + }, + { + "epoch": 12.675884505896706, + "grad_norm": 0.06041639964011731, + "learning_rate": 3.2585987585384004e-08, + "loss": 0.0005, + "step": 31170 + }, + { + "epoch": 12.676291175274502, + "grad_norm": 1.2510313343915409e-05, + "learning_rate": 3.250431157238998e-08, + "loss": 0.0, + "step": 31171 + }, + { + "epoch": 12.676697844652297, + "grad_norm": 0.0006459494436303304, + "learning_rate": 3.2422737880756225e-08, + "loss": 0.0, + "step": 31172 + }, + { + "epoch": 12.677104514030093, + "grad_norm": 0.0006396356792100613, + "learning_rate": 3.234126651131875e-08, + "loss": 0.0, + "step": 31173 + }, + { + "epoch": 12.677511183407889, + "grad_norm": 6.900428522515081e-05, + "learning_rate": 3.225989746491465e-08, + "loss": 0.0, + "step": 31174 + }, + { + "epoch": 12.677917852785685, + "grad_norm": 0.030647505063695867, + "learning_rate": 3.217863074237992e-08, + "loss": 0.0003, + "step": 31175 + }, + { + "epoch": 12.67832452216348, + "grad_norm": 0.009283676417257391, + "learning_rate": 3.2097466344548355e-08, + "loss": 0.0001, + "step": 31176 + }, + { + "epoch": 12.678731191541276, + "grad_norm": 0.005283302991154276, + "learning_rate": 3.201640427225261e-08, + "loss": 0.0, + "step": 31177 + }, + { + "epoch": 12.679137860919074, + "grad_norm": 0.0002366571668823999, + "learning_rate": 3.193544452632535e-08, + "loss": 0.0, + "step": 31178 + }, + { + "epoch": 12.67954453029687, + "grad_norm": 0.003824773702955618, + "learning_rate": 3.1854587107598146e-08, + "loss": 0.0, + "step": 31179 + }, + { + "epoch": 12.679951199674665, + "grad_norm": 0.03509020624656523, + "learning_rate": 3.177383201689921e-08, + "loss": 0.0002, + "step": 31180 + }, + { + "epoch": 12.680357869052461, + "grad_norm": 0.008102086122783013, + "learning_rate": 3.16931792550601e-08, + "loss": 0.0001, + "step": 31181 + }, + { + "epoch": 12.680764538430257, + "grad_norm": 0.0005754680436765412, + "learning_rate": 3.1612628822906835e-08, + "loss": 0.0, + "step": 31182 + }, + { + "epoch": 12.681171207808053, + "grad_norm": 0.01436824002295112, + "learning_rate": 3.1532180721267625e-08, + "loss": 0.0001, + "step": 31183 + }, + { + "epoch": 12.681577877185848, + "grad_norm": 0.01975845863510297, + "learning_rate": 3.145183495096626e-08, + "loss": 0.0002, + "step": 31184 + }, + { + "epoch": 12.681984546563644, + "grad_norm": 0.00022234631656340635, + "learning_rate": 3.137159151282987e-08, + "loss": 0.0, + "step": 31185 + }, + { + "epoch": 12.68239121594144, + "grad_norm": 0.01193442472418167, + "learning_rate": 3.129145040768111e-08, + "loss": 0.0001, + "step": 31186 + }, + { + "epoch": 12.682797885319236, + "grad_norm": 0.002140864617399925, + "learning_rate": 3.121141163634267e-08, + "loss": 0.0, + "step": 31187 + }, + { + "epoch": 12.683204554697031, + "grad_norm": 0.007356367155924355, + "learning_rate": 3.113147519963611e-08, + "loss": 0.0001, + "step": 31188 + }, + { + "epoch": 12.683611224074827, + "grad_norm": 0.0021457736129880506, + "learning_rate": 3.105164109838188e-08, + "loss": 0.0, + "step": 31189 + }, + { + "epoch": 12.684017893452623, + "grad_norm": 0.40625667425049555, + "learning_rate": 3.097190933340044e-08, + "loss": 0.0042, + "step": 31190 + }, + { + "epoch": 12.684424562830419, + "grad_norm": 0.0005114101711526637, + "learning_rate": 3.0892279905510027e-08, + "loss": 0.0, + "step": 31191 + }, + { + "epoch": 12.684831232208214, + "grad_norm": 0.021107164579888742, + "learning_rate": 3.081275281552665e-08, + "loss": 0.0002, + 
"step": 31192 + }, + { + "epoch": 12.68523790158601, + "grad_norm": 0.0007389311381607412, + "learning_rate": 3.073332806426854e-08, + "loss": 0.0, + "step": 31193 + }, + { + "epoch": 12.685644570963806, + "grad_norm": 0.0025763189221238224, + "learning_rate": 3.06540056525495e-08, + "loss": 0.0, + "step": 31194 + }, + { + "epoch": 12.686051240341602, + "grad_norm": 0.0008811860523150977, + "learning_rate": 3.057478558118554e-08, + "loss": 0.0, + "step": 31195 + }, + { + "epoch": 12.686457909719397, + "grad_norm": 0.017726516269228998, + "learning_rate": 3.0495667850988234e-08, + "loss": 0.0002, + "step": 31196 + }, + { + "epoch": 12.686864579097193, + "grad_norm": 0.004615326222618644, + "learning_rate": 3.041665246277026e-08, + "loss": 0.0, + "step": 31197 + }, + { + "epoch": 12.687271248474989, + "grad_norm": 0.003245421296500537, + "learning_rate": 3.0337739417343195e-08, + "loss": 0.0, + "step": 31198 + }, + { + "epoch": 12.687677917852785, + "grad_norm": 0.005718377582983799, + "learning_rate": 3.02589287155175e-08, + "loss": 0.0, + "step": 31199 + }, + { + "epoch": 12.688084587230582, + "grad_norm": 0.00457549192406179, + "learning_rate": 3.018022035810031e-08, + "loss": 0.0, + "step": 31200 + }, + { + "epoch": 12.688491256608378, + "grad_norm": 0.0002807025744573149, + "learning_rate": 3.010161434590209e-08, + "loss": 0.0, + "step": 31201 + }, + { + "epoch": 12.688897925986174, + "grad_norm": 8.021696207423222e-05, + "learning_rate": 3.002311067972774e-08, + "loss": 0.0, + "step": 31202 + }, + { + "epoch": 12.68930459536397, + "grad_norm": 0.01067280471691042, + "learning_rate": 2.9944709360384406e-08, + "loss": 0.0001, + "step": 31203 + }, + { + "epoch": 12.689711264741765, + "grad_norm": 0.00407101757982968, + "learning_rate": 2.986641038867588e-08, + "loss": 0.0, + "step": 31204 + }, + { + "epoch": 12.690117934119561, + "grad_norm": 0.004404920761586599, + "learning_rate": 2.9788213765407082e-08, + "loss": 0.0001, + "step": 31205 + }, + { + "epoch": 12.690524603497357, + "grad_norm": 0.000226221776556451, + "learning_rate": 2.9710119491379586e-08, + "loss": 0.0, + "step": 31206 + }, + { + "epoch": 12.690931272875153, + "grad_norm": 0.013850311377350478, + "learning_rate": 2.9632127567396085e-08, + "loss": 0.0001, + "step": 31207 + }, + { + "epoch": 12.691337942252948, + "grad_norm": 0.0018278121234079185, + "learning_rate": 2.955423799425705e-08, + "loss": 0.0, + "step": 31208 + }, + { + "epoch": 12.691744611630744, + "grad_norm": 0.027568924670330043, + "learning_rate": 2.9476450772761846e-08, + "loss": 0.0003, + "step": 31209 + }, + { + "epoch": 12.69215128100854, + "grad_norm": 0.1329617839574135, + "learning_rate": 2.9398765903708714e-08, + "loss": 0.0013, + "step": 31210 + }, + { + "epoch": 12.692557950386336, + "grad_norm": 0.0032747775214680844, + "learning_rate": 2.93211833878948e-08, + "loss": 0.0, + "step": 31211 + }, + { + "epoch": 12.692964619764131, + "grad_norm": 0.014784823250307137, + "learning_rate": 2.9243703226117247e-08, + "loss": 0.0001, + "step": 31212 + }, + { + "epoch": 12.693371289141927, + "grad_norm": 0.07279366610047801, + "learning_rate": 2.916632541917208e-08, + "loss": 0.0004, + "step": 31213 + }, + { + "epoch": 12.693777958519723, + "grad_norm": 0.0011976401264847988, + "learning_rate": 2.9089049967852002e-08, + "loss": 0.0, + "step": 31214 + }, + { + "epoch": 12.694184627897519, + "grad_norm": 0.04719018615985143, + "learning_rate": 2.9011876872950818e-08, + "loss": 0.0003, + "step": 31215 + }, + { + "epoch": 12.694591297275315, + "grad_norm": 
0.002685222651076584, + "learning_rate": 2.893480613526234e-08, + "loss": 0.0, + "step": 31216 + }, + { + "epoch": 12.69499796665311, + "grad_norm": 1.737230184055722e-05, + "learning_rate": 2.8857837755574823e-08, + "loss": 0.0, + "step": 31217 + }, + { + "epoch": 12.695404636030906, + "grad_norm": 0.017454572884488245, + "learning_rate": 2.8780971734680974e-08, + "loss": 0.0001, + "step": 31218 + }, + { + "epoch": 12.695811305408704, + "grad_norm": 0.0001021091087700615, + "learning_rate": 2.8704208073367933e-08, + "loss": 0.0, + "step": 31219 + }, + { + "epoch": 12.6962179747865, + "grad_norm": 0.00018703098635991565, + "learning_rate": 2.8627546772426184e-08, + "loss": 0.0, + "step": 31220 + }, + { + "epoch": 12.696624644164295, + "grad_norm": 0.0675132123159113, + "learning_rate": 2.8550987832640654e-08, + "loss": 0.0007, + "step": 31221 + }, + { + "epoch": 12.697031313542091, + "grad_norm": 0.0020488655508234315, + "learning_rate": 2.847453125479738e-08, + "loss": 0.0, + "step": 31222 + }, + { + "epoch": 12.697437982919887, + "grad_norm": 0.0024533211692471385, + "learning_rate": 2.8398177039681284e-08, + "loss": 0.0, + "step": 31223 + }, + { + "epoch": 12.697844652297682, + "grad_norm": 0.004130478165964963, + "learning_rate": 2.8321925188076195e-08, + "loss": 0.0, + "step": 31224 + }, + { + "epoch": 12.698251321675478, + "grad_norm": 0.012177894368010618, + "learning_rate": 2.824577570076592e-08, + "loss": 0.0001, + "step": 31225 + }, + { + "epoch": 12.698657991053274, + "grad_norm": 0.0010398528019562236, + "learning_rate": 2.8169728578532062e-08, + "loss": 0.0, + "step": 31226 + }, + { + "epoch": 12.69906466043107, + "grad_norm": 0.008400346513399768, + "learning_rate": 2.8093783822154e-08, + "loss": 0.0001, + "step": 31227 + }, + { + "epoch": 12.699471329808866, + "grad_norm": 0.04923635022325887, + "learning_rate": 2.80179414324111e-08, + "loss": 0.0004, + "step": 31228 + }, + { + "epoch": 12.699877999186661, + "grad_norm": 0.011903916599369464, + "learning_rate": 2.794220141008386e-08, + "loss": 0.0001, + "step": 31229 + }, + { + "epoch": 12.700284668564457, + "grad_norm": 0.0014819296136359369, + "learning_rate": 2.786656375594832e-08, + "loss": 0.0, + "step": 31230 + }, + { + "epoch": 12.700691337942253, + "grad_norm": 0.003928659365664872, + "learning_rate": 2.7791028470781633e-08, + "loss": 0.0, + "step": 31231 + }, + { + "epoch": 12.701098007320049, + "grad_norm": 0.0037779234909369394, + "learning_rate": 2.771559555535874e-08, + "loss": 0.0, + "step": 31232 + }, + { + "epoch": 12.701504676697844, + "grad_norm": 0.0026729872405875745, + "learning_rate": 2.764026501045458e-08, + "loss": 0.0, + "step": 31233 + }, + { + "epoch": 12.70191134607564, + "grad_norm": 0.026033489841454073, + "learning_rate": 2.7565036836841864e-08, + "loss": 0.0002, + "step": 31234 + }, + { + "epoch": 12.702318015453436, + "grad_norm": 0.005709172464744634, + "learning_rate": 2.7489911035293303e-08, + "loss": 0.0, + "step": 31235 + }, + { + "epoch": 12.702724684831232, + "grad_norm": 0.0005575842262715479, + "learning_rate": 2.74148876065794e-08, + "loss": 0.0, + "step": 31236 + }, + { + "epoch": 12.703131354209027, + "grad_norm": 0.012546622704615184, + "learning_rate": 2.7339966551471753e-08, + "loss": 0.0001, + "step": 31237 + }, + { + "epoch": 12.703538023586823, + "grad_norm": 0.06595252587748791, + "learning_rate": 2.726514787073753e-08, + "loss": 0.0005, + "step": 31238 + }, + { + "epoch": 12.703944692964619, + "grad_norm": 0.19321465640745117, + "learning_rate": 2.7190431565146112e-08, + 
"loss": 0.0023, + "step": 31239 + }, + { + "epoch": 12.704351362342415, + "grad_norm": 0.009397807682562381, + "learning_rate": 2.711581763546356e-08, + "loss": 0.0001, + "step": 31240 + }, + { + "epoch": 12.704758031720212, + "grad_norm": 3.163177857022491e-05, + "learning_rate": 2.704130608245703e-08, + "loss": 0.0, + "step": 31241 + }, + { + "epoch": 12.705164701098008, + "grad_norm": 0.0008290755009664186, + "learning_rate": 2.696689690689036e-08, + "loss": 0.0, + "step": 31242 + }, + { + "epoch": 12.705571370475804, + "grad_norm": 0.03662460453588317, + "learning_rate": 2.6892590109527385e-08, + "loss": 0.0002, + "step": 31243 + }, + { + "epoch": 12.7059780398536, + "grad_norm": 0.000869043586568874, + "learning_rate": 2.681838569113193e-08, + "loss": 0.0, + "step": 31244 + }, + { + "epoch": 12.706384709231395, + "grad_norm": 0.000309487759598617, + "learning_rate": 2.674428365246562e-08, + "loss": 0.0, + "step": 31245 + }, + { + "epoch": 12.706791378609191, + "grad_norm": 0.000931458382926711, + "learning_rate": 2.667028399428784e-08, + "loss": 0.0, + "step": 31246 + }, + { + "epoch": 12.707198047986987, + "grad_norm": 0.005421800918331189, + "learning_rate": 2.65963867173602e-08, + "loss": 0.0, + "step": 31247 + }, + { + "epoch": 12.707604717364783, + "grad_norm": 0.0006375353577345958, + "learning_rate": 2.652259182243877e-08, + "loss": 0.0, + "step": 31248 + }, + { + "epoch": 12.708011386742578, + "grad_norm": 0.002822560201651091, + "learning_rate": 2.644889931028405e-08, + "loss": 0.0, + "step": 31249 + }, + { + "epoch": 12.708418056120374, + "grad_norm": 0.00013189841780248253, + "learning_rate": 2.637530918164877e-08, + "loss": 0.0, + "step": 31250 + }, + { + "epoch": 12.70882472549817, + "grad_norm": 0.06036442982310223, + "learning_rate": 2.630182143729232e-08, + "loss": 0.0004, + "step": 31251 + }, + { + "epoch": 12.709231394875966, + "grad_norm": 0.004702229865043105, + "learning_rate": 2.6228436077967436e-08, + "loss": 0.0, + "step": 31252 + }, + { + "epoch": 12.709638064253761, + "grad_norm": 2.6221786234139475e-05, + "learning_rate": 2.6155153104426845e-08, + "loss": 0.0, + "step": 31253 + }, + { + "epoch": 12.710044733631557, + "grad_norm": 0.007867961682782658, + "learning_rate": 2.6081972517423283e-08, + "loss": 0.0001, + "step": 31254 + }, + { + "epoch": 12.710451403009353, + "grad_norm": 2.5325773148919122e-05, + "learning_rate": 2.6008894317707256e-08, + "loss": 0.0, + "step": 31255 + }, + { + "epoch": 12.710858072387149, + "grad_norm": 9.901265200108216e-05, + "learning_rate": 2.593591850603039e-08, + "loss": 0.0, + "step": 31256 + }, + { + "epoch": 12.711264741764944, + "grad_norm": 0.0009030062779261245, + "learning_rate": 2.5863045083140968e-08, + "loss": 0.0, + "step": 31257 + }, + { + "epoch": 12.71167141114274, + "grad_norm": 0.004046132014460337, + "learning_rate": 2.57902740497884e-08, + "loss": 0.0, + "step": 31258 + }, + { + "epoch": 12.712078080520536, + "grad_norm": 0.0002033210412060076, + "learning_rate": 2.571760540671764e-08, + "loss": 0.0, + "step": 31259 + }, + { + "epoch": 12.712484749898334, + "grad_norm": 0.0027977756113915196, + "learning_rate": 2.5645039154675867e-08, + "loss": 0.0, + "step": 31260 + }, + { + "epoch": 12.71289141927613, + "grad_norm": 0.016232616014879776, + "learning_rate": 2.557257529440693e-08, + "loss": 0.0001, + "step": 31261 + }, + { + "epoch": 12.713298088653925, + "grad_norm": 0.0025431646579598343, + "learning_rate": 2.5500213826656906e-08, + "loss": 0.0, + "step": 31262 + }, + { + "epoch": 12.71370475803172, + 
"grad_norm": 0.03810511595516027, + "learning_rate": 2.5427954752166306e-08, + "loss": 0.0003, + "step": 31263 + }, + { + "epoch": 12.714111427409517, + "grad_norm": 0.0015618408827745873, + "learning_rate": 2.5355798071677874e-08, + "loss": 0.0, + "step": 31264 + }, + { + "epoch": 12.714518096787312, + "grad_norm": 0.013939917860669762, + "learning_rate": 2.5283743785932124e-08, + "loss": 0.0001, + "step": 31265 + }, + { + "epoch": 12.714924766165108, + "grad_norm": 0.028280156597974938, + "learning_rate": 2.521179189566958e-08, + "loss": 0.0001, + "step": 31266 + }, + { + "epoch": 12.715331435542904, + "grad_norm": 0.005710959319734222, + "learning_rate": 2.5139942401627425e-08, + "loss": 0.0, + "step": 31267 + }, + { + "epoch": 12.7157381049207, + "grad_norm": 0.00023498770465978403, + "learning_rate": 2.5068195304545074e-08, + "loss": 0.0, + "step": 31268 + }, + { + "epoch": 12.716144774298495, + "grad_norm": 0.0025509379287394804, + "learning_rate": 2.499655060515749e-08, + "loss": 0.0, + "step": 31269 + }, + { + "epoch": 12.716551443676291, + "grad_norm": 0.17938213093197775, + "learning_rate": 2.492500830419964e-08, + "loss": 0.0016, + "step": 31270 + }, + { + "epoch": 12.716958113054087, + "grad_norm": 1.7774749924297614e-05, + "learning_rate": 2.4853568402408714e-08, + "loss": 0.0, + "step": 31271 + }, + { + "epoch": 12.717364782431883, + "grad_norm": 0.09779757194534065, + "learning_rate": 2.4782230900515237e-08, + "loss": 0.001, + "step": 31272 + }, + { + "epoch": 12.717771451809678, + "grad_norm": 0.06172430963307501, + "learning_rate": 2.4710995799253067e-08, + "loss": 0.0002, + "step": 31273 + }, + { + "epoch": 12.718178121187474, + "grad_norm": 0.0020554152862385386, + "learning_rate": 2.4639863099352733e-08, + "loss": 0.0, + "step": 31274 + }, + { + "epoch": 12.71858479056527, + "grad_norm": 0.012332093410765271, + "learning_rate": 2.4568832801544763e-08, + "loss": 0.0001, + "step": 31275 + }, + { + "epoch": 12.718991459943066, + "grad_norm": 0.009973786829155737, + "learning_rate": 2.4497904906557458e-08, + "loss": 0.0001, + "step": 31276 + }, + { + "epoch": 12.719398129320862, + "grad_norm": 0.11089079606144125, + "learning_rate": 2.4427079415120236e-08, + "loss": 0.0007, + "step": 31277 + }, + { + "epoch": 12.719804798698657, + "grad_norm": 0.014472692469890844, + "learning_rate": 2.4356356327960295e-08, + "loss": 0.0001, + "step": 31278 + }, + { + "epoch": 12.720211468076453, + "grad_norm": 0.0024669585816035873, + "learning_rate": 2.428573564580261e-08, + "loss": 0.0, + "step": 31279 + }, + { + "epoch": 12.720618137454249, + "grad_norm": 0.0003649968570153687, + "learning_rate": 2.4215217369372157e-08, + "loss": 0.0, + "step": 31280 + }, + { + "epoch": 12.721024806832045, + "grad_norm": 0.0006430974012969102, + "learning_rate": 2.41448014993928e-08, + "loss": 0.0, + "step": 31281 + }, + { + "epoch": 12.721431476209842, + "grad_norm": 0.0015098231876337976, + "learning_rate": 2.4074488036589515e-08, + "loss": 0.0, + "step": 31282 + }, + { + "epoch": 12.721838145587638, + "grad_norm": 0.00950301551631684, + "learning_rate": 2.400427698168062e-08, + "loss": 0.0001, + "step": 31283 + }, + { + "epoch": 12.722244814965434, + "grad_norm": 0.007278393972425979, + "learning_rate": 2.3934168335388865e-08, + "loss": 0.0001, + "step": 31284 + }, + { + "epoch": 12.72265148434323, + "grad_norm": 0.007426361238757451, + "learning_rate": 2.3864162098434784e-08, + "loss": 0.0001, + "step": 31285 + }, + { + "epoch": 12.723058153721025, + "grad_norm": 0.014816977309172827, + 
"learning_rate": 2.3794258271535586e-08, + "loss": 0.0001, + "step": 31286 + }, + { + "epoch": 12.723464823098821, + "grad_norm": 0.00027730980481470766, + "learning_rate": 2.3724456855408473e-08, + "loss": 0.0, + "step": 31287 + }, + { + "epoch": 12.723871492476617, + "grad_norm": 0.016477574129354827, + "learning_rate": 2.3654757850771758e-08, + "loss": 0.0001, + "step": 31288 + }, + { + "epoch": 12.724278161854413, + "grad_norm": 0.00011468607822213957, + "learning_rate": 2.358516125833932e-08, + "loss": 0.0, + "step": 31289 + }, + { + "epoch": 12.724684831232208, + "grad_norm": 0.001926163887981114, + "learning_rate": 2.3515667078826133e-08, + "loss": 0.0, + "step": 31290 + }, + { + "epoch": 12.725091500610004, + "grad_norm": 0.0041512282257609385, + "learning_rate": 2.3446275312946075e-08, + "loss": 0.0, + "step": 31291 + }, + { + "epoch": 12.7254981699878, + "grad_norm": 0.0037310251596797657, + "learning_rate": 2.3376985961410804e-08, + "loss": 0.0, + "step": 31292 + }, + { + "epoch": 12.725904839365596, + "grad_norm": 0.008404130405279868, + "learning_rate": 2.3307799024933075e-08, + "loss": 0.0001, + "step": 31293 + }, + { + "epoch": 12.726311508743391, + "grad_norm": 0.01466320136392545, + "learning_rate": 2.3238714504220107e-08, + "loss": 0.0001, + "step": 31294 + }, + { + "epoch": 12.726718178121187, + "grad_norm": 0.0015355032931161465, + "learning_rate": 2.3169732399984657e-08, + "loss": 0.0, + "step": 31295 + }, + { + "epoch": 12.727124847498983, + "grad_norm": 0.0036767603677673065, + "learning_rate": 2.3100852712931722e-08, + "loss": 0.0, + "step": 31296 + }, + { + "epoch": 12.727531516876779, + "grad_norm": 0.0099569812071498, + "learning_rate": 2.303207544377073e-08, + "loss": 0.0001, + "step": 31297 + }, + { + "epoch": 12.727938186254574, + "grad_norm": 0.0027153453298081136, + "learning_rate": 2.2963400593205565e-08, + "loss": 0.0, + "step": 31298 + }, + { + "epoch": 12.72834485563237, + "grad_norm": 0.002393416760706641, + "learning_rate": 2.2894828161943437e-08, + "loss": 0.0, + "step": 31299 + }, + { + "epoch": 12.728751525010166, + "grad_norm": 0.002314052566096877, + "learning_rate": 2.2826358150687122e-08, + "loss": 0.0, + "step": 31300 + }, + { + "epoch": 12.729158194387963, + "grad_norm": 0.0004966067620738825, + "learning_rate": 2.2757990560140497e-08, + "loss": 0.0, + "step": 31301 + }, + { + "epoch": 12.72956486376576, + "grad_norm": 0.0114199145129446, + "learning_rate": 2.2689725391003003e-08, + "loss": 0.0001, + "step": 31302 + }, + { + "epoch": 12.729971533143555, + "grad_norm": 0.002476832388274415, + "learning_rate": 2.2621562643977413e-08, + "loss": 0.0, + "step": 31303 + }, + { + "epoch": 12.73037820252135, + "grad_norm": 0.0003924835668701161, + "learning_rate": 2.2553502319763164e-08, + "loss": 0.0, + "step": 31304 + }, + { + "epoch": 12.730784871899147, + "grad_norm": 0.0012479142781838214, + "learning_rate": 2.24855444190597e-08, + "loss": 0.0, + "step": 31305 + }, + { + "epoch": 12.731191541276942, + "grad_norm": 0.01794984903005168, + "learning_rate": 2.241768894256202e-08, + "loss": 0.0002, + "step": 31306 + }, + { + "epoch": 12.731598210654738, + "grad_norm": 0.03597051115044182, + "learning_rate": 2.2349935890969566e-08, + "loss": 0.0003, + "step": 31307 + }, + { + "epoch": 12.732004880032534, + "grad_norm": 0.017152292900133037, + "learning_rate": 2.2282285264976224e-08, + "loss": 0.0001, + "step": 31308 + }, + { + "epoch": 12.73241154941033, + "grad_norm": 0.022790524738391164, + "learning_rate": 2.221473706527699e-08, + "loss": 0.0001, + 
"step": 31309 + }, + { + "epoch": 12.732818218788125, + "grad_norm": 0.0272856787470679, + "learning_rate": 2.2147291292564654e-08, + "loss": 0.0002, + "step": 31310 + }, + { + "epoch": 12.733224888165921, + "grad_norm": 0.00015093469236024214, + "learning_rate": 2.2079947947533098e-08, + "loss": 0.0, + "step": 31311 + }, + { + "epoch": 12.733631557543717, + "grad_norm": 0.03304307617907759, + "learning_rate": 2.2012707030871773e-08, + "loss": 0.0003, + "step": 31312 + }, + { + "epoch": 12.734038226921513, + "grad_norm": 0.02338190008277607, + "learning_rate": 2.1945568543272346e-08, + "loss": 0.0002, + "step": 31313 + }, + { + "epoch": 12.734444896299308, + "grad_norm": 0.002583223519819157, + "learning_rate": 2.1878532485423154e-08, + "loss": 0.0, + "step": 31314 + }, + { + "epoch": 12.734851565677104, + "grad_norm": 0.003584536727763696, + "learning_rate": 2.181159885801365e-08, + "loss": 0.0, + "step": 31315 + }, + { + "epoch": 12.7352582350549, + "grad_norm": 0.002408278583024181, + "learning_rate": 2.174476766172884e-08, + "loss": 0.0, + "step": 31316 + }, + { + "epoch": 12.735664904432696, + "grad_norm": 0.009351204282979263, + "learning_rate": 2.167803889725595e-08, + "loss": 0.0001, + "step": 31317 + }, + { + "epoch": 12.736071573810491, + "grad_norm": 0.016043033312927186, + "learning_rate": 2.16114125652811e-08, + "loss": 0.0001, + "step": 31318 + }, + { + "epoch": 12.736478243188287, + "grad_norm": 0.02231077132463361, + "learning_rate": 2.1544888666487073e-08, + "loss": 0.0002, + "step": 31319 + }, + { + "epoch": 12.736884912566083, + "grad_norm": 0.0018840097305158514, + "learning_rate": 2.1478467201555552e-08, + "loss": 0.0, + "step": 31320 + }, + { + "epoch": 12.737291581943879, + "grad_norm": 0.00032167190090626505, + "learning_rate": 2.1412148171170434e-08, + "loss": 0.0, + "step": 31321 + }, + { + "epoch": 12.737698251321675, + "grad_norm": 0.002846856861548123, + "learning_rate": 2.134593157601117e-08, + "loss": 0.0, + "step": 31322 + }, + { + "epoch": 12.738104920699472, + "grad_norm": 0.00264622061424258, + "learning_rate": 2.127981741675833e-08, + "loss": 0.0, + "step": 31323 + }, + { + "epoch": 12.738511590077268, + "grad_norm": 0.013444949068746383, + "learning_rate": 2.1213805694091373e-08, + "loss": 0.0002, + "step": 31324 + }, + { + "epoch": 12.738918259455064, + "grad_norm": 0.0010303545078286108, + "learning_rate": 2.114789640868531e-08, + "loss": 0.0, + "step": 31325 + }, + { + "epoch": 12.73932492883286, + "grad_norm": 0.002828741245005783, + "learning_rate": 2.1082089561219598e-08, + "loss": 0.0, + "step": 31326 + }, + { + "epoch": 12.739731598210655, + "grad_norm": 0.003434849560895651, + "learning_rate": 2.1016385152368146e-08, + "loss": 0.0, + "step": 31327 + }, + { + "epoch": 12.740138267588451, + "grad_norm": 0.36482289127680634, + "learning_rate": 2.0950783182804858e-08, + "loss": 0.0028, + "step": 31328 + }, + { + "epoch": 12.740544936966247, + "grad_norm": 0.08851053698953212, + "learning_rate": 2.0885283653205858e-08, + "loss": 0.0009, + "step": 31329 + }, + { + "epoch": 12.740951606344042, + "grad_norm": 0.006658958806540985, + "learning_rate": 2.081988656424061e-08, + "loss": 0.0, + "step": 31330 + }, + { + "epoch": 12.741358275721838, + "grad_norm": 0.011975810946833509, + "learning_rate": 2.075459191658191e-08, + "loss": 0.0, + "step": 31331 + }, + { + "epoch": 12.741764945099634, + "grad_norm": 0.003143266759256645, + "learning_rate": 2.0689399710899227e-08, + "loss": 0.0, + "step": 31332 + }, + { + "epoch": 12.74217161447743, + "grad_norm": 
0.00250399343073404, + "learning_rate": 2.062430994786313e-08, + "loss": 0.0, + "step": 31333 + }, + { + "epoch": 12.742578283855226, + "grad_norm": 0.005137862348104328, + "learning_rate": 2.0559322628140866e-08, + "loss": 0.0, + "step": 31334 + }, + { + "epoch": 12.742984953233021, + "grad_norm": 0.001603451863788168, + "learning_rate": 2.0494437752399675e-08, + "loss": 0.0, + "step": 31335 + }, + { + "epoch": 12.743391622610817, + "grad_norm": 0.0330243331700847, + "learning_rate": 2.0429655321305697e-08, + "loss": 0.0003, + "step": 31336 + }, + { + "epoch": 12.743798291988613, + "grad_norm": 0.0010095083482039979, + "learning_rate": 2.036497533552395e-08, + "loss": 0.0, + "step": 31337 + }, + { + "epoch": 12.744204961366409, + "grad_norm": 0.0038285956943938256, + "learning_rate": 2.0300397795718352e-08, + "loss": 0.0, + "step": 31338 + }, + { + "epoch": 12.744611630744204, + "grad_norm": 0.0002677378605820541, + "learning_rate": 2.0235922702551703e-08, + "loss": 0.0, + "step": 31339 + }, + { + "epoch": 12.745018300122, + "grad_norm": 0.00035806341410470706, + "learning_rate": 2.0171550056685697e-08, + "loss": 0.0, + "step": 31340 + }, + { + "epoch": 12.745424969499796, + "grad_norm": 0.23661954451492465, + "learning_rate": 2.0107279858782024e-08, + "loss": 0.0018, + "step": 31341 + }, + { + "epoch": 12.745831638877593, + "grad_norm": 0.0001189338884005468, + "learning_rate": 2.0043112109500163e-08, + "loss": 0.0, + "step": 31342 + }, + { + "epoch": 12.74623830825539, + "grad_norm": 0.007601370506252721, + "learning_rate": 1.997904680949736e-08, + "loss": 0.0001, + "step": 31343 + }, + { + "epoch": 12.746644977633185, + "grad_norm": 7.625161695360337e-06, + "learning_rate": 1.991508395943309e-08, + "loss": 0.0, + "step": 31344 + }, + { + "epoch": 12.74705164701098, + "grad_norm": 0.0036123050138627513, + "learning_rate": 1.9851223559963494e-08, + "loss": 0.0, + "step": 31345 + }, + { + "epoch": 12.747458316388776, + "grad_norm": 0.0062559163632703455, + "learning_rate": 1.9787465611744717e-08, + "loss": 0.0001, + "step": 31346 + }, + { + "epoch": 12.747864985766572, + "grad_norm": 0.00034973640494395165, + "learning_rate": 1.9723810115429565e-08, + "loss": 0.0, + "step": 31347 + }, + { + "epoch": 12.748271655144368, + "grad_norm": 0.0735347162956101, + "learning_rate": 1.9660257071673074e-08, + "loss": 0.0003, + "step": 31348 + }, + { + "epoch": 12.748678324522164, + "grad_norm": 0.0083103521410183, + "learning_rate": 1.959680648112583e-08, + "loss": 0.0001, + "step": 31349 + }, + { + "epoch": 12.74908499389996, + "grad_norm": 0.0031721040736105654, + "learning_rate": 1.9533458344441757e-08, + "loss": 0.0, + "step": 31350 + }, + { + "epoch": 12.749491663277755, + "grad_norm": 0.13116644750021836, + "learning_rate": 1.9470212662269227e-08, + "loss": 0.0013, + "step": 31351 + }, + { + "epoch": 12.749898332655551, + "grad_norm": 0.0021710907393619615, + "learning_rate": 1.940706943525883e-08, + "loss": 0.0, + "step": 31352 + }, + { + "epoch": 12.750305002033347, + "grad_norm": 0.0007252151370528462, + "learning_rate": 1.9344028664056715e-08, + "loss": 0.0, + "step": 31353 + }, + { + "epoch": 12.750711671411143, + "grad_norm": 0.003511966049698401, + "learning_rate": 1.928109034931125e-08, + "loss": 0.0, + "step": 31354 + }, + { + "epoch": 12.751118340788938, + "grad_norm": 0.00024388946649094048, + "learning_rate": 1.9218254491669696e-08, + "loss": 0.0, + "step": 31355 + }, + { + "epoch": 12.751525010166734, + "grad_norm": 0.0019966450512054123, + "learning_rate": 1.9155521091775986e-08, 
+ "loss": 0.0, + "step": 31356 + }, + { + "epoch": 12.75193167954453, + "grad_norm": 0.011744521437263629, + "learning_rate": 1.9092890150272935e-08, + "loss": 0.0001, + "step": 31357 + }, + { + "epoch": 12.752338348922326, + "grad_norm": 0.051799944331926744, + "learning_rate": 1.9030361667805585e-08, + "loss": 0.0004, + "step": 31358 + }, + { + "epoch": 12.752745018300121, + "grad_norm": 0.03366936115666236, + "learning_rate": 1.8967935645014534e-08, + "loss": 0.0001, + "step": 31359 + }, + { + "epoch": 12.753151687677917, + "grad_norm": 0.004409361327259824, + "learning_rate": 1.890561208254149e-08, + "loss": 0.0, + "step": 31360 + }, + { + "epoch": 12.753558357055713, + "grad_norm": 0.0030595522539581467, + "learning_rate": 1.8843390981024835e-08, + "loss": 0.0, + "step": 31361 + }, + { + "epoch": 12.753965026433509, + "grad_norm": 0.01840257918173954, + "learning_rate": 1.8781272341105163e-08, + "loss": 0.0002, + "step": 31362 + }, + { + "epoch": 12.754371695811304, + "grad_norm": 0.010539817125215668, + "learning_rate": 1.871925616341863e-08, + "loss": 0.0001, + "step": 31363 + }, + { + "epoch": 12.754778365189102, + "grad_norm": 0.021102458783378842, + "learning_rate": 1.8657342448602512e-08, + "loss": 0.0002, + "step": 31364 + }, + { + "epoch": 12.755185034566898, + "grad_norm": 0.003286325123370555, + "learning_rate": 1.859553119729185e-08, + "loss": 0.0, + "step": 31365 + }, + { + "epoch": 12.755591703944694, + "grad_norm": 0.0002151929583207561, + "learning_rate": 1.8533822410121695e-08, + "loss": 0.0, + "step": 31366 + }, + { + "epoch": 12.75599837332249, + "grad_norm": 0.026255098508700465, + "learning_rate": 1.847221608772598e-08, + "loss": 0.0003, + "step": 31367 + }, + { + "epoch": 12.756405042700285, + "grad_norm": 0.00035363961796127996, + "learning_rate": 1.8410712230736426e-08, + "loss": 0.0, + "step": 31368 + }, + { + "epoch": 12.75681171207808, + "grad_norm": 0.006158866703141747, + "learning_rate": 1.834931083978475e-08, + "loss": 0.0, + "step": 31369 + }, + { + "epoch": 12.757218381455877, + "grad_norm": 0.004467417595111017, + "learning_rate": 1.828801191550045e-08, + "loss": 0.0, + "step": 31370 + }, + { + "epoch": 12.757625050833672, + "grad_norm": 0.0007376718649554361, + "learning_rate": 1.8226815458514124e-08, + "loss": 0.0, + "step": 31371 + }, + { + "epoch": 12.758031720211468, + "grad_norm": 0.00022214824953884323, + "learning_rate": 1.8165721469451946e-08, + "loss": 0.0, + "step": 31372 + }, + { + "epoch": 12.758438389589264, + "grad_norm": 0.0002662115437073557, + "learning_rate": 1.810472994894341e-08, + "loss": 0.0, + "step": 31373 + }, + { + "epoch": 12.75884505896706, + "grad_norm": 0.012577299464698021, + "learning_rate": 1.8043840897613574e-08, + "loss": 0.0001, + "step": 31374 + }, + { + "epoch": 12.759251728344855, + "grad_norm": 0.0008396524769337732, + "learning_rate": 1.798305431608749e-08, + "loss": 0.0, + "step": 31375 + }, + { + "epoch": 12.759658397722651, + "grad_norm": 0.0004439557416966591, + "learning_rate": 1.7922370204989102e-08, + "loss": 0.0, + "step": 31376 + }, + { + "epoch": 12.760065067100447, + "grad_norm": 0.014124214665675885, + "learning_rate": 1.786178856494125e-08, + "loss": 0.0002, + "step": 31377 + }, + { + "epoch": 12.760471736478243, + "grad_norm": 0.007257570917830226, + "learning_rate": 1.7801309396566768e-08, + "loss": 0.0, + "step": 31378 + }, + { + "epoch": 12.760878405856038, + "grad_norm": 0.0005291254733739275, + "learning_rate": 1.774093270048405e-08, + "loss": 0.0, + "step": 31379 + }, + { + "epoch": 
12.761285075233834, + "grad_norm": 0.004278235113685479, + "learning_rate": 1.7680658477315925e-08, + "loss": 0.0, + "step": 31380 + }, + { + "epoch": 12.76169174461163, + "grad_norm": 0.07345505611212924, + "learning_rate": 1.7620486727679686e-08, + "loss": 0.0004, + "step": 31381 + }, + { + "epoch": 12.762098413989426, + "grad_norm": 0.0009495437862735071, + "learning_rate": 1.7560417452193724e-08, + "loss": 0.0, + "step": 31382 + }, + { + "epoch": 12.762505083367223, + "grad_norm": 0.0005149270494791872, + "learning_rate": 1.750045065147421e-08, + "loss": 0.0, + "step": 31383 + }, + { + "epoch": 12.762911752745019, + "grad_norm": 0.01517124656206106, + "learning_rate": 1.74405863261351e-08, + "loss": 0.0001, + "step": 31384 + }, + { + "epoch": 12.763318422122815, + "grad_norm": 0.0014522534202941462, + "learning_rate": 1.738082447679368e-08, + "loss": 0.0, + "step": 31385 + }, + { + "epoch": 12.76372509150061, + "grad_norm": 0.013026103350469005, + "learning_rate": 1.7321165104061677e-08, + "loss": 0.0001, + "step": 31386 + }, + { + "epoch": 12.764131760878406, + "grad_norm": 0.0017854605249044654, + "learning_rate": 1.7261608208553048e-08, + "loss": 0.0, + "step": 31387 + }, + { + "epoch": 12.764538430256202, + "grad_norm": 0.03840763968561123, + "learning_rate": 1.7202153790877306e-08, + "loss": 0.0002, + "step": 31388 + }, + { + "epoch": 12.764945099633998, + "grad_norm": 0.006248255674940014, + "learning_rate": 1.7142801851646186e-08, + "loss": 0.0, + "step": 31389 + }, + { + "epoch": 12.765351769011794, + "grad_norm": 0.006786471978658034, + "learning_rate": 1.7083552391468084e-08, + "loss": 0.0, + "step": 31390 + }, + { + "epoch": 12.76575843838959, + "grad_norm": 0.010215285482775036, + "learning_rate": 1.7024405410952517e-08, + "loss": 0.0001, + "step": 31391 + }, + { + "epoch": 12.766165107767385, + "grad_norm": 0.0004987289343890133, + "learning_rate": 1.6965360910704555e-08, + "loss": 0.0, + "step": 31392 + }, + { + "epoch": 12.766571777145181, + "grad_norm": 0.0020925503301460545, + "learning_rate": 1.69064188913326e-08, + "loss": 0.0, + "step": 31393 + }, + { + "epoch": 12.766978446522977, + "grad_norm": 0.004228590878401199, + "learning_rate": 1.6847579353439503e-08, + "loss": 0.0, + "step": 31394 + }, + { + "epoch": 12.767385115900773, + "grad_norm": 0.08698249414745039, + "learning_rate": 1.6788842297631448e-08, + "loss": 0.0005, + "step": 31395 + }, + { + "epoch": 12.767791785278568, + "grad_norm": 0.0037508583047860186, + "learning_rate": 1.6730207724509064e-08, + "loss": 0.0, + "step": 31396 + }, + { + "epoch": 12.768198454656364, + "grad_norm": 0.013003264630820867, + "learning_rate": 1.667167563467742e-08, + "loss": 0.0001, + "step": 31397 + }, + { + "epoch": 12.76860512403416, + "grad_norm": 0.009144415892264744, + "learning_rate": 1.661324602873382e-08, + "loss": 0.0001, + "step": 31398 + }, + { + "epoch": 12.769011793411956, + "grad_norm": 0.004128774987160604, + "learning_rate": 1.6554918907280004e-08, + "loss": 0.0, + "step": 31399 + }, + { + "epoch": 12.769418462789751, + "grad_norm": 0.0008460114271619246, + "learning_rate": 1.649669427091549e-08, + "loss": 0.0, + "step": 31400 + }, + { + "epoch": 12.769825132167547, + "grad_norm": 0.06775033371262813, + "learning_rate": 1.6438572120236474e-08, + "loss": 0.0005, + "step": 31401 + }, + { + "epoch": 12.770231801545343, + "grad_norm": 0.034039619911048535, + "learning_rate": 1.638055245584025e-08, + "loss": 0.0003, + "step": 31402 + }, + { + "epoch": 12.770638470923139, + "grad_norm": 0.015320420253603053, + 
"learning_rate": 1.6322635278321897e-08, + "loss": 0.0001, + "step": 31403 + }, + { + "epoch": 12.771045140300934, + "grad_norm": 5.845045242390632e-05, + "learning_rate": 1.626482058827761e-08, + "loss": 0.0, + "step": 31404 + }, + { + "epoch": 12.771451809678732, + "grad_norm": 0.0046375535053410025, + "learning_rate": 1.620710838629802e-08, + "loss": 0.0, + "step": 31405 + }, + { + "epoch": 12.771858479056528, + "grad_norm": 0.001616207011423234, + "learning_rate": 1.6149498672978216e-08, + "loss": 0.0, + "step": 31406 + }, + { + "epoch": 12.772265148434323, + "grad_norm": 0.0005907986034081281, + "learning_rate": 1.609199144890772e-08, + "loss": 0.0, + "step": 31407 + }, + { + "epoch": 12.77267181781212, + "grad_norm": 0.004848022160361608, + "learning_rate": 1.603458671467828e-08, + "loss": 0.0, + "step": 31408 + }, + { + "epoch": 12.773078487189915, + "grad_norm": 6.537095396961388e-05, + "learning_rate": 1.597728447087943e-08, + "loss": 0.0, + "step": 31409 + }, + { + "epoch": 12.77348515656771, + "grad_norm": 0.0379603209407945, + "learning_rate": 1.592008471809736e-08, + "loss": 0.0003, + "step": 31410 + }, + { + "epoch": 12.773891825945507, + "grad_norm": 0.0009043830158732589, + "learning_rate": 1.5862987456921607e-08, + "loss": 0.0, + "step": 31411 + }, + { + "epoch": 12.774298495323302, + "grad_norm": 0.0026215095609745556, + "learning_rate": 1.5805992687936145e-08, + "loss": 0.0, + "step": 31412 + }, + { + "epoch": 12.774705164701098, + "grad_norm": 0.024529057820065807, + "learning_rate": 1.574910041172717e-08, + "loss": 0.0001, + "step": 31413 + }, + { + "epoch": 12.775111834078894, + "grad_norm": 0.0016250020020195234, + "learning_rate": 1.569231062887866e-08, + "loss": 0.0, + "step": 31414 + }, + { + "epoch": 12.77551850345669, + "grad_norm": 0.0003274561920387351, + "learning_rate": 1.5635623339974594e-08, + "loss": 0.0, + "step": 31415 + }, + { + "epoch": 12.775925172834485, + "grad_norm": 0.018899560432962342, + "learning_rate": 1.5579038545595616e-08, + "loss": 0.0001, + "step": 31416 + }, + { + "epoch": 12.776331842212281, + "grad_norm": 0.0021919434590883797, + "learning_rate": 1.5522556246322373e-08, + "loss": 0.0, + "step": 31417 + }, + { + "epoch": 12.776738511590077, + "grad_norm": 0.008616132722541678, + "learning_rate": 1.5466176442734405e-08, + "loss": 0.0001, + "step": 31418 + }, + { + "epoch": 12.777145180967873, + "grad_norm": 0.033198412773442584, + "learning_rate": 1.5409899135412353e-08, + "loss": 0.0004, + "step": 31419 + }, + { + "epoch": 12.777551850345668, + "grad_norm": 8.946167521614935e-05, + "learning_rate": 1.5353724324932428e-08, + "loss": 0.0, + "step": 31420 + }, + { + "epoch": 12.777958519723464, + "grad_norm": 0.00539953049344947, + "learning_rate": 1.5297652011871945e-08, + "loss": 0.0001, + "step": 31421 + }, + { + "epoch": 12.77836518910126, + "grad_norm": 0.00017873499656937867, + "learning_rate": 1.5241682196805997e-08, + "loss": 0.0, + "step": 31422 + }, + { + "epoch": 12.778771858479056, + "grad_norm": 0.049978641736696146, + "learning_rate": 1.518581488030968e-08, + "loss": 0.0005, + "step": 31423 + }, + { + "epoch": 12.779178527856853, + "grad_norm": 0.032447441818108776, + "learning_rate": 1.513005006295587e-08, + "loss": 0.0003, + "step": 31424 + }, + { + "epoch": 12.779585197234649, + "grad_norm": 0.013616724926349308, + "learning_rate": 1.5074387745318552e-08, + "loss": 0.0001, + "step": 31425 + }, + { + "epoch": 12.779991866612445, + "grad_norm": 9.079729957168226e-05, + "learning_rate": 1.5018827927966163e-08, + "loss": 0.0, 
+ "step": 31426 + }, + { + "epoch": 12.78039853599024, + "grad_norm": 0.04864252136782452, + "learning_rate": 1.496337061147157e-08, + "loss": 0.0005, + "step": 31427 + }, + { + "epoch": 12.780805205368036, + "grad_norm": 0.08899731132129936, + "learning_rate": 1.490801579640433e-08, + "loss": 0.0007, + "step": 31428 + }, + { + "epoch": 12.781211874745832, + "grad_norm": 0.008213200426682325, + "learning_rate": 1.4852763483330646e-08, + "loss": 0.0001, + "step": 31429 + }, + { + "epoch": 12.781618544123628, + "grad_norm": 0.013330723893697681, + "learning_rate": 1.4797613672818956e-08, + "loss": 0.0001, + "step": 31430 + }, + { + "epoch": 12.782025213501424, + "grad_norm": 0.0011927956010349802, + "learning_rate": 1.4742566365434363e-08, + "loss": 0.0, + "step": 31431 + }, + { + "epoch": 12.78243188287922, + "grad_norm": 0.017822635146524296, + "learning_rate": 1.4687621561744193e-08, + "loss": 0.0002, + "step": 31432 + }, + { + "epoch": 12.782838552257015, + "grad_norm": 0.0006120845064313411, + "learning_rate": 1.4632779262310215e-08, + "loss": 0.0, + "step": 31433 + }, + { + "epoch": 12.783245221634811, + "grad_norm": 0.03799768430197673, + "learning_rate": 1.4578039467696426e-08, + "loss": 0.0002, + "step": 31434 + }, + { + "epoch": 12.783651891012607, + "grad_norm": 0.08476992189735773, + "learning_rate": 1.4523402178465707e-08, + "loss": 0.0007, + "step": 31435 + }, + { + "epoch": 12.784058560390402, + "grad_norm": 0.03358806811442388, + "learning_rate": 1.4468867395176501e-08, + "loss": 0.0002, + "step": 31436 + }, + { + "epoch": 12.784465229768198, + "grad_norm": 0.007777725025478852, + "learning_rate": 1.441443511839058e-08, + "loss": 0.0, + "step": 31437 + }, + { + "epoch": 12.784871899145994, + "grad_norm": 9.930850211013197e-05, + "learning_rate": 1.4360105348665277e-08, + "loss": 0.0, + "step": 31438 + }, + { + "epoch": 12.78527856852379, + "grad_norm": 0.00838910927403595, + "learning_rate": 1.4305878086560143e-08, + "loss": 0.0001, + "step": 31439 + }, + { + "epoch": 12.785685237901586, + "grad_norm": 0.0006549499524320341, + "learning_rate": 1.4251753332630292e-08, + "loss": 0.0, + "step": 31440 + }, + { + "epoch": 12.786091907279381, + "grad_norm": 0.017711044733115588, + "learning_rate": 1.4197731087433053e-08, + "loss": 0.0002, + "step": 31441 + }, + { + "epoch": 12.786498576657177, + "grad_norm": 0.02165341166458895, + "learning_rate": 1.4143811351520208e-08, + "loss": 0.0001, + "step": 31442 + }, + { + "epoch": 12.786905246034973, + "grad_norm": 0.03801952020846712, + "learning_rate": 1.408999412544798e-08, + "loss": 0.0004, + "step": 31443 + }, + { + "epoch": 12.787311915412769, + "grad_norm": 7.14959809125822e-07, + "learning_rate": 1.403627940976704e-08, + "loss": 0.0, + "step": 31444 + }, + { + "epoch": 12.787718584790564, + "grad_norm": 0.0004416150519486789, + "learning_rate": 1.3982667205030276e-08, + "loss": 0.0, + "step": 31445 + }, + { + "epoch": 12.788125254168362, + "grad_norm": 0.02665384265063325, + "learning_rate": 1.392915751178725e-08, + "loss": 0.0002, + "step": 31446 + }, + { + "epoch": 12.788531923546158, + "grad_norm": 0.0025298842803946646, + "learning_rate": 1.3875750330586412e-08, + "loss": 0.0, + "step": 31447 + }, + { + "epoch": 12.788938592923953, + "grad_norm": 0.058998564917153294, + "learning_rate": 1.3822445661978434e-08, + "loss": 0.0006, + "step": 31448 + }, + { + "epoch": 12.78934526230175, + "grad_norm": 0.003488467557775469, + "learning_rate": 1.3769243506507323e-08, + "loss": 0.0, + "step": 31449 + }, + { + "epoch": 
12.789751931679545, + "grad_norm": 0.005058148819820609, + "learning_rate": 1.3716143864721532e-08, + "loss": 0.0001, + "step": 31450 + }, + { + "epoch": 12.79015860105734, + "grad_norm": 0.001755816879264035, + "learning_rate": 1.3663146737166178e-08, + "loss": 0.0, + "step": 31451 + }, + { + "epoch": 12.790565270435136, + "grad_norm": 0.007425452703618359, + "learning_rate": 1.361025212438416e-08, + "loss": 0.0001, + "step": 31452 + }, + { + "epoch": 12.790971939812932, + "grad_norm": 0.01834765310098339, + "learning_rate": 1.3557460026918379e-08, + "loss": 0.0002, + "step": 31453 + }, + { + "epoch": 12.791378609190728, + "grad_norm": 1.5707935714348687e-05, + "learning_rate": 1.3504770445312842e-08, + "loss": 0.0, + "step": 31454 + }, + { + "epoch": 12.791785278568524, + "grad_norm": 0.0032257320069235453, + "learning_rate": 1.34521833801049e-08, + "loss": 0.0, + "step": 31455 + }, + { + "epoch": 12.79219194794632, + "grad_norm": 0.004641914680300442, + "learning_rate": 1.3399698831837448e-08, + "loss": 0.0, + "step": 31456 + }, + { + "epoch": 12.792598617324115, + "grad_norm": 8.385944480944451e-05, + "learning_rate": 1.3347316801048948e-08, + "loss": 0.0, + "step": 31457 + }, + { + "epoch": 12.793005286701911, + "grad_norm": 0.0015756819994745322, + "learning_rate": 1.3295037288275636e-08, + "loss": 0.0, + "step": 31458 + }, + { + "epoch": 12.793411956079707, + "grad_norm": 0.0037061523666874234, + "learning_rate": 1.324286029405486e-08, + "loss": 0.0, + "step": 31459 + }, + { + "epoch": 12.793818625457503, + "grad_norm": 0.003337511426119215, + "learning_rate": 1.3190785818922858e-08, + "loss": 0.0, + "step": 31460 + }, + { + "epoch": 12.794225294835298, + "grad_norm": 0.03603931123511171, + "learning_rate": 1.3138813863413646e-08, + "loss": 0.0004, + "step": 31461 + }, + { + "epoch": 12.794631964213094, + "grad_norm": 0.03919161478640275, + "learning_rate": 1.3086944428060132e-08, + "loss": 0.0003, + "step": 31462 + }, + { + "epoch": 12.79503863359089, + "grad_norm": 0.000568801737682608, + "learning_rate": 1.3035177513395225e-08, + "loss": 0.0, + "step": 31463 + }, + { + "epoch": 12.795445302968686, + "grad_norm": 0.0005322498457133395, + "learning_rate": 1.2983513119951829e-08, + "loss": 0.0, + "step": 31464 + }, + { + "epoch": 12.795851972346483, + "grad_norm": 0.0066332434392732805, + "learning_rate": 1.2931951248258412e-08, + "loss": 0.0, + "step": 31465 + }, + { + "epoch": 12.796258641724279, + "grad_norm": 0.00024450323160214905, + "learning_rate": 1.288049189884455e-08, + "loss": 0.0, + "step": 31466 + }, + { + "epoch": 12.796665311102075, + "grad_norm": 0.07021604646145947, + "learning_rate": 1.2829135072239817e-08, + "loss": 0.0006, + "step": 31467 + }, + { + "epoch": 12.79707198047987, + "grad_norm": 0.01120877693554403, + "learning_rate": 1.2777880768969354e-08, + "loss": 0.0001, + "step": 31468 + }, + { + "epoch": 12.797478649857666, + "grad_norm": 0.0027312127632760346, + "learning_rate": 1.2726728989560511e-08, + "loss": 0.0, + "step": 31469 + }, + { + "epoch": 12.797885319235462, + "grad_norm": 0.00015508290700152834, + "learning_rate": 1.2675679734538426e-08, + "loss": 0.0, + "step": 31470 + }, + { + "epoch": 12.798291988613258, + "grad_norm": 0.0012237727984078766, + "learning_rate": 1.2624733004428235e-08, + "loss": 0.0, + "step": 31471 + }, + { + "epoch": 12.798698657991054, + "grad_norm": 0.002229152729518149, + "learning_rate": 1.2573888799750633e-08, + "loss": 0.0, + "step": 31472 + }, + { + "epoch": 12.79910532736885, + "grad_norm": 0.2129644634948185, + 
"learning_rate": 1.2523147121028534e-08, + "loss": 0.0012, + "step": 31473 + }, + { + "epoch": 12.799511996746645, + "grad_norm": 0.00013446280675290636, + "learning_rate": 1.2472507968782633e-08, + "loss": 0.0, + "step": 31474 + }, + { + "epoch": 12.79991866612444, + "grad_norm": 0.09760716952101177, + "learning_rate": 1.2421971343533623e-08, + "loss": 0.0009, + "step": 31475 + }, + { + "epoch": 12.800325335502237, + "grad_norm": 0.0025766893284723926, + "learning_rate": 1.2371537245799981e-08, + "loss": 0.0, + "step": 31476 + }, + { + "epoch": 12.800732004880032, + "grad_norm": 0.07100032224464985, + "learning_rate": 1.2321205676097959e-08, + "loss": 0.0004, + "step": 31477 + }, + { + "epoch": 12.801138674257828, + "grad_norm": 0.0005318432810471677, + "learning_rate": 1.227097663494714e-08, + "loss": 0.0, + "step": 31478 + }, + { + "epoch": 12.801545343635624, + "grad_norm": 0.017307466502834244, + "learning_rate": 1.222085012286045e-08, + "loss": 0.0001, + "step": 31479 + }, + { + "epoch": 12.80195201301342, + "grad_norm": 0.0002613641052491805, + "learning_rate": 1.2170826140353031e-08, + "loss": 0.0, + "step": 31480 + }, + { + "epoch": 12.802358682391215, + "grad_norm": 0.00017391309788017262, + "learning_rate": 1.2120904687940027e-08, + "loss": 0.0, + "step": 31481 + }, + { + "epoch": 12.802765351769011, + "grad_norm": 3.770372850155713e-05, + "learning_rate": 1.207108576613214e-08, + "loss": 0.0, + "step": 31482 + }, + { + "epoch": 12.803172021146807, + "grad_norm": 0.002556264165691678, + "learning_rate": 1.2021369375441184e-08, + "loss": 0.0, + "step": 31483 + }, + { + "epoch": 12.803578690524603, + "grad_norm": 0.004002507434424456, + "learning_rate": 1.1971755516377858e-08, + "loss": 0.0, + "step": 31484 + }, + { + "epoch": 12.803985359902398, + "grad_norm": 0.0002679407724444801, + "learning_rate": 1.1922244189451759e-08, + "loss": 0.0, + "step": 31485 + }, + { + "epoch": 12.804392029280194, + "grad_norm": 0.006226582893844894, + "learning_rate": 1.1872835395170256e-08, + "loss": 0.0001, + "step": 31486 + }, + { + "epoch": 12.804798698657992, + "grad_norm": 0.005308941054319651, + "learning_rate": 1.1823529134040723e-08, + "loss": 0.0001, + "step": 31487 + }, + { + "epoch": 12.805205368035788, + "grad_norm": 0.0010870367109560573, + "learning_rate": 1.177432540657053e-08, + "loss": 0.0, + "step": 31488 + }, + { + "epoch": 12.805612037413583, + "grad_norm": 0.004107384668368165, + "learning_rate": 1.172522421326261e-08, + "loss": 0.0, + "step": 31489 + }, + { + "epoch": 12.806018706791379, + "grad_norm": 0.0010093349086921263, + "learning_rate": 1.1676225554623221e-08, + "loss": 0.0, + "step": 31490 + }, + { + "epoch": 12.806425376169175, + "grad_norm": 0.0017127962989729876, + "learning_rate": 1.1627329431154189e-08, + "loss": 0.0, + "step": 31491 + }, + { + "epoch": 12.80683204554697, + "grad_norm": 0.004117696521044248, + "learning_rate": 1.157853584335733e-08, + "loss": 0.0, + "step": 31492 + }, + { + "epoch": 12.807238714924766, + "grad_norm": 0.005541241421085059, + "learning_rate": 1.1529844791734469e-08, + "loss": 0.0, + "step": 31493 + }, + { + "epoch": 12.807645384302562, + "grad_norm": 0.021640673184949912, + "learning_rate": 1.1481256276785202e-08, + "loss": 0.0001, + "step": 31494 + }, + { + "epoch": 12.808052053680358, + "grad_norm": 0.02922408975235098, + "learning_rate": 1.1432770299006912e-08, + "loss": 0.0002, + "step": 31495 + }, + { + "epoch": 12.808458723058154, + "grad_norm": 0.0005665432186669577, + "learning_rate": 1.1384386858898089e-08, + "loss": 0.0, 
+ "step": 31496 + }, + { + "epoch": 12.80886539243595, + "grad_norm": 0.010316009534886868, + "learning_rate": 1.1336105956957222e-08, + "loss": 0.0001, + "step": 31497 + }, + { + "epoch": 12.809272061813745, + "grad_norm": 0.09784628838110919, + "learning_rate": 1.128792759367725e-08, + "loss": 0.0007, + "step": 31498 + }, + { + "epoch": 12.809678731191541, + "grad_norm": 0.0026690533643328046, + "learning_rate": 1.1239851769554444e-08, + "loss": 0.0, + "step": 31499 + }, + { + "epoch": 12.810085400569337, + "grad_norm": 0.36683921284213544, + "learning_rate": 1.1191878485081743e-08, + "loss": 0.0039, + "step": 31500 + }, + { + "epoch": 12.810492069947133, + "grad_norm": 0.011851808519124018, + "learning_rate": 1.1144007740750973e-08, + "loss": 0.0001, + "step": 31501 + }, + { + "epoch": 12.810898739324928, + "grad_norm": 0.0014820701708776263, + "learning_rate": 1.1096239537055075e-08, + "loss": 0.0, + "step": 31502 + }, + { + "epoch": 12.811305408702724, + "grad_norm": 0.0021868515424502176, + "learning_rate": 1.1048573874482549e-08, + "loss": 0.0, + "step": 31503 + }, + { + "epoch": 12.81171207808052, + "grad_norm": 0.017790519532476054, + "learning_rate": 1.100101075352522e-08, + "loss": 0.0002, + "step": 31504 + }, + { + "epoch": 12.812118747458316, + "grad_norm": 0.0006805890123307687, + "learning_rate": 1.095355017466937e-08, + "loss": 0.0, + "step": 31505 + }, + { + "epoch": 12.812525416836113, + "grad_norm": 0.00046736776832794684, + "learning_rate": 1.0906192138402383e-08, + "loss": 0.0, + "step": 31506 + }, + { + "epoch": 12.812932086213909, + "grad_norm": 7.978702106664082e-06, + "learning_rate": 1.0858936645210539e-08, + "loss": 0.0, + "step": 31507 + }, + { + "epoch": 12.813338755591705, + "grad_norm": 0.016424982604903018, + "learning_rate": 1.0811783695579004e-08, + "loss": 0.0001, + "step": 31508 + }, + { + "epoch": 12.8137454249695, + "grad_norm": 6.268861995181418e-05, + "learning_rate": 1.0764733289992946e-08, + "loss": 0.0, + "step": 31509 + }, + { + "epoch": 12.814152094347296, + "grad_norm": 0.004098496408919803, + "learning_rate": 1.0717785428933092e-08, + "loss": 0.0, + "step": 31510 + }, + { + "epoch": 12.814558763725092, + "grad_norm": 0.0399137949059512, + "learning_rate": 1.06709401128835e-08, + "loss": 0.0003, + "step": 31511 + }, + { + "epoch": 12.814965433102888, + "grad_norm": 0.004115108654185985, + "learning_rate": 1.0624197342324893e-08, + "loss": 0.0, + "step": 31512 + }, + { + "epoch": 12.815372102480683, + "grad_norm": 0.0003075471102679943, + "learning_rate": 1.0577557117735781e-08, + "loss": 0.0, + "step": 31513 + }, + { + "epoch": 12.81577877185848, + "grad_norm": 0.07139158192688222, + "learning_rate": 1.053101943959578e-08, + "loss": 0.0004, + "step": 31514 + }, + { + "epoch": 12.816185441236275, + "grad_norm": 0.06509094195601708, + "learning_rate": 1.0484584308381173e-08, + "loss": 0.0006, + "step": 31515 + }, + { + "epoch": 12.81659211061407, + "grad_norm": 0.08019365306166704, + "learning_rate": 1.043825172457158e-08, + "loss": 0.0005, + "step": 31516 + }, + { + "epoch": 12.816998779991867, + "grad_norm": 0.005328201454070276, + "learning_rate": 1.0392021688639953e-08, + "loss": 0.0, + "step": 31517 + }, + { + "epoch": 12.817405449369662, + "grad_norm": 0.04249393122363296, + "learning_rate": 1.0345894201062578e-08, + "loss": 0.0005, + "step": 31518 + }, + { + "epoch": 12.817812118747458, + "grad_norm": 0.028734587113108924, + "learning_rate": 1.0299869262312411e-08, + "loss": 0.0002, + "step": 31519 + }, + { + "epoch": 12.818218788125254, + 
"grad_norm": 0.003176179302078278, + "learning_rate": 1.0253946872861298e-08, + "loss": 0.0, + "step": 31520 + }, + { + "epoch": 12.81862545750305, + "grad_norm": 0.0021737314496005286, + "learning_rate": 1.0208127033181082e-08, + "loss": 0.0, + "step": 31521 + }, + { + "epoch": 12.819032126880845, + "grad_norm": 0.009617248590820826, + "learning_rate": 1.0162409743742495e-08, + "loss": 0.0001, + "step": 31522 + }, + { + "epoch": 12.819438796258641, + "grad_norm": 0.003959716320999541, + "learning_rate": 1.0116795005014058e-08, + "loss": 0.0, + "step": 31523 + }, + { + "epoch": 12.819845465636437, + "grad_norm": 0.009957439032813653, + "learning_rate": 1.0071282817465388e-08, + "loss": 0.0001, + "step": 31524 + }, + { + "epoch": 12.820252135014233, + "grad_norm": 0.028468504193714838, + "learning_rate": 1.0025873181561674e-08, + "loss": 0.0003, + "step": 31525 + }, + { + "epoch": 12.820658804392028, + "grad_norm": 0.00037185156932974164, + "learning_rate": 9.980566097771427e-09, + "loss": 0.0, + "step": 31526 + }, + { + "epoch": 12.821065473769824, + "grad_norm": 0.001390787607011018, + "learning_rate": 9.935361566557611e-09, + "loss": 0.0, + "step": 31527 + }, + { + "epoch": 12.821472143147622, + "grad_norm": 0.0047847368818768575, + "learning_rate": 9.89025958838541e-09, + "loss": 0.0001, + "step": 31528 + }, + { + "epoch": 12.821878812525418, + "grad_norm": 0.001134211300175328, + "learning_rate": 9.845260163717785e-09, + "loss": 0.0, + "step": 31529 + }, + { + "epoch": 12.822285481903213, + "grad_norm": 3.20164893141765e-05, + "learning_rate": 9.80036329301659e-09, + "loss": 0.0, + "step": 31530 + }, + { + "epoch": 12.822692151281009, + "grad_norm": 0.002259095116758579, + "learning_rate": 9.75556897674257e-09, + "loss": 0.0, + "step": 31531 + }, + { + "epoch": 12.823098820658805, + "grad_norm": 0.017577367632085568, + "learning_rate": 9.710877215356463e-09, + "loss": 0.0002, + "step": 31532 + }, + { + "epoch": 12.8235054900366, + "grad_norm": 0.00767395411490733, + "learning_rate": 9.666288009314573e-09, + "loss": 0.0, + "step": 31533 + }, + { + "epoch": 12.823912159414396, + "grad_norm": 0.02768379581763124, + "learning_rate": 9.621801359077643e-09, + "loss": 0.0003, + "step": 31534 + }, + { + "epoch": 12.824318828792192, + "grad_norm": 0.00038820429981597544, + "learning_rate": 9.577417265099754e-09, + "loss": 0.0, + "step": 31535 + }, + { + "epoch": 12.824725498169988, + "grad_norm": 0.0042991452962492455, + "learning_rate": 9.533135727838315e-09, + "loss": 0.0, + "step": 31536 + }, + { + "epoch": 12.825132167547784, + "grad_norm": 0.006655757103769553, + "learning_rate": 9.488956747747414e-09, + "loss": 0.0001, + "step": 31537 + }, + { + "epoch": 12.82553883692558, + "grad_norm": 0.00010210183920302244, + "learning_rate": 9.444880325281125e-09, + "loss": 0.0, + "step": 31538 + }, + { + "epoch": 12.825945506303375, + "grad_norm": 0.004025922120113746, + "learning_rate": 9.400906460890202e-09, + "loss": 0.0, + "step": 31539 + }, + { + "epoch": 12.826352175681171, + "grad_norm": 0.07881671283025252, + "learning_rate": 9.357035155027616e-09, + "loss": 0.0005, + "step": 31540 + }, + { + "epoch": 12.826758845058967, + "grad_norm": 0.01297139918344459, + "learning_rate": 9.313266408143007e-09, + "loss": 0.0002, + "step": 31541 + }, + { + "epoch": 12.827165514436762, + "grad_norm": 0.0022079193212016643, + "learning_rate": 9.269600220686015e-09, + "loss": 0.0, + "step": 31542 + }, + { + "epoch": 12.827572183814558, + "grad_norm": 0.045287458543057775, + "learning_rate": 
9.22603659310517e-09, + "loss": 0.0002, + "step": 31543 + }, + { + "epoch": 12.827978853192354, + "grad_norm": 0.0006551099471533605, + "learning_rate": 9.182575525846782e-09, + "loss": 0.0, + "step": 31544 + }, + { + "epoch": 12.82838552257015, + "grad_norm": 0.0001778255366429642, + "learning_rate": 9.139217019358271e-09, + "loss": 0.0, + "step": 31545 + }, + { + "epoch": 12.828792191947946, + "grad_norm": 0.001405145612512962, + "learning_rate": 9.095961074082615e-09, + "loss": 0.0, + "step": 31546 + }, + { + "epoch": 12.829198861325743, + "grad_norm": 0.0005536999638121077, + "learning_rate": 9.052807690466125e-09, + "loss": 0.0, + "step": 31547 + }, + { + "epoch": 12.829605530703539, + "grad_norm": 0.000715457949885072, + "learning_rate": 9.00975686895178e-09, + "loss": 0.0, + "step": 31548 + }, + { + "epoch": 12.830012200081335, + "grad_norm": 0.010266157495057267, + "learning_rate": 8.966808609979228e-09, + "loss": 0.0001, + "step": 31549 + }, + { + "epoch": 12.83041886945913, + "grad_norm": 0.009774834234686974, + "learning_rate": 8.923962913991447e-09, + "loss": 0.0001, + "step": 31550 + }, + { + "epoch": 12.830825538836926, + "grad_norm": 0.00014893440580542814, + "learning_rate": 8.881219781428085e-09, + "loss": 0.0, + "step": 31551 + }, + { + "epoch": 12.831232208214722, + "grad_norm": 0.02844205343317533, + "learning_rate": 8.838579212726573e-09, + "loss": 0.0003, + "step": 31552 + }, + { + "epoch": 12.831638877592518, + "grad_norm": 0.006838985216615226, + "learning_rate": 8.796041208325445e-09, + "loss": 0.0001, + "step": 31553 + }, + { + "epoch": 12.832045546970313, + "grad_norm": 0.17216650147107457, + "learning_rate": 8.753605768662132e-09, + "loss": 0.0017, + "step": 31554 + }, + { + "epoch": 12.83245221634811, + "grad_norm": 0.01096767363269455, + "learning_rate": 8.71127289417073e-09, + "loss": 0.0001, + "step": 31555 + }, + { + "epoch": 12.832858885725905, + "grad_norm": 2.9878052255112964e-06, + "learning_rate": 8.669042585286447e-09, + "loss": 0.0, + "step": 31556 + }, + { + "epoch": 12.8332655551037, + "grad_norm": 0.010986434379660792, + "learning_rate": 8.62691484244449e-09, + "loss": 0.0001, + "step": 31557 + }, + { + "epoch": 12.833672224481496, + "grad_norm": 0.1308823949148304, + "learning_rate": 8.584889666074514e-09, + "loss": 0.0008, + "step": 31558 + }, + { + "epoch": 12.834078893859292, + "grad_norm": 0.0033911638510525015, + "learning_rate": 8.542967056609508e-09, + "loss": 0.0, + "step": 31559 + }, + { + "epoch": 12.834485563237088, + "grad_norm": 0.0009149755910351996, + "learning_rate": 8.501147014479128e-09, + "loss": 0.0, + "step": 31560 + }, + { + "epoch": 12.834892232614884, + "grad_norm": 0.9816642040593139, + "learning_rate": 8.45942954011414e-09, + "loss": 0.0103, + "step": 31561 + }, + { + "epoch": 12.83529890199268, + "grad_norm": 4.539424590877863e-05, + "learning_rate": 8.417814633940868e-09, + "loss": 0.0, + "step": 31562 + }, + { + "epoch": 12.835705571370475, + "grad_norm": 0.01172829054781836, + "learning_rate": 8.376302296387862e-09, + "loss": 0.0001, + "step": 31563 + }, + { + "epoch": 12.836112240748271, + "grad_norm": 0.028187371267306048, + "learning_rate": 8.334892527880335e-09, + "loss": 0.0002, + "step": 31564 + }, + { + "epoch": 12.836518910126067, + "grad_norm": 3.925412927990251e-05, + "learning_rate": 8.293585328843502e-09, + "loss": 0.0, + "step": 31565 + }, + { + "epoch": 12.836925579503863, + "grad_norm": 0.0008690127385877659, + "learning_rate": 8.252380699702578e-09, + "loss": 0.0, + "step": 31566 + }, + { + "epoch": 
12.837332248881658, + "grad_norm": 0.01948081156767142, + "learning_rate": 8.21127864087945e-09, + "loss": 0.0002, + "step": 31567 + }, + { + "epoch": 12.837738918259454, + "grad_norm": 0.00047693925507902264, + "learning_rate": 8.170279152797111e-09, + "loss": 0.0, + "step": 31568 + }, + { + "epoch": 12.838145587637252, + "grad_norm": 0.13658718134513337, + "learning_rate": 8.129382235874117e-09, + "loss": 0.0016, + "step": 31569 + }, + { + "epoch": 12.838552257015047, + "grad_norm": 0.00449581613112698, + "learning_rate": 8.088587890532352e-09, + "loss": 0.0, + "step": 31570 + }, + { + "epoch": 12.838958926392843, + "grad_norm": 0.014822036674086202, + "learning_rate": 8.047896117190369e-09, + "loss": 0.0001, + "step": 31571 + }, + { + "epoch": 12.839365595770639, + "grad_norm": 0.04407539588769229, + "learning_rate": 8.007306916265612e-09, + "loss": 0.0003, + "step": 31572 + }, + { + "epoch": 12.839772265148435, + "grad_norm": 0.06301161411774894, + "learning_rate": 7.966820288174414e-09, + "loss": 0.0006, + "step": 31573 + }, + { + "epoch": 12.84017893452623, + "grad_norm": 0.017450787765241758, + "learning_rate": 7.926436233333112e-09, + "loss": 0.0001, + "step": 31574 + }, + { + "epoch": 12.840585603904026, + "grad_norm": 0.009594998024112587, + "learning_rate": 7.886154752155817e-09, + "loss": 0.0001, + "step": 31575 + }, + { + "epoch": 12.840992273281822, + "grad_norm": 0.0812218494008387, + "learning_rate": 7.84597584505553e-09, + "loss": 0.0006, + "step": 31576 + }, + { + "epoch": 12.841398942659618, + "grad_norm": 0.00019009596386725537, + "learning_rate": 7.805899512445258e-09, + "loss": 0.0, + "step": 31577 + }, + { + "epoch": 12.841805612037414, + "grad_norm": 0.003197930828974685, + "learning_rate": 7.765925754736891e-09, + "loss": 0.0, + "step": 31578 + }, + { + "epoch": 12.84221228141521, + "grad_norm": 0.006450506027142869, + "learning_rate": 7.726054572340103e-09, + "loss": 0.0001, + "step": 31579 + }, + { + "epoch": 12.842618950793005, + "grad_norm": 0.06095048523808228, + "learning_rate": 7.686285965664564e-09, + "loss": 0.0007, + "step": 31580 + }, + { + "epoch": 12.8430256201708, + "grad_norm": 1.2860126102775265e-05, + "learning_rate": 7.646619935117726e-09, + "loss": 0.0, + "step": 31581 + }, + { + "epoch": 12.843432289548597, + "grad_norm": 0.0028833742547558546, + "learning_rate": 7.607056481108154e-09, + "loss": 0.0, + "step": 31582 + }, + { + "epoch": 12.843838958926392, + "grad_norm": 0.008756467926049429, + "learning_rate": 7.567595604041079e-09, + "loss": 0.0001, + "step": 31583 + }, + { + "epoch": 12.844245628304188, + "grad_norm": 0.02307904026662143, + "learning_rate": 7.528237304320618e-09, + "loss": 0.0002, + "step": 31584 + }, + { + "epoch": 12.844652297681984, + "grad_norm": 0.030802699898682964, + "learning_rate": 7.488981582353117e-09, + "loss": 0.0003, + "step": 31585 + }, + { + "epoch": 12.84505896705978, + "grad_norm": 0.00012521532020908486, + "learning_rate": 7.449828438540474e-09, + "loss": 0.0, + "step": 31586 + }, + { + "epoch": 12.845465636437575, + "grad_norm": 0.010454450013191676, + "learning_rate": 7.410777873283481e-09, + "loss": 0.0001, + "step": 31587 + }, + { + "epoch": 12.845872305815373, + "grad_norm": 0.17408176871177175, + "learning_rate": 7.3718298869840386e-09, + "loss": 0.0008, + "step": 31588 + }, + { + "epoch": 12.846278975193169, + "grad_norm": 0.013219456300039796, + "learning_rate": 7.332984480042937e-09, + "loss": 0.0001, + "step": 31589 + }, + { + "epoch": 12.846685644570965, + "grad_norm": 0.006935461419966931, + 
"learning_rate": 7.294241652857637e-09, + "loss": 0.0, + "step": 31590 + }, + { + "epoch": 12.84709231394876, + "grad_norm": 0.01670262158812718, + "learning_rate": 7.2556014058255966e-09, + "loss": 0.0001, + "step": 31591 + }, + { + "epoch": 12.847498983326556, + "grad_norm": 0.004277817354323348, + "learning_rate": 7.2170637393442766e-09, + "loss": 0.0, + "step": 31592 + }, + { + "epoch": 12.847905652704352, + "grad_norm": 0.15311869264167624, + "learning_rate": 7.178628653810027e-09, + "loss": 0.0011, + "step": 31593 + }, + { + "epoch": 12.848312322082148, + "grad_norm": 0.013748422814163469, + "learning_rate": 7.140296149614756e-09, + "loss": 0.0001, + "step": 31594 + }, + { + "epoch": 12.848718991459943, + "grad_norm": 6.578335577865357e-05, + "learning_rate": 7.1020662271548135e-09, + "loss": 0.0, + "step": 31595 + }, + { + "epoch": 12.849125660837739, + "grad_norm": 0.004849728216041827, + "learning_rate": 7.063938886820998e-09, + "loss": 0.0001, + "step": 31596 + }, + { + "epoch": 12.849532330215535, + "grad_norm": 0.0010481560620985283, + "learning_rate": 7.025914129006328e-09, + "loss": 0.0, + "step": 31597 + }, + { + "epoch": 12.84993899959333, + "grad_norm": 0.00014890928402239268, + "learning_rate": 6.987991954098272e-09, + "loss": 0.0, + "step": 31598 + }, + { + "epoch": 12.850345668971126, + "grad_norm": 0.06960880099811526, + "learning_rate": 6.950172362488739e-09, + "loss": 0.0005, + "step": 31599 + }, + { + "epoch": 12.850752338348922, + "grad_norm": 0.001381118584399173, + "learning_rate": 6.9124553545651954e-09, + "loss": 0.0, + "step": 31600 + }, + { + "epoch": 12.851159007726718, + "grad_norm": 0.00044248714269217495, + "learning_rate": 6.8748409307139996e-09, + "loss": 0.0, + "step": 31601 + }, + { + "epoch": 12.851565677104514, + "grad_norm": 0.027467528447789057, + "learning_rate": 6.83732909132262e-09, + "loss": 0.0002, + "step": 31602 + }, + { + "epoch": 12.85197234648231, + "grad_norm": 0.0019019802428089083, + "learning_rate": 6.799919836776303e-09, + "loss": 0.0, + "step": 31603 + }, + { + "epoch": 12.852379015860105, + "grad_norm": 0.006386691395608964, + "learning_rate": 6.762613167456966e-09, + "loss": 0.0, + "step": 31604 + }, + { + "epoch": 12.852785685237901, + "grad_norm": 0.2107195013570512, + "learning_rate": 6.725409083748746e-09, + "loss": 0.002, + "step": 31605 + }, + { + "epoch": 12.853192354615697, + "grad_norm": 0.04104942353312549, + "learning_rate": 6.6883075860346705e-09, + "loss": 0.0003, + "step": 31606 + }, + { + "epoch": 12.853599023993493, + "grad_norm": 0.058784520262472995, + "learning_rate": 6.651308674694434e-09, + "loss": 0.0002, + "step": 31607 + }, + { + "epoch": 12.854005693371288, + "grad_norm": 0.00016287146600779765, + "learning_rate": 6.614412350107735e-09, + "loss": 0.0, + "step": 31608 + }, + { + "epoch": 12.854412362749084, + "grad_norm": 0.00017616177369897154, + "learning_rate": 6.577618612654269e-09, + "loss": 0.0, + "step": 31609 + }, + { + "epoch": 12.854819032126882, + "grad_norm": 0.06106169039012634, + "learning_rate": 6.5409274627104e-09, + "loss": 0.0005, + "step": 31610 + }, + { + "epoch": 12.855225701504677, + "grad_norm": 0.005255139520291005, + "learning_rate": 6.504338900653606e-09, + "loss": 0.0, + "step": 31611 + }, + { + "epoch": 12.855632370882473, + "grad_norm": 0.00784525066416115, + "learning_rate": 6.467852926859142e-09, + "loss": 0.0001, + "step": 31612 + }, + { + "epoch": 12.856039040260269, + "grad_norm": 0.00038619223791755446, + "learning_rate": 6.431469541702262e-09, + "loss": 0.0, + "step": 
31613 + }, + { + "epoch": 12.856445709638065, + "grad_norm": 0.0004987440687731703, + "learning_rate": 6.395188745556002e-09, + "loss": 0.0, + "step": 31614 + }, + { + "epoch": 12.85685237901586, + "grad_norm": 0.0005498367854064461, + "learning_rate": 6.359010538793398e-09, + "loss": 0.0, + "step": 31615 + }, + { + "epoch": 12.857259048393656, + "grad_norm": 0.002330012167296335, + "learning_rate": 6.322934921784152e-09, + "loss": 0.0, + "step": 31616 + }, + { + "epoch": 12.857665717771452, + "grad_norm": 0.031411207201435566, + "learning_rate": 6.28696189490019e-09, + "loss": 0.0003, + "step": 31617 + }, + { + "epoch": 12.858072387149248, + "grad_norm": 0.012341598304179378, + "learning_rate": 6.2510914585101055e-09, + "loss": 0.0001, + "step": 31618 + }, + { + "epoch": 12.858479056527043, + "grad_norm": 0.0009865804464779708, + "learning_rate": 6.215323612982493e-09, + "loss": 0.0, + "step": 31619 + }, + { + "epoch": 12.85888572590484, + "grad_norm": 0.001278884182996999, + "learning_rate": 6.179658358684837e-09, + "loss": 0.0, + "step": 31620 + }, + { + "epoch": 12.859292395282635, + "grad_norm": 0.0064419403774988145, + "learning_rate": 6.1440956959824e-09, + "loss": 0.0001, + "step": 31621 + }, + { + "epoch": 12.85969906466043, + "grad_norm": 0.001372119639443243, + "learning_rate": 6.108635625240444e-09, + "loss": 0.0, + "step": 31622 + }, + { + "epoch": 12.860105734038227, + "grad_norm": 5.87362626421681e-05, + "learning_rate": 6.073278146823125e-09, + "loss": 0.0, + "step": 31623 + }, + { + "epoch": 12.860512403416022, + "grad_norm": 0.21845846511269432, + "learning_rate": 6.038023261093484e-09, + "loss": 0.0018, + "step": 31624 + }, + { + "epoch": 12.860919072793818, + "grad_norm": 0.004260077577882227, + "learning_rate": 6.002870968413454e-09, + "loss": 0.0, + "step": 31625 + }, + { + "epoch": 12.861325742171614, + "grad_norm": 0.08958314651528985, + "learning_rate": 5.967821269143859e-09, + "loss": 0.0009, + "step": 31626 + }, + { + "epoch": 12.86173241154941, + "grad_norm": 0.012852989911554537, + "learning_rate": 5.932874163644408e-09, + "loss": 0.0001, + "step": 31627 + }, + { + "epoch": 12.862139080927205, + "grad_norm": 0.0035656508202191665, + "learning_rate": 5.8980296522737065e-09, + "loss": 0.0, + "step": 31628 + }, + { + "epoch": 12.862545750305003, + "grad_norm": 0.0019626450075699736, + "learning_rate": 5.863287735390355e-09, + "loss": 0.0, + "step": 31629 + }, + { + "epoch": 12.862952419682799, + "grad_norm": 3.281385412406317e-06, + "learning_rate": 5.828648413349625e-09, + "loss": 0.0, + "step": 31630 + }, + { + "epoch": 12.863359089060594, + "grad_norm": 0.02849397687653668, + "learning_rate": 5.794111686507897e-09, + "loss": 0.0002, + "step": 31631 + }, + { + "epoch": 12.86376575843839, + "grad_norm": 0.00025685675598791335, + "learning_rate": 5.759677555219334e-09, + "loss": 0.0, + "step": 31632 + }, + { + "epoch": 12.864172427816186, + "grad_norm": 0.0017819722565999255, + "learning_rate": 5.725346019838096e-09, + "loss": 0.0, + "step": 31633 + }, + { + "epoch": 12.864579097193982, + "grad_norm": 0.0005703644808382227, + "learning_rate": 5.691117080716124e-09, + "loss": 0.0, + "step": 31634 + }, + { + "epoch": 12.864985766571778, + "grad_norm": 4.8367678545679286e-05, + "learning_rate": 5.656990738205359e-09, + "loss": 0.0, + "step": 31635 + }, + { + "epoch": 12.865392435949573, + "grad_norm": 0.00029756968899614445, + "learning_rate": 5.622966992654411e-09, + "loss": 0.0, + "step": 31636 + }, + { + "epoch": 12.865799105327369, + "grad_norm": 
0.0007530908700511047, + "learning_rate": 5.58904584441522e-09, + "loss": 0.0, + "step": 31637 + }, + { + "epoch": 12.866205774705165, + "grad_norm": 0.00012866378406946777, + "learning_rate": 5.555227293833065e-09, + "loss": 0.0, + "step": 31638 + }, + { + "epoch": 12.86661244408296, + "grad_norm": 0.011881971869802008, + "learning_rate": 5.521511341257668e-09, + "loss": 0.0001, + "step": 31639 + }, + { + "epoch": 12.867019113460756, + "grad_norm": 0.03943669022957747, + "learning_rate": 5.487897987034308e-09, + "loss": 0.0002, + "step": 31640 + }, + { + "epoch": 12.867425782838552, + "grad_norm": 0.0015634222381666427, + "learning_rate": 5.454387231507152e-09, + "loss": 0.0, + "step": 31641 + }, + { + "epoch": 12.867832452216348, + "grad_norm": 0.050104179004156646, + "learning_rate": 5.4209790750203715e-09, + "loss": 0.0004, + "step": 31642 + }, + { + "epoch": 12.868239121594144, + "grad_norm": 0.007623636868179772, + "learning_rate": 5.387673517918135e-09, + "loss": 0.0001, + "step": 31643 + }, + { + "epoch": 12.86864579097194, + "grad_norm": 0.002802999497816289, + "learning_rate": 5.354470560541281e-09, + "loss": 0.0, + "step": 31644 + }, + { + "epoch": 12.869052460349735, + "grad_norm": 0.03491884569118294, + "learning_rate": 5.321370203231757e-09, + "loss": 0.0002, + "step": 31645 + }, + { + "epoch": 12.869459129727531, + "grad_norm": 0.006798145572260356, + "learning_rate": 5.288372446328183e-09, + "loss": 0.0, + "step": 31646 + }, + { + "epoch": 12.869865799105327, + "grad_norm": 0.009179854007974879, + "learning_rate": 5.255477290169176e-09, + "loss": 0.0001, + "step": 31647 + }, + { + "epoch": 12.870272468483122, + "grad_norm": 0.003009375021930165, + "learning_rate": 5.222684735093353e-09, + "loss": 0.0, + "step": 31648 + }, + { + "epoch": 12.870679137860918, + "grad_norm": 9.776786619159163e-05, + "learning_rate": 5.189994781437113e-09, + "loss": 0.0, + "step": 31649 + }, + { + "epoch": 12.871085807238714, + "grad_norm": 0.010567891658421567, + "learning_rate": 5.157407429535744e-09, + "loss": 0.0001, + "step": 31650 + }, + { + "epoch": 12.871492476616512, + "grad_norm": 0.0013517651681530234, + "learning_rate": 5.124922679723421e-09, + "loss": 0.0, + "step": 31651 + }, + { + "epoch": 12.871899145994307, + "grad_norm": 0.004554263949223816, + "learning_rate": 5.092540532334323e-09, + "loss": 0.0, + "step": 31652 + }, + { + "epoch": 12.872305815372103, + "grad_norm": 9.817434193341129e-05, + "learning_rate": 5.060260987701515e-09, + "loss": 0.0, + "step": 31653 + }, + { + "epoch": 12.872712484749899, + "grad_norm": 0.0007314534262613381, + "learning_rate": 5.028084046154735e-09, + "loss": 0.0, + "step": 31654 + }, + { + "epoch": 12.873119154127695, + "grad_norm": 2.9337979268718314e-05, + "learning_rate": 4.996009708024829e-09, + "loss": 0.0, + "step": 31655 + }, + { + "epoch": 12.87352582350549, + "grad_norm": 6.195304804569039e-05, + "learning_rate": 4.964037973641534e-09, + "loss": 0.0, + "step": 31656 + }, + { + "epoch": 12.873932492883286, + "grad_norm": 0.0007385765104916302, + "learning_rate": 4.9321688433323634e-09, + "loss": 0.0, + "step": 31657 + }, + { + "epoch": 12.874339162261082, + "grad_norm": 0.05503156300485918, + "learning_rate": 4.900402317424835e-09, + "loss": 0.0004, + "step": 31658 + }, + { + "epoch": 12.874745831638878, + "grad_norm": 0.008941165396406436, + "learning_rate": 4.868738396245354e-09, + "loss": 0.0, + "step": 31659 + }, + { + "epoch": 12.875152501016673, + "grad_norm": 0.03431302401263035, + "learning_rate": 4.837177080119215e-09, + "loss": 
0.0003, + "step": 31660 + }, + { + "epoch": 12.87555917039447, + "grad_norm": 0.00012158890578335274, + "learning_rate": 4.805718369369494e-09, + "loss": 0.0, + "step": 31661 + }, + { + "epoch": 12.875965839772265, + "grad_norm": 0.0002957755705097362, + "learning_rate": 4.774362264320376e-09, + "loss": 0.0, + "step": 31662 + }, + { + "epoch": 12.87637250915006, + "grad_norm": 0.020219719352982453, + "learning_rate": 4.743108765291604e-09, + "loss": 0.0001, + "step": 31663 + }, + { + "epoch": 12.876779178527856, + "grad_norm": 0.031773887795116904, + "learning_rate": 4.711957872606254e-09, + "loss": 0.0002, + "step": 31664 + }, + { + "epoch": 12.877185847905652, + "grad_norm": 0.0007329677756454155, + "learning_rate": 4.68090958658296e-09, + "loss": 0.0, + "step": 31665 + }, + { + "epoch": 12.877592517283448, + "grad_norm": 0.004173185689556679, + "learning_rate": 4.6499639075403555e-09, + "loss": 0.0, + "step": 31666 + }, + { + "epoch": 12.877999186661244, + "grad_norm": 0.0023076628605585648, + "learning_rate": 4.619120835795965e-09, + "loss": 0.0, + "step": 31667 + }, + { + "epoch": 12.87840585603904, + "grad_norm": 0.09691872151830129, + "learning_rate": 4.588380371667311e-09, + "loss": 0.0006, + "step": 31668 + }, + { + "epoch": 12.878812525416835, + "grad_norm": 0.008897010706697498, + "learning_rate": 4.5577425154696985e-09, + "loss": 0.0, + "step": 31669 + }, + { + "epoch": 12.879219194794633, + "grad_norm": 0.22475643556012884, + "learning_rate": 4.527207267518429e-09, + "loss": 0.0021, + "step": 31670 + }, + { + "epoch": 12.879625864172429, + "grad_norm": 0.02765317987119107, + "learning_rate": 4.496774628124367e-09, + "loss": 0.0002, + "step": 31671 + }, + { + "epoch": 12.880032533550224, + "grad_norm": 0.009943268130006034, + "learning_rate": 4.466444597602814e-09, + "loss": 0.0001, + "step": 31672 + }, + { + "epoch": 12.88043920292802, + "grad_norm": 0.02449563101298627, + "learning_rate": 4.436217176262414e-09, + "loss": 0.0002, + "step": 31673 + }, + { + "epoch": 12.880845872305816, + "grad_norm": 0.0006095172193301248, + "learning_rate": 4.406092364416248e-09, + "loss": 0.0, + "step": 31674 + }, + { + "epoch": 12.881252541683612, + "grad_norm": 0.015610605828509047, + "learning_rate": 4.376070162371848e-09, + "loss": 0.0001, + "step": 31675 + }, + { + "epoch": 12.881659211061407, + "grad_norm": 0.11043357381334928, + "learning_rate": 4.346150570437857e-09, + "loss": 0.0009, + "step": 31676 + }, + { + "epoch": 12.882065880439203, + "grad_norm": 0.001611354839093581, + "learning_rate": 4.316333588920696e-09, + "loss": 0.0, + "step": 31677 + }, + { + "epoch": 12.882472549816999, + "grad_norm": 0.00012836883697409684, + "learning_rate": 4.286619218127896e-09, + "loss": 0.0, + "step": 31678 + }, + { + "epoch": 12.882879219194795, + "grad_norm": 0.00035350304318621645, + "learning_rate": 4.25700745836255e-09, + "loss": 0.0, + "step": 31679 + }, + { + "epoch": 12.88328588857259, + "grad_norm": 0.0025166615962622873, + "learning_rate": 4.227498309931077e-09, + "loss": 0.0, + "step": 31680 + }, + { + "epoch": 12.883692557950386, + "grad_norm": 0.006331912874979967, + "learning_rate": 4.198091773134349e-09, + "loss": 0.0, + "step": 31681 + }, + { + "epoch": 12.884099227328182, + "grad_norm": 0.05515701590891042, + "learning_rate": 4.168787848275458e-09, + "loss": 0.0005, + "step": 31682 + }, + { + "epoch": 12.884505896705978, + "grad_norm": 0.009600112673347047, + "learning_rate": 4.139586535654161e-09, + "loss": 0.0001, + "step": 31683 + }, + { + "epoch": 12.884912566083774, + 
"grad_norm": 0.0021084880023185515, + "learning_rate": 4.110487835571331e-09, + "loss": 0.0, + "step": 31684 + }, + { + "epoch": 12.88531923546157, + "grad_norm": 0.01956364355831378, + "learning_rate": 4.081491748325617e-09, + "loss": 0.0002, + "step": 31685 + }, + { + "epoch": 12.885725904839365, + "grad_norm": 0.0006340380116085809, + "learning_rate": 4.0525982742134485e-09, + "loss": 0.0, + "step": 31686 + }, + { + "epoch": 12.88613257421716, + "grad_norm": 0.011401389452182772, + "learning_rate": 4.0238074135323654e-09, + "loss": 0.0001, + "step": 31687 + }, + { + "epoch": 12.886539243594957, + "grad_norm": 0.0022031230911178155, + "learning_rate": 3.9951191665776874e-09, + "loss": 0.0, + "step": 31688 + }, + { + "epoch": 12.886945912972752, + "grad_norm": 0.037882903996977885, + "learning_rate": 3.966533533643624e-09, + "loss": 0.0004, + "step": 31689 + }, + { + "epoch": 12.887352582350548, + "grad_norm": 0.17894691599553328, + "learning_rate": 3.938050515024383e-09, + "loss": 0.0007, + "step": 31690 + }, + { + "epoch": 12.887759251728344, + "grad_norm": 0.0005435424981067338, + "learning_rate": 3.909670111011954e-09, + "loss": 0.0, + "step": 31691 + }, + { + "epoch": 12.888165921106141, + "grad_norm": 0.02511254659020991, + "learning_rate": 3.881392321898325e-09, + "loss": 0.0002, + "step": 31692 + }, + { + "epoch": 12.888572590483937, + "grad_norm": 0.010047826647246611, + "learning_rate": 3.853217147972155e-09, + "loss": 0.0001, + "step": 31693 + }, + { + "epoch": 12.888979259861733, + "grad_norm": 0.006057300860261247, + "learning_rate": 3.825144589524321e-09, + "loss": 0.0, + "step": 31694 + }, + { + "epoch": 12.889385929239529, + "grad_norm": 0.015503808727048537, + "learning_rate": 3.797174646842372e-09, + "loss": 0.0001, + "step": 31695 + }, + { + "epoch": 12.889792598617325, + "grad_norm": 0.06829011867528037, + "learning_rate": 3.769307320212745e-09, + "loss": 0.0004, + "step": 31696 + }, + { + "epoch": 12.89019926799512, + "grad_norm": 0.00022578897535162652, + "learning_rate": 3.741542609922988e-09, + "loss": 0.0, + "step": 31697 + }, + { + "epoch": 12.890605937372916, + "grad_norm": 0.0004316054485053626, + "learning_rate": 3.7138805162573178e-09, + "loss": 0.0, + "step": 31698 + }, + { + "epoch": 12.891012606750712, + "grad_norm": 7.403166400045877e-05, + "learning_rate": 3.6863210394988413e-09, + "loss": 0.0, + "step": 31699 + }, + { + "epoch": 12.891419276128508, + "grad_norm": 0.00011386956614369161, + "learning_rate": 3.6588641799317757e-09, + "loss": 0.0, + "step": 31700 + }, + { + "epoch": 12.891825945506303, + "grad_norm": 0.0036488967219975407, + "learning_rate": 3.631509937838118e-09, + "loss": 0.0, + "step": 31701 + }, + { + "epoch": 12.892232614884099, + "grad_norm": 0.030739051055830486, + "learning_rate": 3.6042583134965336e-09, + "loss": 0.0001, + "step": 31702 + }, + { + "epoch": 12.892639284261895, + "grad_norm": 0.011750570929877024, + "learning_rate": 3.57710930719013e-09, + "loss": 0.0001, + "step": 31703 + }, + { + "epoch": 12.89304595363969, + "grad_norm": 0.0006765255994795654, + "learning_rate": 3.5500629191942415e-09, + "loss": 0.0, + "step": 31704 + }, + { + "epoch": 12.893452623017486, + "grad_norm": 0.01949728957478868, + "learning_rate": 3.5231191497886453e-09, + "loss": 0.0001, + "step": 31705 + }, + { + "epoch": 12.893859292395282, + "grad_norm": 0.06780185978702305, + "learning_rate": 3.4962779992497865e-09, + "loss": 0.0005, + "step": 31706 + }, + { + "epoch": 12.894265961773078, + "grad_norm": 0.020395170734958983, + "learning_rate": 
3.4695394678530005e-09, + "loss": 0.0002, + "step": 31707 + }, + { + "epoch": 12.894672631150874, + "grad_norm": 0.21310346248983955, + "learning_rate": 3.442903555871402e-09, + "loss": 0.0018, + "step": 31708 + }, + { + "epoch": 12.89507930052867, + "grad_norm": 0.0007362039019255788, + "learning_rate": 3.4163702635803266e-09, + "loss": 0.0, + "step": 31709 + }, + { + "epoch": 12.895485969906465, + "grad_norm": 0.04193424686671298, + "learning_rate": 3.3899395912517786e-09, + "loss": 0.0004, + "step": 31710 + }, + { + "epoch": 12.895892639284263, + "grad_norm": 0.01058546496122032, + "learning_rate": 3.3636115391555424e-09, + "loss": 0.0001, + "step": 31711 + }, + { + "epoch": 12.896299308662059, + "grad_norm": 0.03695065420972652, + "learning_rate": 3.3373861075636227e-09, + "loss": 0.0002, + "step": 31712 + }, + { + "epoch": 12.896705978039854, + "grad_norm": 0.0005242150526489829, + "learning_rate": 3.3112632967446934e-09, + "loss": 0.0, + "step": 31713 + }, + { + "epoch": 12.89711264741765, + "grad_norm": 0.03809193712686051, + "learning_rate": 3.285243106966318e-09, + "loss": 0.0002, + "step": 31714 + }, + { + "epoch": 12.897519316795446, + "grad_norm": 0.0028540456090128193, + "learning_rate": 3.259325538497171e-09, + "loss": 0.0, + "step": 31715 + }, + { + "epoch": 12.897925986173242, + "grad_norm": 0.002491426088708925, + "learning_rate": 3.2335105916014852e-09, + "loss": 0.0, + "step": 31716 + }, + { + "epoch": 12.898332655551037, + "grad_norm": 0.01720480576541328, + "learning_rate": 3.207798266545714e-09, + "loss": 0.0001, + "step": 31717 + }, + { + "epoch": 12.898739324928833, + "grad_norm": 0.0005903854476038129, + "learning_rate": 3.1821885635929805e-09, + "loss": 0.0, + "step": 31718 + }, + { + "epoch": 12.899145994306629, + "grad_norm": 0.037837828681491485, + "learning_rate": 3.1566814830064072e-09, + "loss": 0.0003, + "step": 31719 + }, + { + "epoch": 12.899552663684425, + "grad_norm": 0.010326591640645942, + "learning_rate": 3.1312770250480073e-09, + "loss": 0.0001, + "step": 31720 + }, + { + "epoch": 12.89995933306222, + "grad_norm": 0.00043014952898830737, + "learning_rate": 3.1059751899775726e-09, + "loss": 0.0, + "step": 31721 + }, + { + "epoch": 12.900366002440016, + "grad_norm": 0.043589379250853225, + "learning_rate": 3.0807759780560053e-09, + "loss": 0.0003, + "step": 31722 + }, + { + "epoch": 12.900772671817812, + "grad_norm": 0.00525650161784273, + "learning_rate": 3.055679389541988e-09, + "loss": 0.0, + "step": 31723 + }, + { + "epoch": 12.901179341195608, + "grad_norm": 0.04823979444784772, + "learning_rate": 3.0306854246919813e-09, + "loss": 0.0003, + "step": 31724 + }, + { + "epoch": 12.901586010573403, + "grad_norm": 2.0363676855781182e-05, + "learning_rate": 3.005794083763558e-09, + "loss": 0.0, + "step": 31725 + }, + { + "epoch": 12.9019926799512, + "grad_norm": 0.0020441412142942154, + "learning_rate": 2.9810053670131787e-09, + "loss": 0.0, + "step": 31726 + }, + { + "epoch": 12.902399349328995, + "grad_norm": 0.00117658462828789, + "learning_rate": 2.9563192746928647e-09, + "loss": 0.0, + "step": 31727 + }, + { + "epoch": 12.90280601870679, + "grad_norm": 0.0003357171585556259, + "learning_rate": 2.931735807057967e-09, + "loss": 0.0, + "step": 31728 + }, + { + "epoch": 12.903212688084587, + "grad_norm": 0.02093766863430237, + "learning_rate": 2.9072549643605062e-09, + "loss": 0.0002, + "step": 31729 + }, + { + "epoch": 12.903619357462382, + "grad_norm": 0.0007077919246666492, + "learning_rate": 2.882876746850283e-09, + "loss": 0.0, + "step": 31730 + 
}, + { + "epoch": 12.904026026840178, + "grad_norm": 0.015393649628365686, + "learning_rate": 2.858601154779317e-09, + "loss": 0.0001, + "step": 31731 + }, + { + "epoch": 12.904432696217976, + "grad_norm": 0.010477371805750202, + "learning_rate": 2.834428188397409e-09, + "loss": 0.0001, + "step": 31732 + }, + { + "epoch": 12.904839365595771, + "grad_norm": 0.006737007948714234, + "learning_rate": 2.8103578479499184e-09, + "loss": 0.0001, + "step": 31733 + }, + { + "epoch": 12.905246034973567, + "grad_norm": 0.00812404914779722, + "learning_rate": 2.786390133686645e-09, + "loss": 0.0001, + "step": 31734 + }, + { + "epoch": 12.905652704351363, + "grad_norm": 0.005059452065235459, + "learning_rate": 2.762525045851838e-09, + "loss": 0.0001, + "step": 31735 + }, + { + "epoch": 12.906059373729159, + "grad_norm": 0.0032083058821917765, + "learning_rate": 2.7387625846919675e-09, + "loss": 0.0, + "step": 31736 + }, + { + "epoch": 12.906466043106954, + "grad_norm": 0.0005154513182170725, + "learning_rate": 2.7151027504501714e-09, + "loss": 0.0, + "step": 31737 + }, + { + "epoch": 12.90687271248475, + "grad_norm": 0.020539636182602, + "learning_rate": 2.691545543368479e-09, + "loss": 0.0002, + "step": 31738 + }, + { + "epoch": 12.907279381862546, + "grad_norm": 0.029942927009078107, + "learning_rate": 2.668090963690029e-09, + "loss": 0.0002, + "step": 31739 + }, + { + "epoch": 12.907686051240342, + "grad_norm": 0.05680247810887519, + "learning_rate": 2.6447390116557394e-09, + "loss": 0.0004, + "step": 31740 + }, + { + "epoch": 12.908092720618138, + "grad_norm": 0.019506661906730585, + "learning_rate": 2.6214896875043083e-09, + "loss": 0.0001, + "step": 31741 + }, + { + "epoch": 12.908499389995933, + "grad_norm": 0.01662354958534824, + "learning_rate": 2.5983429914755444e-09, + "loss": 0.0002, + "step": 31742 + }, + { + "epoch": 12.908906059373729, + "grad_norm": 0.025876217362267123, + "learning_rate": 2.5752989238059243e-09, + "loss": 0.0001, + "step": 31743 + }, + { + "epoch": 12.909312728751525, + "grad_norm": 0.016409458707653205, + "learning_rate": 2.5523574847319264e-09, + "loss": 0.0002, + "step": 31744 + }, + { + "epoch": 12.90971939812932, + "grad_norm": 0.005122726379459651, + "learning_rate": 2.529518674490028e-09, + "loss": 0.0, + "step": 31745 + }, + { + "epoch": 12.910126067507116, + "grad_norm": 0.0026772139750953425, + "learning_rate": 2.506782493314486e-09, + "loss": 0.0, + "step": 31746 + }, + { + "epoch": 12.910532736884912, + "grad_norm": 0.0020911509482280776, + "learning_rate": 2.4841489414373365e-09, + "loss": 0.0, + "step": 31747 + }, + { + "epoch": 12.910939406262708, + "grad_norm": 0.0033494217320145484, + "learning_rate": 2.4616180190928375e-09, + "loss": 0.0, + "step": 31748 + }, + { + "epoch": 12.911346075640504, + "grad_norm": 0.00024259009564141205, + "learning_rate": 2.4391897265119146e-09, + "loss": 0.0, + "step": 31749 + }, + { + "epoch": 12.9117527450183, + "grad_norm": 0.001037828795382649, + "learning_rate": 2.4168640639243845e-09, + "loss": 0.0, + "step": 31750 + }, + { + "epoch": 12.912159414396095, + "grad_norm": 0.16672161614186665, + "learning_rate": 2.394641031558953e-09, + "loss": 0.0013, + "step": 31751 + }, + { + "epoch": 12.912566083773893, + "grad_norm": 0.05322313195719207, + "learning_rate": 2.372520629644326e-09, + "loss": 0.0005, + "step": 31752 + }, + { + "epoch": 12.912972753151688, + "grad_norm": 2.6746665909192, + "learning_rate": 2.3505028584069887e-09, + "loss": 0.0298, + "step": 31753 + }, + { + "epoch": 12.913379422529484, + "grad_norm": 
0.005838481629599795, + "learning_rate": 2.328587718073427e-09, + "loss": 0.0001, + "step": 31754 + }, + { + "epoch": 12.91378609190728, + "grad_norm": 0.03514010258128004, + "learning_rate": 2.3067752088690164e-09, + "loss": 0.0002, + "step": 31755 + }, + { + "epoch": 12.914192761285076, + "grad_norm": 0.011188541617390585, + "learning_rate": 2.2850653310180214e-09, + "loss": 0.0001, + "step": 31756 + }, + { + "epoch": 12.914599430662872, + "grad_norm": 0.0051990202830331025, + "learning_rate": 2.2634580847413766e-09, + "loss": 0.0, + "step": 31757 + }, + { + "epoch": 12.915006100040667, + "grad_norm": 0.023695750151527792, + "learning_rate": 2.241953470263347e-09, + "loss": 0.0003, + "step": 31758 + }, + { + "epoch": 12.915412769418463, + "grad_norm": 0.0004006309141181214, + "learning_rate": 2.2205514878026468e-09, + "loss": 0.0, + "step": 31759 + }, + { + "epoch": 12.915819438796259, + "grad_norm": 0.05984040628666849, + "learning_rate": 2.1992521375791e-09, + "loss": 0.0004, + "step": 31760 + }, + { + "epoch": 12.916226108174055, + "grad_norm": 0.004360004025818028, + "learning_rate": 2.1780554198136404e-09, + "loss": 0.0, + "step": 31761 + }, + { + "epoch": 12.91663277755185, + "grad_norm": 0.02918035212270741, + "learning_rate": 2.1569613347205422e-09, + "loss": 0.0001, + "step": 31762 + }, + { + "epoch": 12.917039446929646, + "grad_norm": 0.0007093492607063133, + "learning_rate": 2.1359698825196284e-09, + "loss": 0.0, + "step": 31763 + }, + { + "epoch": 12.917446116307442, + "grad_norm": 0.012850115452182929, + "learning_rate": 2.1150810634240627e-09, + "loss": 0.0001, + "step": 31764 + }, + { + "epoch": 12.917852785685238, + "grad_norm": 0.0014652706228807815, + "learning_rate": 2.0942948776481175e-09, + "loss": 0.0, + "step": 31765 + }, + { + "epoch": 12.918259455063033, + "grad_norm": 0.0024656302866759, + "learning_rate": 2.0736113254071766e-09, + "loss": 0.0, + "step": 31766 + }, + { + "epoch": 12.91866612444083, + "grad_norm": 0.0003846753133897191, + "learning_rate": 2.053030406912182e-09, + "loss": 0.0, + "step": 31767 + }, + { + "epoch": 12.919072793818625, + "grad_norm": 0.003516467399190895, + "learning_rate": 2.0325521223751866e-09, + "loss": 0.0, + "step": 31768 + }, + { + "epoch": 12.91947946319642, + "grad_norm": 0.00016836813333459278, + "learning_rate": 2.012176472004912e-09, + "loss": 0.0, + "step": 31769 + }, + { + "epoch": 12.919886132574216, + "grad_norm": 0.0016800619112200142, + "learning_rate": 1.9919034560123006e-09, + "loss": 0.0, + "step": 31770 + }, + { + "epoch": 12.920292801952012, + "grad_norm": 0.0006810805684409601, + "learning_rate": 1.9717330746049645e-09, + "loss": 0.0, + "step": 31771 + }, + { + "epoch": 12.920699471329808, + "grad_norm": 0.016432297895322145, + "learning_rate": 1.951665327989405e-09, + "loss": 0.0001, + "step": 31772 + }, + { + "epoch": 12.921106140707606, + "grad_norm": 0.000488833819935056, + "learning_rate": 1.931700216371013e-09, + "loss": 0.0, + "step": 31773 + }, + { + "epoch": 12.921512810085401, + "grad_norm": 0.00044963474436554324, + "learning_rate": 1.9118377399562903e-09, + "loss": 0.0, + "step": 31774 + }, + { + "epoch": 12.921919479463197, + "grad_norm": 0.006573105823971949, + "learning_rate": 1.892077898949518e-09, + "loss": 0.0, + "step": 31775 + }, + { + "epoch": 12.922326148840993, + "grad_norm": 0.031181010465395394, + "learning_rate": 1.8724206935505363e-09, + "loss": 0.0003, + "step": 31776 + }, + { + "epoch": 12.922732818218789, + "grad_norm": 0.0004834464454840938, + "learning_rate": 
1.8528661239647361e-09, + "loss": 0.0, + "step": 31777 + }, + { + "epoch": 12.923139487596584, + "grad_norm": 0.07246436822764628, + "learning_rate": 1.8334141903908476e-09, + "loss": 0.0005, + "step": 31778 + }, + { + "epoch": 12.92354615697438, + "grad_norm": 0.0053720399148719, + "learning_rate": 1.8140648930287108e-09, + "loss": 0.0, + "step": 31779 + }, + { + "epoch": 12.923952826352176, + "grad_norm": 0.0008636107097647838, + "learning_rate": 1.7948182320770557e-09, + "loss": 0.0, + "step": 31780 + }, + { + "epoch": 12.924359495729972, + "grad_norm": 0.03859388942847733, + "learning_rate": 1.775674207733502e-09, + "loss": 0.0002, + "step": 31781 + }, + { + "epoch": 12.924766165107767, + "grad_norm": 0.020233155511246555, + "learning_rate": 1.7566328201945592e-09, + "loss": 0.0002, + "step": 31782 + }, + { + "epoch": 12.925172834485563, + "grad_norm": 9.795378129133542e-05, + "learning_rate": 1.7376940696567368e-09, + "loss": 0.0, + "step": 31783 + }, + { + "epoch": 12.925579503863359, + "grad_norm": 0.005206759426930561, + "learning_rate": 1.7188579563121033e-09, + "loss": 0.0, + "step": 31784 + }, + { + "epoch": 12.925986173241155, + "grad_norm": 0.007956627487948006, + "learning_rate": 1.7001244803560578e-09, + "loss": 0.0001, + "step": 31785 + }, + { + "epoch": 12.92639284261895, + "grad_norm": 0.026109151354185768, + "learning_rate": 1.6814936419806693e-09, + "loss": 0.0002, + "step": 31786 + }, + { + "epoch": 12.926799511996746, + "grad_norm": 0.0020808074555910116, + "learning_rate": 1.662965441376896e-09, + "loss": 0.0, + "step": 31787 + }, + { + "epoch": 12.927206181374542, + "grad_norm": 0.003227395802540135, + "learning_rate": 1.6445398787345856e-09, + "loss": 0.0, + "step": 31788 + }, + { + "epoch": 12.927612850752338, + "grad_norm": 0.00850113654488471, + "learning_rate": 1.6262169542424767e-09, + "loss": 0.0001, + "step": 31789 + }, + { + "epoch": 12.928019520130134, + "grad_norm": 0.0032064218147676783, + "learning_rate": 1.6079966680904168e-09, + "loss": 0.0, + "step": 31790 + }, + { + "epoch": 12.92842618950793, + "grad_norm": 0.025069099586879185, + "learning_rate": 1.5898790204627034e-09, + "loss": 0.0003, + "step": 31791 + }, + { + "epoch": 12.928832858885725, + "grad_norm": 0.00020702563053876847, + "learning_rate": 1.5718640115480744e-09, + "loss": 0.0, + "step": 31792 + }, + { + "epoch": 12.929239528263523, + "grad_norm": 4.804498197443308, + "learning_rate": 1.5539516415308264e-09, + "loss": 0.0826, + "step": 31793 + }, + { + "epoch": 12.929646197641318, + "grad_norm": 0.006547156865048891, + "learning_rate": 1.5361419105941467e-09, + "loss": 0.0001, + "step": 31794 + }, + { + "epoch": 12.930052867019114, + "grad_norm": 0.006484287689877447, + "learning_rate": 1.5184348189212217e-09, + "loss": 0.0, + "step": 31795 + }, + { + "epoch": 12.93045953639691, + "grad_norm": 0.012775590220027628, + "learning_rate": 1.500830366693018e-09, + "loss": 0.0001, + "step": 31796 + }, + { + "epoch": 12.930866205774706, + "grad_norm": 0.0022886986529660792, + "learning_rate": 1.4833285540905018e-09, + "loss": 0.0, + "step": 31797 + }, + { + "epoch": 12.931272875152501, + "grad_norm": 0.006145639902790514, + "learning_rate": 1.4659293812946397e-09, + "loss": 0.0001, + "step": 31798 + }, + { + "epoch": 12.931679544530297, + "grad_norm": 0.02015925011158612, + "learning_rate": 1.448632848483067e-09, + "loss": 0.0003, + "step": 31799 + }, + { + "epoch": 12.932086213908093, + "grad_norm": 0.01107103603491162, + "learning_rate": 1.4314389558334196e-09, + "loss": 0.0001, + "step": 
31800 + }, + { + "epoch": 12.932492883285889, + "grad_norm": 0.15015078437664528, + "learning_rate": 1.414347703522223e-09, + "loss": 0.0012, + "step": 31801 + }, + { + "epoch": 12.932899552663685, + "grad_norm": 0.0016735628711759324, + "learning_rate": 1.3973590917248925e-09, + "loss": 0.0, + "step": 31802 + }, + { + "epoch": 12.93330622204148, + "grad_norm": 0.0003300226382957135, + "learning_rate": 1.3804731206168432e-09, + "loss": 0.0, + "step": 31803 + }, + { + "epoch": 12.933712891419276, + "grad_norm": 1.8400533748070491, + "learning_rate": 1.3636897903690493e-09, + "loss": 0.019, + "step": 31804 + }, + { + "epoch": 12.934119560797072, + "grad_norm": 0.0005527616387658113, + "learning_rate": 1.3470091011558162e-09, + "loss": 0.0, + "step": 31805 + }, + { + "epoch": 12.934526230174868, + "grad_norm": 0.00809674811465113, + "learning_rate": 1.3304310531470077e-09, + "loss": 0.0, + "step": 31806 + }, + { + "epoch": 12.934932899552663, + "grad_norm": 0.054381596330410766, + "learning_rate": 1.3139556465147085e-09, + "loss": 0.0006, + "step": 31807 + }, + { + "epoch": 12.935339568930459, + "grad_norm": 0.004388336980342383, + "learning_rate": 1.297582881425452e-09, + "loss": 0.0, + "step": 31808 + }, + { + "epoch": 12.935746238308255, + "grad_norm": 0.019475616804647024, + "learning_rate": 1.2813127580502128e-09, + "loss": 0.0001, + "step": 31809 + }, + { + "epoch": 12.93615290768605, + "grad_norm": 0.000770219989391614, + "learning_rate": 1.2651452765533034e-09, + "loss": 0.0, + "step": 31810 + }, + { + "epoch": 12.936559577063846, + "grad_norm": 0.00011135016070112562, + "learning_rate": 1.249080437103478e-09, + "loss": 0.0, + "step": 31811 + }, + { + "epoch": 12.936966246441642, + "grad_norm": 0.0016232620911835526, + "learning_rate": 1.2331182398628295e-09, + "loss": 0.0, + "step": 31812 + }, + { + "epoch": 12.937372915819438, + "grad_norm": 0.00010115398833482616, + "learning_rate": 1.2172586849967804e-09, + "loss": 0.0, + "step": 31813 + }, + { + "epoch": 12.937779585197235, + "grad_norm": 0.005741145106228588, + "learning_rate": 1.2015017726674239e-09, + "loss": 0.0001, + "step": 31814 + }, + { + "epoch": 12.938186254575031, + "grad_norm": 0.009846524555541589, + "learning_rate": 1.1858475030379623e-09, + "loss": 0.0001, + "step": 31815 + }, + { + "epoch": 12.938592923952827, + "grad_norm": 0.0044121383580692924, + "learning_rate": 1.1702958762671579e-09, + "loss": 0.0, + "step": 31816 + }, + { + "epoch": 12.938999593330623, + "grad_norm": 0.002830268941940101, + "learning_rate": 1.1548468925159929e-09, + "loss": 0.0, + "step": 31817 + }, + { + "epoch": 12.939406262708419, + "grad_norm": 0.039278612430041514, + "learning_rate": 1.139500551943229e-09, + "loss": 0.0004, + "step": 31818 + }, + { + "epoch": 12.939812932086214, + "grad_norm": 0.009639702218525089, + "learning_rate": 1.1242568547054078e-09, + "loss": 0.0001, + "step": 31819 + }, + { + "epoch": 12.94021960146401, + "grad_norm": 0.00623287437058395, + "learning_rate": 1.1091158009590707e-09, + "loss": 0.0001, + "step": 31820 + }, + { + "epoch": 12.940626270841806, + "grad_norm": 0.030290468925493953, + "learning_rate": 1.094077390859649e-09, + "loss": 0.0002, + "step": 31821 + }, + { + "epoch": 12.941032940219602, + "grad_norm": 0.008328018991371509, + "learning_rate": 1.079141624562574e-09, + "loss": 0.0001, + "step": 31822 + }, + { + "epoch": 12.941439609597397, + "grad_norm": 0.001394699377799775, + "learning_rate": 1.0643085022199461e-09, + "loss": 0.0, + "step": 31823 + }, + { + "epoch": 12.941846278975193, + 
"grad_norm": 0.00015513206905409295, + "learning_rate": 1.0495780239849762e-09, + "loss": 0.0, + "step": 31824 + }, + { + "epoch": 12.942252948352989, + "grad_norm": 0.0652097433001266, + "learning_rate": 1.0349501900086544e-09, + "loss": 0.0006, + "step": 31825 + }, + { + "epoch": 12.942659617730785, + "grad_norm": 0.005243931975608449, + "learning_rate": 1.020425000440861e-09, + "loss": 0.0001, + "step": 31826 + }, + { + "epoch": 12.94306628710858, + "grad_norm": 0.0014289185814153839, + "learning_rate": 1.006002455430366e-09, + "loss": 0.0, + "step": 31827 + }, + { + "epoch": 12.943472956486376, + "grad_norm": 0.002425757766188137, + "learning_rate": 9.91682555125939e-10, + "loss": 0.0, + "step": 31828 + }, + { + "epoch": 12.943879625864172, + "grad_norm": 0.0006966343624780571, + "learning_rate": 9.774652996741296e-10, + "loss": 0.0, + "step": 31829 + }, + { + "epoch": 12.944286295241968, + "grad_norm": 0.005978172563349425, + "learning_rate": 9.63350689220377e-10, + "loss": 0.0, + "step": 31830 + }, + { + "epoch": 12.944692964619763, + "grad_norm": 2.4271688737967742e-05, + "learning_rate": 9.493387239112305e-10, + "loss": 0.0, + "step": 31831 + }, + { + "epoch": 12.94509963399756, + "grad_norm": 0.016164874786720195, + "learning_rate": 9.35429403888799e-10, + "loss": 0.0001, + "step": 31832 + }, + { + "epoch": 12.945506303375355, + "grad_norm": 0.014240914932416249, + "learning_rate": 9.216227292974111e-10, + "loss": 0.0001, + "step": 31833 + }, + { + "epoch": 12.945912972753153, + "grad_norm": 0.019718484753425324, + "learning_rate": 9.079187002780653e-10, + "loss": 0.0002, + "step": 31834 + }, + { + "epoch": 12.946319642130948, + "grad_norm": 0.002021198393702438, + "learning_rate": 8.943173169706499e-10, + "loss": 0.0, + "step": 31835 + }, + { + "epoch": 12.946726311508744, + "grad_norm": 0.010317506322390918, + "learning_rate": 8.808185795161628e-10, + "loss": 0.0001, + "step": 31836 + }, + { + "epoch": 12.94713298088654, + "grad_norm": 0.005840703315963919, + "learning_rate": 8.67422488052272e-10, + "loss": 0.0, + "step": 31837 + }, + { + "epoch": 12.947539650264336, + "grad_norm": 0.0014060738484243437, + "learning_rate": 8.541290427166449e-10, + "loss": 0.0, + "step": 31838 + }, + { + "epoch": 12.947946319642131, + "grad_norm": 0.0013369336519573265, + "learning_rate": 8.409382436458391e-10, + "loss": 0.0, + "step": 31839 + }, + { + "epoch": 12.948352989019927, + "grad_norm": 0.007730675398838701, + "learning_rate": 8.278500909753018e-10, + "loss": 0.0001, + "step": 31840 + }, + { + "epoch": 12.948759658397723, + "grad_norm": 0.023705138942623123, + "learning_rate": 8.148645848393699e-10, + "loss": 0.0002, + "step": 31841 + }, + { + "epoch": 12.949166327775519, + "grad_norm": 0.0016649493533233382, + "learning_rate": 8.019817253712703e-10, + "loss": 0.0, + "step": 31842 + }, + { + "epoch": 12.949572997153314, + "grad_norm": 1.4330712559611035e-05, + "learning_rate": 7.892015127031194e-10, + "loss": 0.0, + "step": 31843 + }, + { + "epoch": 12.94997966653111, + "grad_norm": 0.5117807466205626, + "learning_rate": 7.765239469659236e-10, + "loss": 0.0026, + "step": 31844 + }, + { + "epoch": 12.950386335908906, + "grad_norm": 0.011219610146523198, + "learning_rate": 7.639490282917994e-10, + "loss": 0.0001, + "step": 31845 + }, + { + "epoch": 12.950793005286702, + "grad_norm": 0.003167801623337038, + "learning_rate": 7.514767568073123e-10, + "loss": 0.0, + "step": 31846 + }, + { + "epoch": 12.951199674664498, + "grad_norm": 0.005256478100035854, + "learning_rate": 
7.39107132641248e-10, + "loss": 0.0001, + "step": 31847 + }, + { + "epoch": 12.951606344042293, + "grad_norm": 0.0006455858943928567, + "learning_rate": 7.268401559201721e-10, + "loss": 0.0, + "step": 31848 + }, + { + "epoch": 12.952013013420089, + "grad_norm": 0.024734561974390584, + "learning_rate": 7.146758267717601e-10, + "loss": 0.0002, + "step": 31849 + }, + { + "epoch": 12.952419682797885, + "grad_norm": 0.05975715623199902, + "learning_rate": 7.02614145319247e-10, + "loss": 0.0005, + "step": 31850 + }, + { + "epoch": 12.95282635217568, + "grad_norm": 0.0019881874964503897, + "learning_rate": 6.906551116869775e-10, + "loss": 0.0, + "step": 31851 + }, + { + "epoch": 12.953233021553476, + "grad_norm": 0.009228961686350167, + "learning_rate": 6.787987259981865e-10, + "loss": 0.0001, + "step": 31852 + }, + { + "epoch": 12.953639690931272, + "grad_norm": 0.0008592622677917368, + "learning_rate": 6.670449883738883e-10, + "loss": 0.0, + "step": 31853 + }, + { + "epoch": 12.954046360309068, + "grad_norm": 0.00888312902028833, + "learning_rate": 6.55393898933987e-10, + "loss": 0.0001, + "step": 31854 + }, + { + "epoch": 12.954453029686865, + "grad_norm": 0.004625988190644497, + "learning_rate": 6.438454577994969e-10, + "loss": 0.0, + "step": 31855 + }, + { + "epoch": 12.954859699064661, + "grad_norm": 0.0013924243586342338, + "learning_rate": 6.323996650892117e-10, + "loss": 0.0, + "step": 31856 + }, + { + "epoch": 12.955266368442457, + "grad_norm": 0.0034645287995585024, + "learning_rate": 6.21056520919705e-10, + "loss": 0.0, + "step": 31857 + }, + { + "epoch": 12.955673037820253, + "grad_norm": 0.0007240659285238787, + "learning_rate": 6.098160254075503e-10, + "loss": 0.0, + "step": 31858 + }, + { + "epoch": 12.956079707198048, + "grad_norm": 0.0021825045798087982, + "learning_rate": 5.986781786682106e-10, + "loss": 0.0, + "step": 31859 + }, + { + "epoch": 12.956486376575844, + "grad_norm": 0.0026376082717219753, + "learning_rate": 5.876429808149286e-10, + "loss": 0.0, + "step": 31860 + }, + { + "epoch": 12.95689304595364, + "grad_norm": 0.004015339631612799, + "learning_rate": 5.767104319631678e-10, + "loss": 0.0, + "step": 31861 + }, + { + "epoch": 12.957299715331436, + "grad_norm": 0.00014419373756398616, + "learning_rate": 5.658805322239502e-10, + "loss": 0.0, + "step": 31862 + }, + { + "epoch": 12.957706384709232, + "grad_norm": 0.041828716382177764, + "learning_rate": 5.551532817094086e-10, + "loss": 0.0004, + "step": 31863 + }, + { + "epoch": 12.958113054087027, + "grad_norm": 0.0009445506578436873, + "learning_rate": 5.445286805272343e-10, + "loss": 0.0, + "step": 31864 + }, + { + "epoch": 12.958519723464823, + "grad_norm": 0.06943660321595566, + "learning_rate": 5.3400672878956e-10, + "loss": 0.0006, + "step": 31865 + }, + { + "epoch": 12.958926392842619, + "grad_norm": 0.047747230700646744, + "learning_rate": 5.23587426601857e-10, + "loss": 0.0003, + "step": 31866 + }, + { + "epoch": 12.959333062220415, + "grad_norm": 0.0299598659462169, + "learning_rate": 5.132707740729271e-10, + "loss": 0.0002, + "step": 31867 + }, + { + "epoch": 12.95973973159821, + "grad_norm": 0.06786468852800807, + "learning_rate": 5.030567713071311e-10, + "loss": 0.0006, + "step": 31868 + }, + { + "epoch": 12.960146400976006, + "grad_norm": 0.005748887297183587, + "learning_rate": 4.929454184099403e-10, + "loss": 0.0001, + "step": 31869 + }, + { + "epoch": 12.960553070353802, + "grad_norm": 0.07602107388326548, + "learning_rate": 4.829367154868258e-10, + "loss": 0.0009, + "step": 31870 + }, + { + "epoch": 
12.960959739731598, + "grad_norm": 0.004647237601874175, + "learning_rate": 4.73030662637708e-10, + "loss": 0.0, + "step": 31871 + }, + { + "epoch": 12.961366409109393, + "grad_norm": 0.003689925344291101, + "learning_rate": 4.632272599658372e-10, + "loss": 0.0, + "step": 31872 + }, + { + "epoch": 12.96177307848719, + "grad_norm": 0.011037991707158206, + "learning_rate": 4.535265075722439e-10, + "loss": 0.0001, + "step": 31873 + }, + { + "epoch": 12.962179747864985, + "grad_norm": 0.14026763035995318, + "learning_rate": 4.4392840555462777e-10, + "loss": 0.0017, + "step": 31874 + }, + { + "epoch": 12.962586417242782, + "grad_norm": 0.003105789424454514, + "learning_rate": 4.3443295401401906e-10, + "loss": 0.0, + "step": 31875 + }, + { + "epoch": 12.962993086620578, + "grad_norm": 0.007661564585301697, + "learning_rate": 4.25040153045897e-10, + "loss": 0.0001, + "step": 31876 + }, + { + "epoch": 12.963399755998374, + "grad_norm": 0.0033590965037580607, + "learning_rate": 4.1575000274685085e-10, + "loss": 0.0, + "step": 31877 + }, + { + "epoch": 12.96380642537617, + "grad_norm": 0.005168057660337184, + "learning_rate": 4.0656250321458036e-10, + "loss": 0.0, + "step": 31878 + }, + { + "epoch": 12.964213094753966, + "grad_norm": 0.006070510132326325, + "learning_rate": 3.9747765454012377e-10, + "loss": 0.0001, + "step": 31879 + }, + { + "epoch": 12.964619764131761, + "grad_norm": 0.0001516019354747048, + "learning_rate": 3.8849545681896026e-10, + "loss": 0.0, + "step": 31880 + }, + { + "epoch": 12.965026433509557, + "grad_norm": 0.003220157586288114, + "learning_rate": 3.796159101421282e-10, + "loss": 0.0, + "step": 31881 + }, + { + "epoch": 12.965433102887353, + "grad_norm": 0.003585553660281611, + "learning_rate": 3.7083901460177596e-10, + "loss": 0.0, + "step": 31882 + }, + { + "epoch": 12.965839772265149, + "grad_norm": 0.0005410409372503406, + "learning_rate": 3.621647702867215e-10, + "loss": 0.0, + "step": 31883 + }, + { + "epoch": 12.966246441642944, + "grad_norm": 0.0007583462773064471, + "learning_rate": 3.535931772868928e-10, + "loss": 0.0, + "step": 31884 + }, + { + "epoch": 12.96665311102074, + "grad_norm": 0.00021108435028445056, + "learning_rate": 3.4512423569110774e-10, + "loss": 0.0, + "step": 31885 + }, + { + "epoch": 12.967059780398536, + "grad_norm": 0.004179214267710583, + "learning_rate": 3.3675794558485353e-10, + "loss": 0.0, + "step": 31886 + }, + { + "epoch": 12.967466449776332, + "grad_norm": 0.002179814517379071, + "learning_rate": 3.284943070536173e-10, + "loss": 0.0, + "step": 31887 + }, + { + "epoch": 12.967873119154127, + "grad_norm": 0.007540017302151979, + "learning_rate": 3.203333201839964e-10, + "loss": 0.0001, + "step": 31888 + }, + { + "epoch": 12.968279788531923, + "grad_norm": 0.05066290868253994, + "learning_rate": 3.122749850592577e-10, + "loss": 0.0005, + "step": 31889 + }, + { + "epoch": 12.968686457909719, + "grad_norm": 0.003119151626987077, + "learning_rate": 3.0431930176155755e-10, + "loss": 0.0, + "step": 31890 + }, + { + "epoch": 12.969093127287515, + "grad_norm": 0.22771837155576632, + "learning_rate": 2.964662703719423e-10, + "loss": 0.0036, + "step": 31891 + }, + { + "epoch": 12.96949979666531, + "grad_norm": 6.129665747348911e-05, + "learning_rate": 2.8871589097256847e-10, + "loss": 0.0, + "step": 31892 + }, + { + "epoch": 12.969906466043106, + "grad_norm": 0.0006803932009156237, + "learning_rate": 2.8106816364226185e-10, + "loss": 0.0, + "step": 31893 + }, + { + "epoch": 12.970313135420902, + "grad_norm": 0.0018173217644803946, + 
"learning_rate": 2.735230884587381e-10, + "loss": 0.0, + "step": 31894 + }, + { + "epoch": 12.970719804798698, + "grad_norm": 0.00448119415877538, + "learning_rate": 2.66080665500823e-10, + "loss": 0.0, + "step": 31895 + }, + { + "epoch": 12.971126474176495, + "grad_norm": 0.016783960550847656, + "learning_rate": 2.58740894845122e-10, + "loss": 0.0001, + "step": 31896 + }, + { + "epoch": 12.971533143554291, + "grad_norm": 0.011593912193566484, + "learning_rate": 2.515037765649098e-10, + "loss": 0.0001, + "step": 31897 + }, + { + "epoch": 12.971939812932087, + "grad_norm": 0.035564986110754794, + "learning_rate": 2.443693107367917e-10, + "loss": 0.0003, + "step": 31898 + }, + { + "epoch": 12.972346482309883, + "grad_norm": 0.010235662311017591, + "learning_rate": 2.373374974318221e-10, + "loss": 0.0001, + "step": 31899 + }, + { + "epoch": 12.972753151687678, + "grad_norm": 0.00030702196240752015, + "learning_rate": 2.304083367243859e-10, + "loss": 0.0, + "step": 31900 + }, + { + "epoch": 12.973159821065474, + "grad_norm": 0.006220483163153808, + "learning_rate": 2.2358182868442711e-10, + "loss": 0.0, + "step": 31901 + }, + { + "epoch": 12.97356649044327, + "grad_norm": 0.027790190125813, + "learning_rate": 2.1685797338188985e-10, + "loss": 0.0003, + "step": 31902 + }, + { + "epoch": 12.973973159821066, + "grad_norm": 0.02105694934109302, + "learning_rate": 2.1023677088671813e-10, + "loss": 0.0002, + "step": 31903 + }, + { + "epoch": 12.974379829198861, + "grad_norm": 0.02630202635733188, + "learning_rate": 2.037182212655253e-10, + "loss": 0.0002, + "step": 31904 + }, + { + "epoch": 12.974786498576657, + "grad_norm": 0.34019389694607927, + "learning_rate": 1.9730232458603506e-10, + "loss": 0.0035, + "step": 31905 + }, + { + "epoch": 12.975193167954453, + "grad_norm": 0.13847918339215318, + "learning_rate": 1.9098908091375046e-10, + "loss": 0.001, + "step": 31906 + }, + { + "epoch": 12.975599837332249, + "grad_norm": 0.1201686587573645, + "learning_rate": 1.8477849031528493e-10, + "loss": 0.001, + "step": 31907 + }, + { + "epoch": 12.976006506710045, + "grad_norm": 0.0004231809434396362, + "learning_rate": 1.7867055285170075e-10, + "loss": 0.0, + "step": 31908 + }, + { + "epoch": 12.97641317608784, + "grad_norm": 0.0003268843474884732, + "learning_rate": 1.7266526858628064e-10, + "loss": 0.0, + "step": 31909 + }, + { + "epoch": 12.976819845465636, + "grad_norm": 0.00035429955895763417, + "learning_rate": 1.6676263758230725e-10, + "loss": 0.0, + "step": 31910 + }, + { + "epoch": 12.977226514843432, + "grad_norm": 0.0007577905533247367, + "learning_rate": 1.6096265989862247e-10, + "loss": 0.0, + "step": 31911 + }, + { + "epoch": 12.977633184221228, + "grad_norm": 0.0019752514733673327, + "learning_rate": 1.5526533559628852e-10, + "loss": 0.0, + "step": 31912 + }, + { + "epoch": 12.978039853599023, + "grad_norm": 0.06481284386433722, + "learning_rate": 1.4967066473192682e-10, + "loss": 0.0009, + "step": 31913 + }, + { + "epoch": 12.978446522976819, + "grad_norm": 0.024408256776549744, + "learning_rate": 1.4417864736548935e-10, + "loss": 0.0002, + "step": 31914 + }, + { + "epoch": 12.978853192354615, + "grad_norm": 0.10563940528415611, + "learning_rate": 1.3878928355026688e-10, + "loss": 0.0006, + "step": 31915 + }, + { + "epoch": 12.979259861732412, + "grad_norm": 0.03964404457983829, + "learning_rate": 1.3350257334399098e-10, + "loss": 0.0004, + "step": 31916 + }, + { + "epoch": 12.979666531110208, + "grad_norm": 2.7096366904853175, + "learning_rate": 1.2831851679995233e-10, + "loss": 0.0297, + 
"step": 31917 + }, + { + "epoch": 12.980073200488004, + "grad_norm": 0.0003384723554667709, + "learning_rate": 1.2323711397144167e-10, + "loss": 0.0, + "step": 31918 + }, + { + "epoch": 12.9804798698658, + "grad_norm": 0.03884498172102127, + "learning_rate": 1.182583649117497e-10, + "loss": 0.0003, + "step": 31919 + }, + { + "epoch": 12.980886539243595, + "grad_norm": 0.018056262649917987, + "learning_rate": 1.1338226966972621e-10, + "loss": 0.0001, + "step": 31920 + }, + { + "epoch": 12.981293208621391, + "grad_norm": 0.0015583288155071203, + "learning_rate": 1.086088282975517e-10, + "loss": 0.0, + "step": 31921 + }, + { + "epoch": 12.981699877999187, + "grad_norm": 0.04403482775840217, + "learning_rate": 1.0393804084296578e-10, + "loss": 0.0003, + "step": 31922 + }, + { + "epoch": 12.982106547376983, + "grad_norm": 0.0028728289669005427, + "learning_rate": 9.9369907353708e-11, + "loss": 0.0, + "step": 31923 + }, + { + "epoch": 12.982513216754779, + "grad_norm": 0.010699657390131305, + "learning_rate": 9.490442787862819e-11, + "loss": 0.0001, + "step": 31924 + }, + { + "epoch": 12.982919886132574, + "grad_norm": 0.019516892067064556, + "learning_rate": 9.054160246102506e-11, + "loss": 0.0002, + "step": 31925 + }, + { + "epoch": 12.98332655551037, + "grad_norm": 0.020065778507478038, + "learning_rate": 8.628143114752796e-11, + "loss": 0.0001, + "step": 31926 + }, + { + "epoch": 12.983733224888166, + "grad_norm": 0.024323514700349293, + "learning_rate": 8.212391398032537e-11, + "loss": 0.0001, + "step": 31927 + }, + { + "epoch": 12.984139894265962, + "grad_norm": 0.004467744303659633, + "learning_rate": 7.806905100382622e-11, + "loss": 0.0, + "step": 31928 + }, + { + "epoch": 12.984546563643757, + "grad_norm": 0.00013651556151358056, + "learning_rate": 7.411684225799854e-11, + "loss": 0.0, + "step": 31929 + }, + { + "epoch": 12.984953233021553, + "grad_norm": 0.009971189448763131, + "learning_rate": 7.026728778503078e-11, + "loss": 0.0001, + "step": 31930 + }, + { + "epoch": 12.985359902399349, + "grad_norm": 0.10893908602763692, + "learning_rate": 6.652038762378077e-11, + "loss": 0.0011, + "step": 31931 + }, + { + "epoch": 12.985766571777145, + "grad_norm": 0.24450365456724438, + "learning_rate": 6.287614181199608e-11, + "loss": 0.0018, + "step": 31932 + }, + { + "epoch": 12.98617324115494, + "grad_norm": 0.005642778361949113, + "learning_rate": 5.93345503874243e-11, + "loss": 0.0, + "step": 31933 + }, + { + "epoch": 12.986579910532736, + "grad_norm": 0.02104961950110045, + "learning_rate": 5.58956133867028e-11, + "loss": 0.0001, + "step": 31934 + }, + { + "epoch": 12.986986579910532, + "grad_norm": 0.002815797185437522, + "learning_rate": 5.2559330845358693e-11, + "loss": 0.0, + "step": 31935 + }, + { + "epoch": 12.987393249288328, + "grad_norm": 0.0008430028431023689, + "learning_rate": 4.932570279780891e-11, + "loss": 0.0, + "step": 31936 + }, + { + "epoch": 12.987799918666125, + "grad_norm": 6.979425394985655e-05, + "learning_rate": 4.6194729276249904e-11, + "loss": 0.0, + "step": 31937 + }, + { + "epoch": 12.988206588043921, + "grad_norm": 0.0028182002135610598, + "learning_rate": 4.3166410313988384e-11, + "loss": 0.0, + "step": 31938 + }, + { + "epoch": 12.988613257421717, + "grad_norm": 0.003809418174498384, + "learning_rate": 4.024074594100036e-11, + "loss": 0.0, + "step": 31939 + }, + { + "epoch": 12.989019926799513, + "grad_norm": 0.0013275859236447775, + "learning_rate": 3.7417736187261855e-11, + "loss": 0.0, + "step": 31940 + }, + { + "epoch": 12.989426596177308, + "grad_norm": 
0.003959319455423796, + "learning_rate": 3.4697381082748894e-11, + "loss": 0.0, + "step": 31941 + }, + { + "epoch": 12.989833265555104, + "grad_norm": 0.056543250239858574, + "learning_rate": 3.207968065521705e-11, + "loss": 0.0005, + "step": 31942 + }, + { + "epoch": 12.9902399349329, + "grad_norm": 0.0006534909193465109, + "learning_rate": 2.956463493131168e-11, + "loss": 0.0, + "step": 31943 + }, + { + "epoch": 12.990646604310696, + "grad_norm": 0.0007264771530932746, + "learning_rate": 2.71522439365679e-11, + "loss": 0.0, + "step": 31944 + }, + { + "epoch": 12.991053273688491, + "grad_norm": 0.007267187877952428, + "learning_rate": 2.4842507696520857e-11, + "loss": 0.0, + "step": 31945 + }, + { + "epoch": 12.991459943066287, + "grad_norm": 0.00011264349082534605, + "learning_rate": 2.2635426233375e-11, + "loss": 0.0, + "step": 31946 + }, + { + "epoch": 12.991866612444083, + "grad_norm": 0.008750851600127982, + "learning_rate": 2.0530999571555244e-11, + "loss": 0.0001, + "step": 31947 + }, + { + "epoch": 12.992273281821879, + "grad_norm": 0.02287702856668937, + "learning_rate": 1.8529227731045597e-11, + "loss": 0.0003, + "step": 31948 + }, + { + "epoch": 12.992679951199674, + "grad_norm": 0.00017023683470348945, + "learning_rate": 1.663011073405052e-11, + "loss": 0.0, + "step": 31949 + }, + { + "epoch": 12.99308662057747, + "grad_norm": 0.014761754082542187, + "learning_rate": 1.4833648598333585e-11, + "loss": 0.0001, + "step": 31950 + }, + { + "epoch": 12.993493289955266, + "grad_norm": 2.6917412912215575e-05, + "learning_rate": 1.31398413438788e-11, + "loss": 0.0, + "step": 31951 + }, + { + "epoch": 12.993899959333062, + "grad_norm": 0.001550767420887823, + "learning_rate": 1.1548688987339518e-11, + "loss": 0.0, + "step": 31952 + }, + { + "epoch": 12.994306628710858, + "grad_norm": 0.001867789023474801, + "learning_rate": 1.006019154536908e-11, + "loss": 0.0, + "step": 31953 + }, + { + "epoch": 12.994713298088653, + "grad_norm": 0.43966166449053384, + "learning_rate": 8.67434903351061e-12, + "loss": 0.0045, + "step": 31954 + }, + { + "epoch": 12.995119967466449, + "grad_norm": 0.009008850931636248, + "learning_rate": 7.391161463976559e-12, + "loss": 0.0001, + "step": 31955 + }, + { + "epoch": 12.995526636844245, + "grad_norm": 0.004828960020352483, + "learning_rate": 6.210628852310052e-12, + "loss": 0.0, + "step": 31956 + }, + { + "epoch": 12.995933306222042, + "grad_norm": 3.3703408677415814e-05, + "learning_rate": 5.132751209613318e-12, + "loss": 0.0, + "step": 31957 + }, + { + "epoch": 12.996339975599838, + "grad_norm": 0.027135995856812287, + "learning_rate": 4.15752854809881e-12, + "loss": 0.0002, + "step": 31958 + }, + { + "epoch": 12.996746644977634, + "grad_norm": 0.00026164865931824205, + "learning_rate": 3.28496087553809e-12, + "loss": 0.0, + "step": 31959 + }, + { + "epoch": 12.99715331435543, + "grad_norm": 4.9612208871695736e-05, + "learning_rate": 2.5150482030333877e-12, + "loss": 0.0, + "step": 31960 + }, + { + "epoch": 12.997559983733225, + "grad_norm": 0.013475782660250549, + "learning_rate": 1.8477905372460415e-12, + "loss": 0.0001, + "step": 31961 + }, + { + "epoch": 12.997966653111021, + "grad_norm": 0.003866990497024316, + "learning_rate": 1.2831878848373891e-12, + "loss": 0.0, + "step": 31962 + }, + { + "epoch": 12.998373322488817, + "grad_norm": 0.02812406272529344, + "learning_rate": 8.21240252468769e-13, + "loss": 0.0003, + "step": 31963 + }, + { + "epoch": 12.998779991866613, + "grad_norm": 0.06462188582339552, + "learning_rate": 4.619476445810733e-13, + 
"loss": 0.0005, + "step": 31964 + }, + { + "epoch": 12.999186661244408, + "grad_norm": 0.0014874418875153955, + "learning_rate": 2.0531006561519407e-13, + "loss": 0.0, + "step": 31965 + }, + { + "epoch": 12.999593330622204, + "grad_norm": 0.0038068005587156063, + "learning_rate": 5.132751668135427e-14, + "loss": 0.0, + "step": 31966 + }, + { + "epoch": 13.0, + "grad_norm": 0.00018707345760821754, + "learning_rate": 0.0, + "loss": 0.0, + "step": 31967 + }, + { + "epoch": 13.0, + "step": 31967, + "total_flos": 3.5583372281931366e+17, + "train_loss": 0.18225262537725156, + "train_runtime": 140420.9903, + "train_samples_per_second": 0.911, + "train_steps_per_second": 0.228 + } + ], + "logging_steps": 1.0, + "max_steps": 31967, + "num_input_tokens_seen": 0, + "num_train_epochs": 13, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.5583372281931366e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}